-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_parser.py
88 lines (72 loc) · 3.05 KB
/
pdf_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
from dotenv import load_dotenv
from openai import OpenAI # Version 1.33.0
from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch
import json
# Load environment variables (notably OPENAI_API_KEY) from a .env file.
load_dotenv()
MY_OPENAI_KEY = os.getenv('OPENAI_API_KEY')  # Add your OpenAI API key
client = OpenAI(api_key=MY_OPENAI_KEY)

# Upload your pdf(s) to the OpenAI API. The context manager closes the local
# file handle as soon as the upload call returns instead of leaking it for the
# rest of the script.
with open('downloads/8_PDD.pdf', 'rb') as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose='assistants',
    )

# Create the thread that will carry the prompt and the Assistant's reply.
thread = client.beta.threads.create()
# Create an Assistant (or fetch it if it was already created). It has to have
# the "file_search" tool enabled to attach files when prompting it.
def get_assistant(name: str = 'My Assistant Name'):
    """Return the Assistant called *name*, creating it when none exists.

    Args:
        name: Name used both to look up an existing Assistant and to label a
            newly created one, so the lookup and the creation cannot drift
            apart. Defaults to the name this script has always used.

    Returns:
        The first listed Assistant whose ``name`` equals *name*; otherwise the
        Assistant returned by ``client.beta.assistants.create``.
    """
    for assistant in client.beta.assistants.list():
        if assistant.name == name:
            return assistant

    # No Assistant found, create a new one
    return client.beta.assistants.create(
        model='gpt-4o',
        description='You are a PDF retrieval assistant.',
        instructions="You are a helpful assistant designed to output only JSON. Find information from the text and files provided.",
        tools=[{"type": "file_search"}],
        # response_format={"type": "json_object"},  # Isn't possible with "file_search"
        name=name,
    )
# Add your prompt here
# NOTE(review): the example object in the prompt uses single quotes, which is
# not valid JSON; if the model mirrors it, json.loads will reject the reply.
# Left unchanged here -- confirm before editing the prompt.
prompt = "What's the coordinates of the landfill site? Output in JSON format, like:\n{'lat': 123.45,'lon': 67.89}"

# Post the prompt to the thread with the uploaded PDF attached for file_search.
client.beta.threads.messages.create(
    thread_id=thread.id,
    role='user',
    content=prompt,
    attachments=[
        Attachment(
            file_id=file.id,
            tools=[AttachmentToolFileSearch(type='file_search')],
        ),
    ],
)

# Run the created thread with the assistant. It will wait until the message is processed.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=get_assistant().id,
    timeout=300,  # 5 minutes
    # response_format={"type": "json_object"},  # Isn't possible
)

# Eg. issue with openai server
if run.status != "completed":
    raise Exception(f'Run failed: {run.status}')

# Fetch outputs of the thread
messages_cursor = client.beta.threads.messages.list(thread_id=thread.id)
messages = list(messages_cursor)
if not messages:
    raise RuntimeError('Thread contains no messages after the run completed')

message = messages[0]  # This is the output from the Assistant (second message is your message)

# Explicit check instead of `assert`, which is stripped when Python runs with -O.
if not message.content or message.content[0].type != "text":
    raise RuntimeError('Assistant reply does not start with a text content part')

# Output text of the Assistant
res_txt = message.content[0].text.value
# Because the Assistant can't produce JSON (as we're using "file_search"),
# it will likely output text + some JSON code. We can parse and extract just
# the JSON part, and ignore everything else (eg. gpt4o will start with something
# similar to "Of course, here's the parsed text: {useful_JSON_here}")
res_txt = res_txt.strip()  # str.strip() returns a new string; keep the result

# Drop a Markdown code fence if the reply is wrapped in one. The prefix length
# is derived from the literal so the slice cannot be off by one.
if res_txt.startswith('```json'):
    res_txt = res_txt[len('```json'):]
if res_txt.endswith('```'):
    res_txt = res_txt[:-len('```')]

# Keep only the span from the first '{' to the last '}'.
json_start = res_txt.find('{')
json_end = res_txt.rfind('}')
if json_start == -1 or json_end < json_start:
    # Fail with the raw reply in the message rather than a bare decode error
    # on an empty string further down.
    raise ValueError(f'No JSON object found in Assistant response: {res_txt!r}')
res_txt = res_txt[json_start:json_end + 1]
try:
    # Parse the JSON output
    data = json.loads(res_txt)
    print(data)
finally:
    # Delete the file(s) afterward to preserve space (max 100gb/company).
    # Done in `finally` so the upload is removed even when parsing fails.
    delete_ok = client.files.delete(file.id)