import hashlib
from typing import List

import chromadb
from chromadb import Collection
from chromadb.config import Settings as ChromaDbSettings
from chromadb.errors import InvalidCollectionException
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader


# Each file gets its own ChromaDB collection, using a hash of the filename
# as the collection name.
def get_collection_name(fname: str) -> str:
    sha = hashlib.sha224()
    sha.update(fname.encode())
    return sha.hexdigest()
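
# Example (illustrative): get_collection_name("report.pdf") always yields the
# same 56-character hex digest, so re-indexing the same file reuses its
# existing collection rather than creating a new one.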


# Split a LangChain Document's contents into text chunks of <chunk_size>
# characters, with <overlap> characters shared between consecutive chunks.
def split_to_chunks(
    document: Document, chunk_size: int, overlap: int
) -> List[Document]:
    txt = document.page_content
    # Collapse line breaks and extra whitespace into single spaces
    txt = txt.replace("\n", " ")
    txt = " ".join(txt.split())
    txt_len = len(txt)
    chunks = []
    for start in range(0, txt_len, chunk_size - overlap):
        chunks.append(
            Document(
                page_content=txt[start : start + chunk_size],
                metadata={**document.metadata, "idx": len(chunks)},
            )
        )
    return chunks
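
# Example (illustrative): with chunk_size=10 and overlap=3, a 16-character page
#   doc = Document(page_content="abcdefghijklmnop", metadata={"page": 0})
#   split_to_chunks(doc, 10, 3)
# produces chunks starting at offsets 0, 7 and 14:
#   "abcdefghij", "hijklmnop", "op"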


# Take a PDF filename, split it into pages and then chunks, and return the chunks.
def parse_pdf(filename: str) -> List[Document]:
    loader = PyPDFLoader(filename)
    pages = loader.load_and_split()
    chunks = []
    for page in pages:
        chunks.extend(split_to_chunks(page, 500, 50))
    return chunks
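
# Example (hypothetical usage, assumes a local "report.pdf"):
#   chunks = parse_pdf("report.pdf")
#   chunks[0].metadata  ->  roughly {"source": "report.pdf", "page": 0, "idx": 0}
# The hard-coded 500/50 values give ~500-character chunks with a 50-character
# (10%) overlap, so text cut at a chunk boundary still appears intact in at
# least one chunk.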


# Get or create the ChromaDB collection (index) for a given file.
def build_index(filename: str, override_collection_name: str = "") -> Collection:
    chroma = chromadb.PersistentClient(
        "./chroma_data",
        ChromaDbSettings(
            anonymized_telemetry=False,
        ),
    )
    # An empty override means "derive the collection name from the filename"
    collection_name = get_collection_name(
        override_collection_name if override_collection_name else filename
    )
    collection = None
    try:
        collection = chroma.get_collection(collection_name)
        print("Found existing collection")
    except InvalidCollectionException:
        print(f"Will create a new collection for {filename}")
    if not collection:
        collection = chroma.create_collection(collection_name)
        chunks = parse_pdf(filename)
        for chunk in chunks:
            collection.add(
                documents=chunk.page_content,
                metadatas=chunk.metadata,
                ids=f"{chunk.metadata['page']}_{chunk.metadata['idx']}",
            )
    print(f"Index ready for {filename}")
    return collection
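
# Example (hypothetical usage, assumes a local "report.pdf"):
#   collection = build_index("report.pdf")
#   results = collection.query(query_texts=["What is the summary?"], n_results=3)
#   results["documents"][0]  ->  the 3 most similar chunks as plain strings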


SYSTEM_PROMPT_TPL = """
Use the following pieces of information to answer the user's question.
<context>
{context}
</context>
"""


# Build the messages to send to the OpenAI chat completion API:
# a system prompt carrying the retrieved context, followed by the user message.
def build_messages(user_question: str, context: List[str]) -> List[dict]:
    # Render each retrieved passage as a bulleted item
    context_str = "\n\n".join(f"- {c}" for c in context)
    system_prompt = SYSTEM_PROMPT_TPL.format(context=context_str)
    print("")
    print("system prompt for LLM:")
    print("------")
    print(system_prompt)
    print("------")
    print("")
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_question},
    ]
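

# Minimal end-to-end sketch (illustrative; assumes a local "report.pdf", the
# openai package installed, OPENAI_API_KEY set in the environment, and a model
# name that is an assumption rather than part of this module).
if __name__ == "__main__":
    from openai import OpenAI

    collection = build_index("report.pdf")
    question = "What is the main conclusion of the document?"
    results = collection.query(query_texts=[question], n_results=3)
    messages = build_messages(question, results["documents"][0])

    client = OpenAI()
    reply = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    print(reply.choices[0].message.content)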