# llm_model_inference.py
import os
import multiprocessing
import torch
import chromadb
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.llms import LlamaCpp
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.chains import LLMChain
from langchain.embeddings import GPT4AllEmbeddings, HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
from pdf_parser import pdf_to_ocr
load_dotenv()
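# Environment variables read below (assumed to come from the .env file loaded above):
#   MODEL_PATH        - path to the local model file loaded by LlamaCpp
#   PERSIST_DIRECTORY - on-disk location of the Chroma vector store
#   LLAMA_CPP_LIB     - optional path to a CUDA-enabled libllama.so
#   LLAMA_CUBLAS      - optional flag to enable cuBLAS in llama.cpp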
# GPU Inference
cuda_available = 0
if torch.cuda.is_available():
    print('Nvidia GPU detected!')
    os.environ['LLAMA_CPP_LIB'] = os.getenv('LLAMA_CPP_LIB', '/usr/local/lib/libllama.so')
    os.environ['LLAMA_CUBLAS'] = os.getenv('LLAMA_CUBLAS', 'on')
    cuda_available = 1
else:
    print('Defaulting to CPU!')
# Model initialization
MODEL_PATH = os.getenv('MODEL_PATH')
if cuda_available:
    # GPU Layers = 25 acceptable for 4GB VRAM
    llm = LlamaCpp(model_path=MODEL_PATH, n_ctx=2048, n_gpu_layers=25, max_tokens=2048, temperature=0, n_batch=512)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False}
    )
else:
    llm = LlamaCpp(model_path=MODEL_PATH, n_ctx=2048, n_threads=multiprocessing.cpu_count(), temperature=0)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False}
    )
# Chroma DB
persist_directory = os.getenv('PERSIST_DIRECTORY')
CHROMA_SETTINGS = Settings(persist_directory=persist_directory, anonymized_telemetry=False)
chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
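# Note: this persistent client is shared by ingest_file() and chat_qa(), so
# documents ingested through one call are visible to later queries.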
# Prompts
SUMMARY_PROMPT_TEMPLATE = """
### System:
You are an AI assistant. You will be given a task. You must generate a detailed and long answer.
### User:
Summarize the following text.
{text}
### Response:
Sure, here is a summary of the text:
"""
GRAMMAR_PROMPT_TEMPLATE = """
### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
Read the following text, and rewrite all sentences after correcting all the writing mistakes:
{text}
### Response:
"""
CHAT_PROMPT_TEMPLATE = """
### System:
You are an AI assistant that helps people find information.
### User:
This is your previous chat history, where "Human:" is the user's query and "AI:" is your response to the query:
{chat_history}
This is the information provided to you:
{context}
Use only the conversation history (if any previous conversation was made) and the information provided to answer the following query.
{question}
### Response:
"""
summarize_prompt = PromptTemplate.from_template(SUMMARY_PROMPT_TEMPLATE)
grammar_prompt = PromptTemplate(template=GRAMMAR_PROMPT_TEMPLATE, input_variables=["text"])
qa_prompt = PromptTemplate(template=CHAT_PROMPT_TEMPLATE, input_variables=["question", "chat_history", "context"])
def summarize_pdf(pdf_path):
    # Convert pdf to text
    text = pdf_to_ocr(pdf_path)
    # Split text into chunks
    text_splitter = TokenTextSplitter()
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    summary_chain = load_summarize_chain(llm, chain_type='stuff', prompt=summarize_prompt)
    # Run inference
    try:
        result = summary_chain.run(docs)
    except Exception as e:
        return e
    return result
def grammar_check(text):
    llm_chain = LLMChain(prompt=grammar_prompt, llm=llm)
    # Run inference
    try:
        result = llm_chain.run(text)
        return result
    except Exception as e:
        return e
def ingest_file(pdf_path):
    # Convert pdf to text
    try:
        text = pdf_to_ocr(pdf_path)
        text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=0)
        texts = text_splitter.split_text(text)
        docs = [Document(page_content=t) for t in texts]
        db = Chroma.from_documents(docs, embeddings,
                                   persist_directory=persist_directory,
                                   client=chroma_client,
                                   client_settings=CHROMA_SETTINGS)
        print('File has been ingested!')
        return "File has been uploaded!"
    except Exception as e:
        return e
def chat_qa(query, chat_history):
    # Use stored embeddings
    db = Chroma(persist_directory=persist_directory,
                embedding_function=embeddings,
                client_settings=CHROMA_SETTINGS,
                client=chroma_client)
    print(db.get())
    # Initialize chat memory, uses chat state from frontend
    memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")
    for index in range(0, len(chat_history), 2):
        prev_user_msg, prev_ai_msg = chat_history[index], chat_history[index + 1]
        memory.chat_memory.add_user_message(prev_user_msg)
        memory.chat_memory.add_ai_message(prev_ai_msg)
    qa = load_qa_chain(llm=llm, memory=memory, prompt=qa_prompt, verbose=True)  # Initialize QA
    docs = db.similarity_search(query, k=8)  # Get relevant docs
    result = qa({
        "input_documents": docs,
        "question": query
    })
    print(result, flush=True)
    return result['output_text']
# Sample inference
if __name__ == "__main__":
    pdf_path = "./samples/tiny-attention.pdf"
    res = summarize_pdf(pdf_path)
    print(res)
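    # Minimal usage sketch for the remaining entry points; the query and the
    # grammar-check sentence below are illustrative placeholders.
    # ingest_file(pdf_path)
    # print(grammar_check("their is many mistake in this sentance"))
    # print(chat_qa("What does the paper propose?", []))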