# hackathon_runner.py
import os
import re

import pandas as pd
import PyPDF2
from paperscraper.pdf import save_pdf
from VectorDatabase import Lantern, Fragment, Publication

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain import PromptTemplate

# The OpenAI API key was referenced below but never defined in the original
# script; reading it from the environment is an assumption.
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Keyword groups used to flag structural-biology methods mentioned in a paper.
keywords_groups = {
    'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', 'chemical crosslinking mass spectrometry', 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'],
    'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry', 'HDX', 'HDXMS', 'HDX-MS'],
    'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', 'Double electron electron resonance spectroscopy'],
    'FRET': ['FRET', 'forster resonance energy transfer', 'fluorescence resonance energy transfer'],
    'AFM': ['AFM', 'atomic force microscopy'],
    'SAS': ['SAS', 'SAXS', 'SANS', 'Small angle solution scattering', 'solution scattering', 'SEC-SAXS', 'SEC-SAS', 'SASBDB', 'Small angle X-ray scattering', 'Small angle neutron scattering'],
    '3DGENOME': ['HiC', 'Hi-C', 'chromosome conformation capture'],
    'Y2H': ['Y2H', 'yeast two-hybrid'],
    'DNA_FOOTPRINTING': ['DNA Footprinting', 'hydroxyl radical footprinting'],
    'XRAY_TOMOGRAPHY': ['soft x-ray tomography'],
    'FTIR': ['FTIR', 'Infrared spectroscopy', 'Fourier-transform infrared spectroscopy'],
    'FLUORESCENCE': ['Fluorescence imaging', 'fluorescence microscopy', 'TIRF'],
    'EVOLUTION': ['coevolution', 'evolutionary covariance'],
    'PREDICTED': ['predicted contacts'],
    'INTEGRATIVE': ['integrative structure', 'hybrid structure', 'integrative modeling', 'hybrid modeling'],
    'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension'],
}
class LlmHandler:
    """Thin wrapper around a ChatOpenAI model for retrieval-augmented QA."""

    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100
        )
        self.llm = ChatOpenAI(
            openai_api_key=openai_api_key,
            temperature=0,
            model_name="gpt-4",
            max_tokens=300,
            request_timeout=30,
            max_retries=3,
        )
    def evaluate_queries(self, embedding, queries):
        """Answer each query against the given vector index and return the responses."""
        chatbot = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=embedding.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
        )
        template = """ {query}? """
        response = []
        for q in queries:
            prompt = PromptTemplate(
                input_variables=["query"],
                template=template,
            )
            response.append(chatbot.run(prompt.format(query=q)))
        return response
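
# A minimal usage sketch (the index and question here are hypothetical; main()
# below builds the real FAISS index from stored fragments):
#   handler = LlmHandler()
#   answers = handler.evaluate_queries(faiss_index, ["Did the authors use SAXS?"])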
llm = LlmHandler()

# Flatten the keyword groups into one readable list for the prompt, e.g.
# "CX-MS (cross-link, crosslink, ...) or HDX (...) or EPR (...)".
methods_string = ''
for i, (k, v) in enumerate(keywords_groups.items()):
    if i > 0:
        methods_string += ' or '
    methods_string += f'{k} ({", ".join(v)})'
def get_embeddings(fname):
    """Split a text file into chunks and embed them.

    Returns a list of (chunk_text, embedding_vector) pairs and the embedder used.
    """
    loader = TextLoader(fname)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100
    )
    docs = text_splitter.split_documents(documents)
    emb = OpenAIEmbeddings()
    input_texts = [d.page_content for d in docs]
    input_embeddings = emb.embed_documents(input_texts)
    text_embeddings = list(zip(input_texts, input_embeddings))
    return text_embeddings, emb
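
# Usage sketch (the file name is hypothetical): the returned pairs can be
# loaded straight into a FAISS index, as done in main() below.
#   pairs, embedder = get_embeddings('scrapped_txts/10.1101-2023.10.31.564925.txt')
#   index = FAISS.from_embeddings(text_embeddings=pairs, embedding=embedder)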
def retreiveTextFromPdf(inp_file):
    """Download, extract, embed, and store every publication listed in a JSONL dump."""
    json = pd.read_json(path_or_buf=inp_file, lines=True)
    lantern = Lantern()
    for n, doi in enumerate(json['doi']):
        # NOTE: for example purposes only, stop after the first publication
        if n > 0:
            break
        if lantern.publicationExists(doi):
            continue
        paper_data = {'doi': doi}
        doi = doi.replace("/", "-")
        pdf_dir = './papers/'
        if not os.path.exists(pdf_dir):
            os.mkdir(pdf_dir)
        pdfsavefile = './papers/' + doi + '.pdf'
        save_pdf(paper_data, filepath=pdfsavefile)
        # Extract the full text of the PDF, one page at a time
        reader = PyPDF2.PdfReader(pdfsavefile)
        save_txt_path = 'scrapped_txts/'
        if not os.path.exists(save_txt_path):
            os.mkdir(save_txt_path)
        extract_text = ''
        for page in reader.pages:
            extract_text += page.extract_text()
        txt_file = '{}.txt'.format(doi)
        with open(save_txt_path + txt_file, 'w') as file:
            file.write(extract_text)
        # Embed the extracted text, then store the fragments and a stub
        # publication record (title/pmc/pubmed are left blank here)
        txt_embs, emb = get_embeddings(save_txt_path + txt_file)
        fragments = []
        for txt, embs in txt_embs:
            fragments.append(Fragment(doi, 'methods', txt, embs))
        title = ""
        pmc = ""
        pubmed = ""
        publication = Publication(doi, title, pmc, pubmed, doi)
        lantern.insertEmbeddings(fragments)
        lantern.insertPublication(publication)
        os.remove(pdfsavefile)
def add_publication_by_doi(doi):
    """Download a single paper by DOI, extract its text, and store its embeddings."""
    lantern = Lantern()
    if lantern.publicationExists(doi):
        return
    paper_data = {'doi': doi}
    doi = doi.replace("/", "-")
    pdf_dir = './papers/'
    if not os.path.exists(pdf_dir):
        os.mkdir(pdf_dir)
    pdfsavefile = './papers/' + doi + '.pdf'
    save_pdf(paper_data, filepath=pdfsavefile)
    # Extract the full text of the PDF, one page at a time
    reader = PyPDF2.PdfReader(pdfsavefile)
    save_txt_path = 'scrapped_txts/'
    if not os.path.exists(save_txt_path):
        os.mkdir(save_txt_path)
    extract_text = ''
    for page in reader.pages:
        extract_text += page.extract_text()
    txt_file = '{}.txt'.format(doi)
    with open(save_txt_path + txt_file, 'w') as file:
        file.write(extract_text)
    # Embed the extracted text, then store the fragments and a stub
    # publication record (title/pmc/pubmed are left blank here)
    txt_embs, emb = get_embeddings(save_txt_path + txt_file)
    fragments = []
    for txt, embs in txt_embs:
        fragments.append(Fragment(doi, 'methods', txt, embs))
    title = ""
    pmc = ""
    pubmed = ""
    publication = Publication(doi, title, pmc, pubmed, doi)
    lantern.insertEmbeddings(fragments)
    lantern.insertPublication(publication)
    os.remove(pdfsavefile)
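
# Hypothetical usage, mirroring the commented-out calls in main() below:
#   add_publication_by_doi('10.1101/2023.10.31.564925')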
def process_result(result):
    """Convert a list of LLM answers into a (classification, response_text) pair."""
    if result is None:
        return (False, None)
    response = None
    for response in result:
        # Answers mentioning cryo are discarded: cryo-EM alone is not a hit
        if "cryo" in response.lower():
            return (False, None)
    if response is None:  # empty result list
        return (False, None)
    return (response.lower().startswith('yes'), response)
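
# Expected behaviour, for illustration:
#   process_result(None)                     -> (False, None)
#   process_result(["Yes, SAXS and FRET."])  -> (True, "Yes, SAXS and FRET.")
#   process_result(["No."])                  -> (False, "No.")
#   process_result(["Yes, cryo-EM only."])   -> (False, None)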
# Module-level database handle shared by get_embeddings_for_pub()
lantern = Lantern()

def get_embeddings_for_pub(id):
    """Load the stored (content, vector) pairs for a publication from Lantern."""
    input_texts = []
    input_embeddings = []
    if lantern.publicationExists(id):
        fragments = lantern.getAllFragmentsOfPublication(id)
        for fragment in fragments:
            input_texts.append(fragment.content)
            input_embeddings.append(fragment.vector)
    text_embeddings = list(zip(input_texts, input_embeddings))
    return text_embeddings
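
# Sketch of rebuilding a searchable index for one stored publication, assuming
# it was previously ingested (e.g. via add_publication_by_doi):
#   pairs = get_embeddings_for_pub(pub.id)
#   index = FAISS.from_embeddings(text_embeddings=pairs, embedding=OpenAIEmbeddings())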
def main():
    open_ai_emb = OpenAIEmbeddings()
    # add_publication_by_doi('10.1101/2023.10.31.564925')
    # add_publication_by_doi('10.1101/2023.03.03.531047')
    query = [f"You are reading a materials and methods section of a scientific paper. Here is the list of structural biology methods {methods_string}.\n\nDid the authors use any methods from the list?\n\nAnswer with Yes or No followed by the names of the methods."]
    lantern = Lantern()
    publications = lantern.getUnreadPublication()
    rows = []
    hits = 0
    for pub in publications[5:]:  # NOTE: skips the first five publications
        text_embeddings = get_embeddings_for_pub(pub.id)
        # Only query the LLM for papers that mention cryo-EM somewhere in the text
        flag = False
        for text, _ in text_embeddings:
            if re.search("cryo-?em", text, re.IGNORECASE):
                flag = True
                break
        if flag:
            faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb)
            result = llm.evaluate_queries(faissIndex, query)
            classification, response = process_result(result)
            hits += classification
        else:
            classification, response = process_result(None)
        rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""])

    from google_sheets import SpreadsheetUpdater
    gs = SpreadsheetUpdater()
    print(rows)
    gs.append_rows(rows)
    msg = f"""
This batch of paper analysis has concluded.
{len(rows)} papers were analyzed in total over the date range 11/2 - 11/3.
{hits} {"was" if hits == 1 else "were"} classified as having multi-method structural data.
"""
    print(msg)
    gs.email(message=msg)


if __name__ == '__main__':
    main()