# -*- coding: utf-8 -*-
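#
# Evaluation of the pretrained CatalysisIE base model: its predictions on publication abstracts are
# compared with manual annotations (apparently a Label Studio-style JSON export, './Project_new1.json'),
# and per-abstract recall/precision statistics are written to './out_dict_base_mod.json'.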
from text_mining import load_classes_chebi, delete_files_in_directory, run_text_mining, add_publication, text_prep, CatalysisIE_search
from preprocess_onto import *
from onto_extension_new import preprocess_classes, create_classes_onto
from txt_extract import get_abstract, get_metadata
from CatalysisIE.model import *
from CatalysisIE.utils import *
from pubchempy import get_compounds
import statistics
import json
import csv
import glob
import pybliometrics

# Initialise pybliometrics (Scopus API configuration)
pybliometrics.scopus.init()
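

# Helper functions: set_config_key promotes config entries to module-level globals; find_id_by_text and
# id_from_abstr_snippet locate the manually annotated entry whose text matches a given abstract.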
def set_config_key(key, value):
    globals()[key] = value


def find_id_by_text(nested_list, search_text):
    # Return the index of the first entry whose "data" -> "text" contains search_text.
    for item in nested_list:
        if search_text in item["data"]["text"]:
            return nested_list.index(item)
    return None

def id_from_abstr_snippet(nested_list, search_text):
    # Match progressively shorter leading snippets of the abstract against the annotation entries.
    for i in range(len(search_text), 8, -5):
        res = find_id_by_text(nested_list, search_text[1:i])
        if res is not None:
            return res
    return None
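

# Every key/value pair from config.json becomes a module-level global; the corpus path ('path') and the
# BERT model name ('bert_name') used below are presumably provided this way (or by the wildcard imports).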
with open("config.json") as json_config:
for key, value in json.load(json_config).items():
set_config_key(key, value)
# TODO: Checkpoint dynamisch einbauen!
#ckpt_name = "CatalysisIE/checkpoint/train_demo1_checkpoint-v4.ckpt"
ckpt_name = "CatalysisIE/checkpoint/CV_0.ckpt"
model = BERTSpan.load_from_checkpoint(ckpt_name, model_name=bert_name, train_dataset=[], val_dataset=[], test_dataset=[])
out_dict = {}
label_list = ["Reactant","Product", "Characterization","Reaction","Catalyst","Treatment"]
import csv
with open('./import/DOI-Seed.csv') as csvfile:
doi_list = list(csv.reader(csvfile))
doi_list
with open("./Project_new1.json") as f:
label_data = json.load(f)
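
# Main evaluation loop: walk the corpus, fetch each abstract, run the CatalysisIE model on it and
# compare the predicted entities with the manual annotations in label_data.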
for i in glob.iglob(path):
    abstract = None
    title, doi, publisher = get_metadata(i)
    if doi is None:
        continue
    print(title + ' : ' + doi)
    abstract = get_abstract(i, doi, publisher)
    if abstract is not None:
        #p_id = add_publication(doi,title,abstract)
        #if p_id == None:
        #    print('p_id = None')
        #    continue
        sents = text_prep(abstract)
        categories, chem_list, reac_dict, sup_cat, abbreviation, raw_entities = CatalysisIE_search(model, sents)
        entry_annotation_lst = id_from_abstr_snippet(label_data, abstract)
        if entry_annotation_lst is None:
            print("PROBLEM WITH fetched ABSTRACT! Check for misplaced characters like the ligature 'ﬁ' instead of 'fi'")
            print(abstract[1:40])
            continue  # no matching manual annotation entry; skip this abstract
        label_dict_manual = {lab: 0 for lab in label_list}
        label_dict_model = {lab: 0 for lab in label_list}
        word_list_man = []
        word_list_mod = []
        label_man_index = {}
        out_dict[entry_annotation_lst] = []
"""
out_dict = {1: {man: int, man_labels: {label1 : int, label2: int...}, model_name: int, model_name_labels: {label1:int, label2:int,...}, ...}, {2:...},...}
"""
# Searches for similar entries in manually annotated and automatically annotated data
for item in label_data[entry_annotation_lst]["annotations"][0]["result"]:
label_dict_manual[item["value"]["labels"][0]] += 1
word_list_man.append(item["value"]["text"])
label_man_index[item["value"]["text"]] = item["value"]["labels"][0]
category_keys = categories.keys()
#list(set(, chem_list))
if item["value"]["text"].rstrip('s') in [s.rstrip('s') for s in category_keys]:
try:
if categories[item["value"]["text"]] == item["value"]["labels"][0]:
# contained in both manually and automatically labeled data
label_dict_model[item["value"]["labels"][0]] += 1
word_list_mod.append(item["value"]["text"])
except:
if categories[item["value"]["text"].rstrip('s')] == item["value"]["labels"][0]: # try again without trailing s in model-labeled data
# contained in both manually and automatically labeled data
label_dict_model[item["value"]["labels"][0]] += 1
word_list_mod.append(item["value"]["text"])
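        # Tokens from chem_list (model-detected chemicals) that were not matched above: if the manual
        # annotation provides a label for the token, count it there; otherwise fall back to the
        # subsumption heuristic below.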
        for chem in chem_list:
            if chem not in word_list_mod:  # and chem in list(label_man_index.keys()):
                try:
                    print("Adding '{}' with label {}".format(chem, label_man_index[chem]))
                    label_dict_model[label_man_index[chem]] += 1
                    word_list_mod.append(chem)
                except KeyError:
                    word_list_mod.append(chem)
                    # Token detected without classification -> subsume according to Fig. 2 of Zhang et al.,
                    # https://github.com/nsndimt/CatalysisIE/blob/main/JCIM2022.pdf
                    if label_dict_model["Catalyst"] != label_dict_manual["Catalyst"]:
                        label_dict_model["Catalyst"] += 1
                        print("Correct label for '{}' not clear. Assumed as Catalyst".format(chem))
                    elif label_dict_model["Product"] != label_dict_manual["Product"]:
                        label_dict_model["Product"] += 1
                        print("Correct label for '{}' not clear. Assumed as Product".format(chem))
                    elif label_dict_model["Reactant"] != label_dict_manual["Reactant"]:
                        label_dict_model["Reactant"] += 1
                        print("Correct label for '{}' not clear. Assumed as Reactant".format(chem))
                    elif label_dict_model["Reaction"] != label_dict_manual["Reaction"]:
                        label_dict_model["Reaction"] += 1
                        print("Correct label for '{}' not clear. Assumed as Reaction".format(chem))
                    elif label_dict_model["Characterization"] != label_dict_manual["Characterization"]:
                        label_dict_model["Characterization"] += 1
                        print("Correct label for '{}' not clear. Assumed as Characterization".format(chem))
                    else:
                        label_dict_model["Treatment"] += 1
                        print("Correct label for '{}' not clear. Assumed as Treatment".format(chem))
            else:
                print("Correct label for '{}' not clear, entry omitted.".format(chem))
        num_lab_man = len(set(word_list_man))
        num_lab_mod = len(set(word_list_mod))
        recall = num_lab_mod / num_lab_man
        prec_classes = []
        for lab_class in list(label_dict_manual.keys()):
            if label_dict_manual[lab_class] != 0:
                prec_classes.append(label_dict_model[lab_class] / label_dict_manual[lab_class])
        deviation = statistics.stdev(prec_classes)
        prec = sum(prec_classes) / len(prec_classes)
        out_dict[entry_annotation_lst] = {"man": num_lab_man, "man_labels": label_dict_manual,
                                          "base_model": num_lab_mod, "base_model_labels": label_dict_model,
                                          "recall": recall, "precision": prec, "st_dev": deviation,
                                          "doi": doi, "token_man": label_man_index,
                                          "token_mod": list(set(word_list_mod)), "abstract": abstract}

with open("./out_dict_base_mod.json", 'w') as f:
    json.dump(out_dict, f)