-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathindex.py
82 lines (71 loc) · 2.66 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import os
# Get the path of the folder that contains this file.
current_folder = os.path.dirname(os.path.abspath(__file__))
# Append that folder to sys.path so that imports can also be resolved from it.
sys.path.append(current_folder)
import pandas as pd
from rlt2t.predictor.predictor import Predictor
def get_t2t_model_paths():
    """Return the directories of the t2t sub-models as a list of paths.

    Every name listed below is resolved against ``<current_folder>/sub-models``;
    the returned list keeps the order in which the names are declared.
    """
    model_names = (
        'idea-bart-xl-0.2-rank',
        "idea-bart-xl-0.3",
        'uer-large-199-0.1-rank',
        'uer-large-199-0.2',
        'uer-base-139-0.1-142-rank',
    )
    sub_model_root = os.path.join(current_folder, 'sub-models')
    return [os.path.join(sub_model_root, name) for name in model_names]
def get_t2t_score_model_paths():
    """Return ``(paths, weights)`` for the t2t score sub-models.

    ``paths`` lists each name below resolved against
    ``<current_folder>/sub-models``, in declaration order. ``weights`` is
    always ``None`` here; it is returned so the caller can forward it as-is.
    """
    score_model_names = (
        'idea-bart-xl-0.2-rank',
        'uer-large-199-0.1-rank',
        'uer-large-199-0.2',
        'uer-base-139-0.1-142-rank',
        'fnlp-base-249-242-503650-rank',
    )
    sub_model_root = os.path.join(current_folder, 'sub-models')
    resolved = [os.path.join(sub_model_root, name) for name in score_model_names]
    return resolved, None
def invoke(input_data_path, output_data_path):
    """Predict one output text per input row and write the results as CSV.

    The input is read as a header-less CSV whose columns are taken to be
    ``report_id``, ``description`` and ``clinical`` (in that order). For each
    row the model input is ``"<clinical> 1799 <description>"``; the texts are
    passed to ``Predictor.predict_v2`` and the ``'output'`` field of each
    result is written next to the row's ``report_id``.

    Args:
        input_data_path: Path of the header-less input CSV.
        output_data_path: Path of the header-less, index-less output CSV with
            the columns ``report_id`` and ``prediction``.
    """
    split_token = '1799'
    text_columns = ["description", "clinical"]

    t2t_model_paths = get_t2t_model_paths()
    t2t_score_model_paths, weights = get_t2t_score_model_paths()
    predictor = Predictor(t2t_model_paths,
                          [],
                          t2t_score_model_paths,
                          score_model_weights=weights,
                          idf_path=os.path.join(current_folder, 'idf.json'),
                          beam_size_list=[10],
                          batch_size=256,
                          num_hypotheses=4)

    # Force the text columns to str: without this, a column whose cells all
    # look numeric is inferred as int/float and ``.strip()`` below would fail.
    df = pd.read_csv(input_data_path,
                     header=None, index_col=False,
                     names=["report_id", "description", "clinical"],
                     dtype={column: str for column in text_columns})
    # Missing cells are parsed as NaN (a float); normalise them to '' for BOTH
    # text columns so an absent description no longer crashes on ``.strip()``.
    df[text_columns] = df[text_columns].fillna('')
    records = df.to_dict('records')

    texts = [
        " ".join([record['clinical'].strip(), split_token, record['description'].strip()])
        for record in records
    ]

    predictions = predictor.predict_v2(texts, score_version='v1', self_boost=False, boost_size=3)
    output_texts = [item['output'] for item in predictions]

    # Index into output_texts (rather than zip) so a short prediction list
    # still raises instead of silently dropping rows.
    output_records = [
        {'report_id': record['report_id'], 'prediction': output_texts[idx]}
        for idx, record in enumerate(records)
    ]
    output_df = pd.DataFrame(output_records)
    output_df.to_csv(output_data_path, header=False, index=False)
if __name__ == '__main__':
    import time

    # Run once on the local test file and print the elapsed seconds.
    started_at = time.time()
    invoke('test.csv', 'output.csv')
    finished_at = time.time()
    print(finished_at - started_at)