# generate_json.py — forked from lvyilin/BaikeNRE
# (GitHub page chrome and the scraped line-number gutter were removed from this copy.)
# encoding=utf-8
import urllib.request
from bs4 import BeautifulSoup
import json
import os
import re
import string
import pymongo
import shutil
import pyltp
def inf_table(bs, d):
    """Collect the stripped text of every <tr> in every <table> into d['table'].

    Rows from all tables on the page are flattened into a single list,
    in document order. The key is always assigned (empty list when the
    page has no tables).
    """
    # The original kept an unused counter and built the list with manual
    # appends; a flat comprehension does the same work directly.
    d['table'] = [
        str(tr.get_text()).strip()
        for table in bs.find_all('table')
        for tr in table.find_all('tr')
    ]
def inf_infocox(bs, d):
    """Populate d['infobox'] with name->value pairs from the page's info boxes.

    Two layouts are handled:
      * 'star-info-block relations' divs, where each 'name' div holds the
        label in contents[0] and the value in contents[1];
      * the standard 'basic-info cmn-clearfix' box, where the n-th
        <dt class="basicInfo-item name"> is paired with the n-th
        <dd class="basicInfo-item value"> via the i/temp counters below.
    """
    # i counts dt elements seen so far (1-based) and selects which dd to
    # pair with the current dt.
    i = 1
    for k in bs.find_all('div', class_ = 'star-info-block relations'):
        for tr in k.find_all('div', class_ = 'name'):
            # contents[0] = label text, contents[1] = value; strip any <em>
            # markup that survives str() of the node.
            d['infobox'][str(tr.contents[0]).strip()] = str(tr.contents[1]).strip().replace('<em>','').replace('</em>','')
    for k in bs.find_all('div', class_='basic-info cmn-clearfix'):
        for tr in k.find_all('dt', class_="basicInfo-item name"):
            # Walk the dd list until the i-th one, take it, then stop.
            temp = 1
            for tt in k.find_all('dd', class_="basicInfo-item value"):
                if (temp != i):
                    temp = temp + 1
                    continue
                # Label cleanup: drop NBSPs and periods before using as key.
                tr_txt = tr.get_text().replace(u"\xa0", "").replace(".", "").strip()
                d['infobox'][tr_txt] = str(tt.get_text()).strip()
                break
            # NOTE(review): i is never reset between basic-info divs, so on a
            # page with more than one such box the dt/dd pairing of later
            # boxes is offset — confirm whether multi-box pages occur.
            i = i + 1
def inf_para(bs, d):
    """Join the text of every body 'para' div into d['body'].

    Paragraphs whose direct parent is the lemma-summary block are skipped
    (they belong to the abstract, not the body). Embedded newlines and
    NBSPs are removed; each paragraph is newline-terminated.
    """
    cleaned = [
        str(p.get_text()).replace('\n', '').replace('\xa0', '') + "\n"
        for p in bs.find_all('div', class_='para')
        if str(p.parent.get("class")) != "['lemma-summary']"
    ]
    d['body'] = ''.join(cleaned)
# def inf_summary(bs, d):
# str_list = []
# for k in bs.find_all('div', class_='lemma-summary'):
# str_list.append(str(k.get_text()).strip())
# d['abstract'] = ''.join(str_list)
def inf_summary(bs, d):
    """Store the lemma-summary paragraphs as one string in d['abstract'].

    Each 'para' div inside each 'lemma-summary' div contributes one
    newline-terminated piece with embedded newlines and NBSPs removed.
    """
    pieces = []
    for block in bs.find_all('div', class_='lemma-summary'):
        pieces.extend(
            str(para.get_text()).replace('\n', '').replace('\xa0', '') + '\n'
            for para in block.find_all('div', class_='para')
        )
    d['abstract'] = ''.join(pieces)
def inf_name(bs, d):
    """Set d['name'] from the <h1> inside the lemma-title <dd>, when present.

    Leaves d['name'] untouched if either the <dd> or its <h1> is missing.
    """
    dd = bs.find('dd', class_="lemmaWgt-lemmaTitle-title")
    h1 = dd.find("h1") if dd is not None else None
    if h1 is not None:
        d['name'] = str(h1.get_text().strip())
def inf_lables(bs, d):
    """Append the stripped text of every 'taglist' span to d["tags"].

    (The misspelled name 'inf_lables' is kept — callers depend on it.)
    """
    d["tags"].extend(
        str(span.get_text()).strip()
        for span in bs.find_all("span", class_="taglist")
    )
def inf_links(bs, d):
    """Map anchor text -> href for every in-site '/item/' link into d["links"].

    When several anchors share the same text, the last one in document
    order wins (dict assignment overwrites).
    """
    d["links"] = {
        str(anchor.get_text()).strip(): str(anchor['href'])
        for anchor in bs.find_all('a', href=re.compile(r"/item/"))
    }
def write_json(resdict, filename):
    """Serialize resdict as UTF-8 JSON (non-ASCII kept literal) into
    parse_data/<filename>.

    Creates the parse_data directory if missing — the original raised
    FileNotFoundError on a fresh checkout. Streams via json.dump instead
    of building the whole document string with dumps().
    """
    os.makedirs("parse_data", exist_ok=True)
    with open("parse_data/" + filename, "w", encoding="utf-8") as f:
        json.dump(resdict, f, ensure_ascii=False)
def load_entity():
    """Load person_relation.txt into {entity: (field1, field2)}.

    Each line holds three whitespace-separated fields. Using str.split()
    with no argument (instead of split(" ")) also discards the trailing
    newline that the original left attached to the last field. Lines with
    fewer than three fields (e.g. a blank trailing line, which previously
    raised IndexError) are skipped.
    """
    d = {}
    with open("person_relation.txt", "r", encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 3:
                continue
            d[parts[0]] = (parts[1], parts[2])
    return d
def load_person_entity_set():
    """Return the set of entity names listed one per line in person.txt.

    Trailing whitespace (including the newline) is stripped from each line.
    """
    with open("person.txt", "r", encoding="utf8") as f:
        return {line.rstrip() for line in f}
def build_relation_pattern(d):
    """Compile a regex alternation group matching any key of d.

    Keys are passed through re.escape so relation names containing regex
    metacharacters (e.g. '.', '+', '|') match literally — the original
    interpolated raw keys, which could mis-match or fail to compile.
    Joining with '|'.join also replaces the original concatenate-then-
    rstrip construction.
    """
    return re.compile(u"(" + u"|".join(map(re.escape, d)) + u")")
DATASET_PATH = "D:\\result\\"
# NOTE(review): the pyltp model loading (segmentor/postagger/recognizer from
# ltp_data_v3.4.0) was commented out in the original and is omitted here.

person_entity_set = load_person_entity_set()
for root, subdirs, files in os.walk(DATASET_PATH):
    for filename in files:
        # Skip images, names not in the person list, and pages already parsed.
        if filename.endswith(".jpg"):
            continue
        if filename not in person_entity_set:
            continue
        if os.path.isfile("parse_data/" + filename + ".json"):
            continue
        # Removed leftover debug filter (`if filename != '杨开慧46': continue`)
        # that restricted the run to a single page.
        file_path = os.path.join(root, filename)
        with open(file_path, "r", encoding="utf8") as f:
            # Fresh result skeleton per page.
            result_dict = {
                'name': "",
                'tags': [],
                'abstract': "",
                'infobox': {},
                'body': "",
                'table': []
            }
            try:
                bs = BeautifulSoup(f, "lxml")
                inf_lables(bs, result_dict)
                # Keep only pages carrying a '…人物' (person) tag.
                is_person_entity = any(
                    str(tag).endswith(u"人物") for tag in result_dict['tags']
                )
                if not is_person_entity:
                    continue
                inf_name(bs, result_dict)
                if result_dict['name'] == "":
                    # Fall back to the filename minus its numeric suffix.
                    result_dict['name'] = filename.rstrip(string.digits)
                inf_summary(bs, result_dict)
                inf_para(bs, result_dict)
                inf_infocox(bs, result_dict)
                inf_table(bs, result_dict)
                inf_links(bs, result_dict)
            except Exception as e:
                # Was a bare `except:` that hid the failure cause (and caught
                # KeyboardInterrupt); report which page failed and why.
                print("error:" + filename + " " + repr(e))
                continue
            write_json(result_dict, filename + ".json")