step1.py
import json
import regex
import nltk.data
from nltk.tokenize import word_tokenize
import sys
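
# Pre-trained Punkt sentence tokenizer for English; assumes the NLTK 'punkt'
# data has been downloaded beforehand (e.g. via nltk.download('punkt')).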
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')


def tokenize(string):
    return word_tokenize(string)


def split_paragraphs(text):
    """
    Skip the title line and 'Section::' headers, join the remaining
    paragraphs into one string, and split it into sentences.
    """
    splits = regex.split(r'\n+', text)
    paras = []
    for split in splits[1:]:  # the first split is the article title
        split = split.strip()
        if len(split) == 0:
            continue
        if 'Section::' in split:
            continue
        paras.append(split)
    paras = " ".join(paras)
    return sent_detector.tokenize(paras)


def split_sent(sent):
    """
    Split a sentence on <a ...>...</a> anchor tags and return
    (tokens, ratio, count): a list of (text, href-or-None) pairs,
    the fraction of entries that are hyperlinks, and the hyperlink count.
    """
    strings = regex.split('<a |</a>', sent)
    new_strings = []
    count = 0
    for s in strings:
        s = s.strip()
        if s:
            if 'href=' in s:
                # Cut off the 'href="' prefix by length; lstrip('href="') would
                # strip any of the characters h, r, e, f, =, " and mangle URLs
                # that start with 'http'.
                s = s[len('href="'):]
                href, text = s.split('">', 1)
                new_strings.append((text, href))
                count += 1
            else:
                ss = tokenize(s)
                new_strings.extend([(_, None) for _ in ss])
    if not new_strings:
        return new_strings, 0.0, count
    return new_strings, count / len(new_strings), count


# Stream the dump line by line, keep sentences that contain enough
# hyperlinks, and write the filtered records to out-more.json.
fw = open('out-more.json', 'w')
with open('en.json', 'r') as f:
    for i, line in enumerate(f):
        data = json.loads(line)
        entry = {"id": data['id'], "url": data['url'], 'title': data['title']}
        outputs = []
        if len(data['text']) > 50:
            try:
                sents = split_paragraphs(data['text'])
                for sent in sents:
                    if len(sent) < 400:
                        output, ratio, count = split_sent(sent)
                        # Keep sentences with at least two links, >= 10% link
                        # entries, >= 8 tokens, and an uppercase first character.
                        if count > 1 and ratio >= 0.10 and len(output) >= 8 and output[0][0][0].isupper():
                            text = [_[0] for _ in output]
                            hyperlink = [_[1] for _ in output]
                            outputs.append((text, hyperlink))
            except Exception:
                pass
        if len(outputs) > 0:
            entry['text'] = outputs
            fw.write(json.dumps(entry) + '\n')
        sys.stdout.write('finished {}/{} \r'.format(i, 5989879))  # hard-coded article total
fw.close()
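
# A minimal sketch of the expected I/O, assuming `en.json` holds one
# WikiExtractor-style JSON object per line (the field names match those read
# above; the sample values below are hypothetical):
#
#   {"id": "12", "url": "https://en.wikipedia.org/wiki?curid=12",
#    "title": "Anarchism",
#    "text": "Anarchism\nAnarchism is a <a href=\"political philosophy\">political
#    philosophy</a> ...\nSection::History.\n..."}
#
# Each record written to out-more.json replaces 'text' with a list of
# (tokens, hyperlinks) pairs, where hyperlinks[j] is the href for tokens[j]
# or None for plain words.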