# preprocess.py (forked from jianliu-ml/EventCausalityIdentification)
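"""Preprocessing for event causality identification: build the vocabulary
and a pretrained word-embedding table from the raw documents, then dump
everything needed for training to data.pickle."""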
import pickle
import random
import numpy as np


def build_embedding_table(word_map, wv_file, dim):
    """Return an embedding table aligned with word_map: pretrained vectors
    where available, small random vectors for out-of-vocabulary words."""
    # A comprehension avoids the aliasing pitfall of [list()] * n,
    # where every slot would share a single list object.
    res = [[] for _ in range(len(word_map))]
    print('Reading word vectors...')
    with open(wv_file, 'r') as fin:
        for idx, line in enumerate(fin):
            if idx % 50000 == 0:
                print('Reading...', idx)
            line = line.split()
            word = line[0]
            # Skip malformed lines and words outside the vocabulary.
            if len(line) != dim + 1 or word not in word_map:
                continue
            res[word_map[word]] = [float(t) for t in line[1:]]
    hits = len([x for x in res if len(x) > 0])
    print('Hit', hits)
    print('Unknown', len(word_map) - hits)

    def _random_vector(x):
        if len(x) == 0:
            # Uniform in [-sqrt(3/dim), sqrt(3/dim)], i.e. variance 1/dim.
            bound = np.sqrt(3.0 / dim)
            return [random.uniform(-bound, bound) for _ in range(dim)]
        return x

    return list(map(_random_vector, res))
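
# A minimal sanity check for build_embedding_table; 'toy_vectors.txt' is a
# hypothetical path whose lines look like "<word> v1 v2 ... v_dim".
# Uncomment to try:
# toy_map = {'<PAD>': 0, '<UNK>': 1, 'crash': 2}
# table = build_embedding_table(toy_map, 'toy_vectors.txt', 300)
# assert len(table) == len(toy_map)
# assert all(len(row) == 300 for row in table)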

if __name__ == '__main__':
    with open('document_raw.pickle', 'rb') as f:
        # The pickle protocol version is detected automatically, so we do
        # not have to specify it.
        documents = pickle.load(f)

    # First pass: build the vocabulary, reserving ids 0-3 for the
    # special tokens.
    word_map = {'<PAD>': 0, '<UNK>': 1, '<S>': 2, '</S>': 3}
    word_list = ['<PAD>', '<UNK>', '<S>', '</S>']
    for doc in documents:
        [all_token, ecb_star_events, ecb_coref_relations,
         ecb_star_time, ecbstar_events_plotLink, ecbstar_timelink,
         evaluation_data, evaluationcrof_data] = documents[doc]
        for token in all_token:
            _, _, _, word = token
            word = word.lower()
            if word not in word_map:
                word_map[word] = len(word_map)
                word_list.append(word)

    # Second pass: replace each token's surface form with its vocabulary id.
    for doc in documents:
        [all_token, ecb_star_events, ecb_coref_relations,
         ecb_star_time, ecbstar_events_plotLink, ecbstar_timelink,
         evaluation_data, evaluationcrof_data] = documents[doc]
        for i in range(len(all_token)):
            temp = list(all_token[i])
            temp[-1] = word_map[temp[-1].lower()]
            all_token[i] = temp

    wv_file = '/home/jliu/data/WordVector/GoogleNews-vectors-negative300.txt'
    dim = 300
    vec = build_embedding_table(word_map, wv_file, dim)

    with open('data.pickle', 'wb') as f:
        data = {
            'data': documents,
            'word_map': word_map,
            'word_list': word_list,
            'word_vector': vec,
        }
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
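
    # A minimal sketch of how downstream code might load the bundle; the
    # key names match the dict dumped above, and row i of 'word_vector' is
    # the vector for the word with id i in 'word_map'. Uncomment to try:
    # with open('data.pickle', 'rb') as f:
    #     bundle = pickle.load(f)
    # embeddings = np.array(bundle['word_vector'], dtype=np.float32)
    # assert embeddings.shape == (len(bundle['word_map']), dim)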

# Debug helpers: uncomment to inspect the parsed annotation structures.
# for doc in documents:
#     [all_token, ecb_star_events, ecb_coref_relations,
#      ecb_star_time, ecbstar_events_plotLink, ecbstar_timelink,
#      evaluation_data, evaluationcrof_data] = documents[doc]
#     for elem in ecb_star_events:
#         print(ecb_star_events[elem])
#     print()
#     for elem in ecb_coref_relations:
#         print(ecb_coref_relations[elem])
#     print()
#     for elem in ecb_star_time:
#         print(ecb_star_time[elem])
#     print()
#     for elem in ecbstar_events_plotLink:
#         print(elem, ecbstar_events_plotLink[elem])
#     print()
#     for elem in ecbstar_timelink:
#         print(elem, ecbstar_timelink[elem])
#     print()
#     for elem in evaluation_data:
#         print(elem)