# create_training.py
from nltk.tokenize import TreebankWordTokenizer
import datefinder
import random
import json
import re
st = TreebankWordTokenizer()
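# A quick look at how the tokenizer splits a paragraph (illustrative input,
# not from the source data):
#   st.tokenize("420 Fifth Ave, New York")
#   -> ['420', 'Fifth', 'Ave', ',', 'New', 'York']
# Punctuation becomes its own token, so commas never pollute the gazetteer
# lookups performed in getvec() below.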
# This is the hard (gazetteer) data used to build the feature vectors for a paragraph.
# most-used street names
with open('./database/hard_data/streets.json', 'r') as f:
    streets = json.load(f)
# names of all states in all countries
with open('./database/hard_data/states.json', 'r') as f:
    states = json.load(f)
# names of all cities of the world
with open('./database/hard_data/cities.json', 'r') as f:
    cities = json.load(f)
# names of all the countries of the world
with open('./database/hard_data/countries.json', 'r') as f:
    countries = json.load(f)
# to simulate a real blog, we take non-address paragraphs from real blogs
# and place them above and below an address
with open('./database/hard_data/garbage', 'r') as f:
    garbage = f.read()
# a sample list of 1000 restaurants
with open('./database/hard_data/cafes', 'r') as f:
    cafes = f.read()
# regular expression for finding phone numbers
rephone = re.compile(
    r'\+[0-9][0-9]*|\([0-9]{3}\)|[0-9]{4} [0-9]{4}|([0-9]{3,4}[- ]){2}[0-9]{3,4}|[0-9]{10}')
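# Illustrative matches for rephone (example strings, verified against the pattern):
#   '+14155550132'  via  \+[0-9][0-9]*
#   '(415)'         via  \([0-9]{3}\)
#   '1234 5678'     via  [0-9]{4} [0-9]{4}
#   '415-555-0132'  via  ([0-9]{3,4}[- ]){2}[0-9]{3,4}
#   '4155550132'    via  [0-9]{10}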
# regular expression that matches a paragraph consisting entirely of a short
# alphanumeric code (6-10 characters), e.g. a postal code on its own line
renum = re.compile(r'(?i)^[a-z0-9][a-z0-9\- ]{4,8}[a-z0-9]$')
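# Illustrative behaviour (the ^...$ anchors mean the whole line must be the code):
#   renum.findall('90210-1234')  ->  ['90210-1234']   (10 chars, ZIP+4 style)
#   renum.findall('SW1A 1AA')    ->  ['SW1A 1AA']     (8 chars, UK postcode style)
#   renum.findall('90210')       ->  []               (too short: minimum is 6 chars)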
garbage = garbage.split('\n')
garbage = [g for g in garbage if g != '']
cafes = cafes.split('\n')
cafes = [c for c in cafes if c != '']
lengths1 = []
lengths2 = []
# each street name carries a number indicating its popularity, e.g. Road, Drive,
# Lane are the most popular. We sum all the scores so we can normalize them later
summ = 0
for key in streets.keys():
    summ += streets[key]
summ = float(summ) / 3
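# Worked example with made-up scores: if streets == {'road': 6, 'drive': 3},
# then summ = (6 + 3) / 3 = 3.0, and a hit on 'road' later adds 6 / 3.0 = 2.0
# to the street-weight feature (vec[4]) in getvec().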

def generate_data():
    '''
    This method creates synthetic blogs containing hierarchical addresses. The Walmart
    dataset is highly regular, i.e. it is of the form:
        address1;
        city, state, postalcode;
        country
    To break this regularity (real-life blogs can use any format), we randomly
    omit the last two parts of the address, i.e. the phone number and the
    city, state and postcode line.
    Parameters
    ----------
    Returns
    -------
    References
    ----------
    Used addresses of all 5800 Walmart stores in the US
    '''
    labels1 = []
    with open('./database/hard_data/walmart-full.json') as addrs:
        addrs = json.load(addrs)
    addresses_train = []
    print "generating hierarchical addresses..."
    for i in range(len(addrs)):
        temp = []
        y = []
        cnt = 0
        rnum = random.random()
        gnum1 = -1
        gnum2 = -1
        # for selecting the number of garbage texts above and below the address
        while gnum1 < 0 or gnum2 < 0:
            gnum1 = int(random.gauss(10, 5))
            gnum2 = int(random.gauss(10, 5))
        # gnum1 = 0
        # gnum2 = 0
        temp += random.sample(garbage, gnum1)
        y += [0] * gnum1
        # always append address1 (every sample contains the street address)
        temp.append(addrs[i]['address']['address1'].encode('ascii', 'ignore'))
        cnt += 1
        if rnum > 0.05:
            temp.append(addrs[i]['address']['city'].encode('ascii', 'ignore') + ", " +
                        addrs[i]['address']['state'].encode('ascii', 'ignore') + ", " +
                        addrs[i]['address']['postalCode'].encode('ascii', 'ignore'))
            cnt += 1
        # don't put phone numbers in every sample, otherwise the model would
        # learn that a paragraph is an address only when it has a phone number
        if rnum > 0.6 and 'phone' in addrs[i]:
            temp.append(addrs[i]['phone'].encode('ascii', 'ignore'))
            cnt += 1
        y += [1] * cnt
        temp += random.sample(garbage, gnum2)
        y += [0] * gnum2
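        # Worked example of the label vector (made-up draws): with gnum1 = 2,
        # cnt = 3 (address1 + city/state/zip + phone) and gnum2 = 1, we get
        # y = [0, 0, 1, 1, 1, 0], aligned with the paragraphs in temp.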
        labels1 += y
        lengths1.append(len(y))
        # for i in range(len(y)):
        #     print (temp[i], y[i])
        addresses_train.append(temp)
    data_vec = []
    for i in range(len(addresses_train)):
        if i % 100 == 0:
            print i
        data_vec += getdet(addresses_train[i])
    with open("./database/features/train1", "w") as f:
        print >> f, addresses_train
    with open("./database/features/labels1.py", "w") as f1:
        print >> f1, labels1
    with open("./database/features/lengths1.py", "w") as f1:
        print >> f1, lengths1
    with open("./database/features/datavec1.py", "w") as f2:
        print >> f2, data_vec

def oneliners():
    '''
    This method creates synthetic blogs containing one-line addresses. The New York
    restaurants dataset is highly regular, i.e. it is of the form:
        address1; city, state, postalcode; country
    To break this regularity (real-life blogs can use any format), we randomly
    omit the last two parts of the address, i.e. the phone number and the
    city, state and postcode part.
    Parameters
    ----------
    Returns
    -------
    References
    ----------
    Used addresses of 6000 restaurants in New York
    '''
    with open('./database/hard_data/us_rest1.json') as rests:
        rests = json.load(rests)
    print "generating one line addresses..."
    # randomly select data of 6000 restaurants from the json file
    randlist = random.sample(range(1, len(rests['data'])), 6000)
    one_line_addrs = []
    idx = 0
    # in the rests dictionary, the address portions are at the given indices
    order = [9, 11, 12, 13, 14]
    labels2 = []
    for idx in randlist:
        str1 = ""
        temp = []
        # print idx
        y1 = []
        rnum = random.random()
        gnum1 = -1
        gnum2 = -1
        # for selecting the number of garbage texts above and below the address
        while gnum1 <= 0 or gnum2 <= 0:
            gnum1 = int(random.gauss(10, 5))
            gnum2 = int(random.gauss(10, 5))
        temp += random.sample(garbage, gnum1)
        # concatenate gnum1 number of 0s to indicate gnum1 garbage paragraphs
        y1 += [0] * gnum1
        # here too we randomly drop some parts of the one-line address, which
        # translates to chopping some portion off the back of the order list
        ordd = order
        if rnum < 0.5:
            ordd = order[:-1]
        if rnum < 0.4:
            ordd = ordd[:-1]
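        # Worked example (hypothetical draws): rnum = 0.45 keeps order[:-1] ->
        # [9, 11, 12, 13]; rnum = 0.30 additionally drops one more -> [9, 11, 12];
        # rnum >= 0.5 keeps the full [9, 11, 12, 13, 14].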
        for od in ordd:
            part = rests['data'][idx][od]
            if part is not None:
                str1 += part.encode("ascii", "ignore") + ", "
        # capitalize the first letter of every word
        str1 = str1.title()
        temp.append(str1)
        temp += random.sample(garbage, gnum2)
        # append a single 1, i.e. the label for this one-line address
        y1 += [1]
        # concatenate gnum2 number of 0s to indicate gnum2 garbage paragraphs
        y1 += [0] * gnum2
        lengths2.append(len(y1))
        labels2 += y1
        one_line_addrs.append(temp)
    data_vec = []
    for i in range(len(one_line_addrs)):
        if i % 100 == 0:
            print i
        data_vec += getdet(one_line_addrs[i])
    with open("./database/features/train2", "w") as f:
        print >> f, one_line_addrs
    with open("./database/features/labels2.py", "w") as f1:
        print >> f1, labels2
    with open("./database/features/lengths2.py", "w") as f1:
        print >> f1, lengths2
    with open("./database/features/datavec2.py", "w") as f2:
        print >> f2, data_vec

def getdet(data):
    '''
    This method computes the feature vectors for a single synthetic blog.
    It calls the getvec function for every paragraph inside it.
    Parameters
    ----------
    data : A list of paragraphs which forms a synthetic blog
    Returns
    -------
    feature_vec : A list of shape=(n_paragraphs, 9)
    '''
    feature_vec = []
    for i in range(len(data)):
        feature_vec.append(getvec([data[i]]))
    return feature_vec

def getvec(lines):
    '''
    features:
        number of streets (0), states (1), cities (2), countries (3): INT
        sum of weights of the streets: FLOAT (4)
        number of phone-number matches: INT (5)
        number of postal-code-like matches: INT (6)
        length of paragraph in tokens: INT (7)
        has date?: 0/1 (8)
    This method calculates the feature vector for a single paragraph using the
    above features.
    Parameters
    ----------
    lines : A list of paragraphs (in practice, a single-element list)
    Returns
    -------
    vec : A list of length 9
    '''
    vec = [0] * 9
    for line in lines:
        phnum = len(rephone.findall(line))
        nums = len(renum.findall(line))
        numterm = 0
        for terms in st.tokenize(line):
            numterm += 1
            # terms = terms.lower()
            if terms.lower() in streets:
                vec[0] += 1
                vec[4] += streets[terms.lower()] / float(summ)
            if terms in states:
                # state names are biased towards US and Australian addresses,
                # therefore we don't add their weights
                vec[1] += 1
            if terms in cities:
                vec[2] += 1
            if terms in countries:
                vec[3] += 1
        vec[5] = phnum
        vec[6] = nums
        vec[7] = numterm
        # datefinder returns a lazy generator; parsing errors only surface
        # while iterating, so the loop is wrapped in a try/except
        matches = datefinder.find_dates(line, strict=True)
        try:
            for match in matches:
                vec[8] = 1
                break
        except:
            pass
    return vec
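# Illustrative (hypothetical) call -- actual values depend entirely on the
# gazetteer JSON files loaded above:
#   getvec(["1600 Pennsylvania Avenue NW, Washington, DC 20500"])
# could return something like [1, 1, 1, 0, 0.02, 0, 0, 9, 0]: one street-type
# token ('avenue'), one state, one city, no country, a small street weight,
# no phone number, no standalone postal code, 9 tokens, no date.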

if __name__ == '__main__':
    generate_data()
    oneliners()