preprocessing.py
import os
import datetime
from urllib.parse import urlparse
'''
This file contains a set of functions used for data cleaning and extraction.
'''
logpath = 'cleaning.log'
fid = open(logpath, 'w')  # create an empty log file
fid.close()
'''
Extracts the events of a specified country and counts how often each
event-code root occurs.
'''
def get_specified_data(path, countryColumn, country, attrColumn):
    newpath = path + '_attr'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    flist = os.listdir(path)  # all files in the directory
    for f in flist:
        count = 0
        fname, newfname = os.path.join(path, f), os.path.join(newpath, f)
        dictionary = {}  # stores how many times each attribute occurs
        reader = open(fname, 'r', encoding='utf-8')
        writer = open(newfname, 'w', encoding='utf-8')
        for line in reader:
            columns = line.strip('\n').split('\t')
            for i in countryColumn:
                # exact matching (columns[i] == country) misses rows where the
                # field holds more than the country name, so use a substring match
                if country in columns[i]:
                    count = count + 1
                    key = columns[attrColumn][:2]  # for an event code such as 042, only the root 04 is needed
                    if key in dictionary:
                        dictionary[key] = dictionary[key] + 1
                    else:
                        dictionary[key] = 1
        writer.write(country + '\n')
        dict_writer(writer, dictionary)  # write the dictionary to a file
        writer.close()
        reader.close()
        log('attr processing of ' + fname + ' finished', logpath)  # log after each file is processed
    return newpath
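# Each output file therefore begins with the country name followed by one
# "root: count" line per event-code root, e.g. (hypothetical values):
#   Australia
#   04: 57
#   14: 3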
# write a dictionary to a file, one 'key: value' pair per line
def dict_writer(writer, dictionary):
    for key, value in dictionary.items():
        writer.write(key + ': ' + str(value) + '\n')
'''
Extracts the wanted columns (including the source URL) from events that
happened in the wanted country, and tallies how often each URL hostname occurs.
Parameters:
    path: the directory holding the tab-separated input files
    idxs: indices of the columns that should be kept in the output files
Returns the path of the output directory.
'''
def url_columns(path, idxs):
    log('start to get urls', logpath)
    urldic = {}
    newpath = 'url_' + path
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    flist = os.listdir(path)  # all files in the directory
    for f in flist:
        fname, newfname = os.path.join(path, f), os.path.join(newpath, f)
        reader = open(fname, 'r', encoding='utf-8')
        writer = open(newfname, 'w', encoding='utf-8')
        for line in reader:
            items = line.strip('\n').split('\t')
            try:
                # keep only rows where both country fields mention Australia
                if 'Australia' not in items[36] or 'Australia' not in items[43]:
                    continue
            except Exception as e:
                log(str(line) + ' error', logpath)
                log(str(e), logpath)
                continue  # skip malformed lines instead of processing them
            string = ''
            for i in range(0, len(items)):
                if i in idxs:
                    # column 57 holds the source URL: keep scheme://hostname
                    # alongside the full URL, and count hostname frequencies
                    if i == 57:
                        r = urlparse(items[i])
                        string = string + str(r.scheme) + '://' + str(r.hostname) + '\t' + items[i]
                        key = str(r.hostname)
                        if key in urldic:
                            urldic[key] = urldic[key] + 1
                        else:
                            urldic[key] = 1
                    else:
                        string = string + items[i] + '\t'
            writer.write(string + '\n')
        writer.close()
        reader.close()
        log('url extraction of ' + fname + ' finished', logpath)  # log after each file is processed
    sortBKs = sorted(urldic.items(), key=lambda t: t[1])
    with open('urllist.txt', 'a', encoding='utf-8') as uf:
        for key, value in sortBKs:
            uf.write(key + '\t' + str(value) + '\n')
    log('finish get urls', logpath)
    return newpath
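# urllist.txt accumulates one 'hostname<TAB>count' line per source host,
# sorted by ascending count, e.g. (hypothetical values):
#   news.example.com	1543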
'''
Removes unwanted columns: only the columns whose indices appear in idxs
are copied to the output files.
'''
def del_columns(path, idxs):
    newpath = 'del_' + path
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    flist = os.listdir(path)  # all files in the directory
    for f in flist:
        fname, newfname = os.path.join(path, f), os.path.join(newpath, f)
        reader = open(fname, 'r', encoding='utf-8')
        writer = open(newfname, 'w', encoding='utf-8')
        for line in reader:
            items = line.strip('\n').split('\t')
            string = ''
            for i in range(0, len(items)):
                if i in idxs:
                    string = string + items[i] + '\t'
            writer.write(string + '\n')
        writer.close()
        reader.close()
        log('delete ' + fname + ' finished', logpath)  # log after each file is processed
    return newpath
'''
used to record the processing progress
'''
def log(s, logpath):
    with open(logpath, 'a', encoding='utf-8') as logfile:
        logfile.write(s + ' ' + str(datetime.datetime.now()) + '\n')
columns = [1, 7, 17, 25, 26, 29, 30, 31, 32, 33, 34, 35, 36, 42, 43, 53, 54]
#clean_file_path = del_columns('data-2013/', columns)
#print('cleaned file path: ' + clean_file_path)
# notice: after deletion, the column index of an attribute will change!
#attr_path = get_specified_data(clean_file_path, [12], 'Australia', 4)
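
# A minimal end-to-end sketch of the intended pipeline (hypothetical usage,
# not part of the original driver): del_columns first shrinks the raw files,
# get_specified_data then works on the shifted indices, while url_columns
# expects the raw layout (country fields at 36/43, source URL at 57).
if __name__ == '__main__':
    clean_file_path = del_columns('data-2013/', columns)
    print('cleaned file path: ' + clean_file_path)
    # indices 12 and 4 refer to column positions *after* del_columns has run
    attr_path = get_specified_data(clean_file_path, [12], 'Australia', 4)
    # run url_columns on the raw files so its hard-coded indices line up
    url_path = url_columns('data-2013/', columns + [57])
    print('outputs in: ' + attr_path + ' and ' + url_path)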