-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_parse.py
341 lines (295 loc) · 12.6 KB
/
text_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
import sys
import glob
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
wnl = WordNetLemmatizer()
import re
# Print numpy arrays in full while debugging. The threshold has to be a real
# number: current numpy releases reject the old 'nan' string trick, and
# sys.maxsize is the documented way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)

# Corpus-wide vocabulary. The three lists are parallel: index i of each one
# describes the same word.
vocab = []
vocabCount = []  # Counts the number of occurrences of a word
vocabAppearance = []  # Counts the number of documents vocab appears in

# Thresholds handed to removeWords2 by main: a word is dropped when it has
# fewer than minCount occurrences or appears in fewer than minAppearance docs.
minCount = 5
minAppearance = 3

# Information on what words are invalid
# invalidSentences TODO
# Words of this length or shorter are rejected by getValidWord
invalidLength = 3
# Stores characters to be removed
invalidCharacters = ['.', ',', '"', "'", '-', '_', '(', ')', '<', '>', '[', ']', '?', ':', ';',
                     '!', '@', '#', '$', '%', '^', '&', '*', '/', '\r', '\n',
                     '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
# Stores any words to be ignored
invalidWords = ["that", "with", "from", "were", "would", "this", "only", "until",
                "they", "their", "have", "which", "also", "could", "these", "such", 'been',
                "into", "than", "then", "more", "about", "will", "company", "business",
                "there", "shall", "other", "through", "when", "even", "most", "between",
                "over", "some", "well", "what", "where", "like", "including", "them",
                "because", "while", "after", "each", "said",
                "january", "february", "march", "april", "may", "june", "july", "august",
                "september", "october", "november", "december"]

# Pronouns tallied by getWordGender, and the per-document tallies themselves
# (index = document number)
wordsF = ["she", "her", "hers"]
wordsM = ["he", "him", "his"]
countF = []
countM = []
# Empties train file
def clearTrain():
    """Reset output/train.ldac to an empty file and report it on stdout."""
    # Opening in "w" mode truncates any data left over from a previous run
    with open("output/train.ldac", "w") as trainFile:
        trainFile.write("")
    print("Cleared train")
# Cleans out any bad characters from a word, and returns that word
def cleanWord(word, invalidChars=None):
    """Return `word` with undecodable bytes and every invalid character removed.

    word         -- the raw token, either a byte string or a text string
    invalidChars -- optional container of characters to strip; defaults to the
                    module-level invalidCharacters list
    """
    if invalidChars is None:
        invalidChars = invalidCharacters
    # Only byte strings need the UTF-8 round trip. Calling .decode() on text
    # fails outright on Python 3 and on non-ASCII unicode on Python 2.
    if isinstance(word, bytes):
        decoded = word.decode('utf-8', 'ignore')
        # On Python 2 `str` is `bytes`, so keep handing back byte strings there
        word = decoded.encode('utf-8') if str is bytes else decoded
    # Single pass instead of splitting and re-joining once per bad character
    return "".join(c for c in word if c not in invalidChars)
def getWordGender(word, docNum):
    """Add `word` to the female/male pronoun tally of document `docNum`.

    Updates the module-level countF / countM lists in place; words that are in
    neither wordsF nor wordsM leave the tallies unchanged. Returns nothing.
    """
    # Grow both tallies until docNum is a valid index. One append is not enough
    # when an earlier document never reached this function (empty or
    # unreadable file), which used to end in an IndexError below.
    while len(countF) <= docNum:
        countF.append(0)
        countM.append(0)
    # The pronoun lists are lower case but the caller passes the word before
    # it has been lower-cased, so "She" / "He" would otherwise be missed.
    lowered = word.lower()
    if lowered in wordsF:
        countF[docNum] += 1
    elif lowered in wordsM:
        countM[docNum] += 1
# Returns the normalised form of word, or an empty string if word is invalid
def getValidWord(word):
    """Lower-case and lemmatize `word`; return "" when it should be ignored.

    A word is ignored when its length is invalidLength or less, or when its
    lemmatized form is listed in invalidWords.
    """
    # Check word is sufficiently long
    if len(word) <= invalidLength:
        return ""
    # Change word to lower case
    word = word.lower()
    # Getting the root of the word with NLTK
    try:
        word = wnl.lemmatize(word)
    except UnicodeDecodeError:
        # Fall through with the un-lemmatized word rather than dropping it
        print("Word " + word + " could not be lemmatized.")
    # Check if word itself is invalid
    if word in invalidWords:
        return ""
    return word
def parseDoc(filename, docNum):
    """Count the valid words of one document and update the global vocabulary.

    filename -- path of the text file to read
    docNum   -- index of the document, forwarded to getWordGender

    Every space-separated token is cleaned, passed to getWordGender, and then
    validated. Valid words update vocab / vocabCount / vocabAppearance.

    Returns an (n, 2) integer array of [vocab index, count] rows in first-seen
    order. When the file cannot be read an empty (0, 2) array is returned, so
    callers can treat "unreadable" and "no valid words" the same way.
    """
    # Boilerplate stripped from every line before it is tokenised
    boilerplate = (
        "Copyright .* Harvard Business School.",
        "This document is authorized for use only by .*",
        "Do Not Copy or Post",
    )
    rows = []    # [vocab index, count] pairs for this document
    rowOf = {}   # vocab index -> position of its pair inside rows
    try:
        with open(filename) as textfile:
            for line in textfile:
                for pattern in boilerplate:
                    line = re.sub(pattern, '', line)
                for word in line.split(" "):
                    word = cleanWord(word)
                    # Gender is tallied on the cleaned word, before validation
                    getWordGender(word, docNum)
                    word = getValidWord(word)
                    if word == "":
                        continue
                    try:
                        vocabIndex = vocab.index(word)
                    except ValueError:
                        # New word - append to vocab and to this document
                        vocab.append(word)
                        vocabCount.append(1)
                        vocabAppearance.append(1)
                        rowOf[len(vocab) - 1] = len(rows)
                        rows.append([len(vocab) - 1, 1])
                        continue
                    # Word exists in vocab - increment its total count
                    vocabCount[vocabIndex] += 1
                    if vocabIndex in rowOf:
                        # Seen before in this document
                        rows[rowOf[vocabIndex]][1] += 1
                    else:
                        # First time in this document: one more appearance
                        rowOf[vocabIndex] = len(rows)
                        rows.append([vocabIndex, 1])
                        vocabAppearance[vocabIndex] += 1
    except IOError:
        print("File " + filename + " does not exist")
        return np.empty(shape=(0, 2), dtype=int)
    if not rows:
        # np.array([]) would be 1-D; keep the two-column shape callers index
        return np.empty(shape=(0, 2), dtype=int)
    return np.array(rows, dtype=int)
def removeWords2(file_trains, minCount, minAppearance):
    """Drop rare words from the vocabulary and from every document's counts.

    file_trains   -- list of (n, 2) integer arrays of [vocab index, count]
                     rows; entries are replaced in place
    minCount      -- words with fewer total occurrences are removed
    minAppearance -- words appearing in fewer documents are removed

    Afterwards vocab, vocabCount and vocabAppearance hold only the surviving
    words (lists mutated in place). Each array in file_trains is sorted by
    word index, has lost the rows of removed words, and has its word indices
    renumbered to match the shrunken vocabulary.
    """
    print("Removing Words...")
    keep = [
        count >= minCount and appearance >= minAppearance
        for count, appearance in zip(vocabCount, vocabAppearance)
    ]
    keepMask = np.array(keep, dtype=bool)
    # newIndex[old] is the position of word `old` once rare words are gone;
    # entries for removed words are never read because their rows are dropped
    newIndex = np.cumsum(keepMask) - 1
    for j, train in enumerate(file_trains):
        # Sort rows by word index, then filter and renumber in one pass
        train = train[np.argsort(train[:, 0], kind='mergesort')]
        train = train[keepMask[train[:, 0]]]
        train[:, 0] = newIndex[train[:, 0]]
        file_trains[j] = train
    # Slice assignment keeps the module-level list objects alive for callers
    vocab[:] = [word for word, kept in zip(vocab, keep) if kept]
    vocabCount[:] = [count for count, kept in zip(vocabCount, keep) if kept]
    vocabAppearance[:] = [seen for seen, kept in zip(vocabAppearance, keep) if kept]
# def removeWords(train, minCount, minAppearance):
# # We are removing words that are insignificant: they do not appear enough times or in enough documents.
# toRemove = []
# for i in train[:, 0]:
# if vocabCount[i] < minCount:
# # print "Word " + vocab[i] + " only has " + str(vocabCount[i]) + " counts."
# toRemove.append(i)
# elif vocabAppearance[i] < minAppearance:
# # print "Word " + vocab[i] + " only has " + str(vocabAppearance[i]) + " appearances."
# toRemove.append(i)
# # Word is insignificant - remove it
# toRemove = sorted(toRemove, reverse=True)
# for i in toRemove:
# train = train[(train[:, 0] != i)]
# # raw_input("Press Enter to continue...")
# return train
def formatTrain(train):
    """Render one document's word counts as a single train-file line.

    The line starts with the number of rows in `train`, followed by one
    space-separated "index:count" field per row, in row order.
    """
    fields = [str(len(train))]
    for row in train:
        fields.append(str(row[0]) + ":" + str(row[1]))
    return " ".join(fields)
def writeToTrain(line):
    """Append `line` plus a trailing newline to output/train.ldac."""
    # Append mode: earlier lines written during this run are preserved
    with open("output/train.ldac", "a") as trainFile:
        trainFile.writelines([line, "\n"])
# def cleanVocab():
# for i in reversed(range(len(vocab))):
# if vocabCount[i] < minCount or vocabAppearance[i] < minAppearance:
# vocab.pop(i)
# vocabCount.pop(i)
# vocabAppearance.pop(i)
def writeVocab():
    """Write the vocabulary to output/vocab.txt, one word per line.

    The file always ends with a newline, so an empty vocabulary produces a
    file holding a single newline character.
    """
    with open("output/vocab.txt", "w") as vocabFile:
        # join() places the separators in one pass; the previous
        # vocab.index(word) lookup per word made this quadratic
        vocabFile.write("\n".join(vocab))
        vocabFile.write("\n")
    print("Wrote Vocab")
def getGenderStats(files):
    """Write a one-line gender verdict per document to output/genders.txt.

    files -- document names, indexed by document number

    For each entry in countF the share of female pronouns among all counted
    pronouns is computed (0.5 when none were counted). Above 0.7 the document
    is reported as mostly female, below 0.3 as mostly male, otherwise unknown.
    """
    with open("output/genders.txt", "w") as report:
        for i, females in enumerate(countF):
            pronouns = females + countM[i]
            if pronouns != 0:
                percentF = float(females) / pronouns
            else:
                # No pronouns counted: sit exactly between the two verdicts
                percentF = 0.5
            if percentF > 0.7:
                verdict = " is mostly female (" + str(percentF*100) + "%).\n"
            elif percentF < 0.3:
                verdict = " is mostly male. (" + str((1 - percentF)*100) + "%).\n"
            else:
                verdict = " is unknown.\n"
            report.write("Doc " + files[i] + verdict)
    print("Wrote genders.")
def main(argv):
    """Parse the given documents and write the train, vocab and gender files.

    argv -- list of file names or glob patterns (without the script name)

    Prints a usage line and exits when argv is empty. Otherwise every matched
    file is parsed, rare words are removed, and the results are written via
    writeToTrain, writeVocab and getGenderStats.
    """
    if len(argv) < 1:
        print("Usage: input_file")
        sys.exit()

    # Allows users to specify all files in a directory
    files = []
    for pattern in argv:
        files.extend(glob.glob(pattern))

    # Clear the train file in case of old data
    clearTrain()

    # One array per successfully parsed document; documents differ in how
    # many distinct words they hold, so a plain list is used
    file_trains = []
    for docNum, input_file in enumerate(files):
        train = parseDoc(input_file, docNum)
        if not train.any():
            print("ERROR")
            continue
        file_trains.append(train)
        print("Parsed file: " + input_file)
    print("Parsed " + str(len(file_trains)) + " files.")

    removeWords2(file_trains, minCount, minAppearance)

    # Store each document's word data as one line of the train file
    for train in file_trains:
        writeToTrain(formatTrain(train))
    print("Wrote train")

    # Writes the discovered vocabulary to file
    writeVocab()
    print("Found " + str(len(vocab)) + " valid words.")
    getGenderStats(files)
# Call the main function to start the program when run as a script
# main()
if __name__ == '__main__':
    # Doing this so that main can be called from both the
    # command line as well as another script as a function call.
    # sys.argv[0] is the script name, so only the remaining arguments
    # (the input file names / patterns) are handed to main.
    main(sys.argv[1:])