-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtop_words.py
executable file
·113 lines (106 loc) · 3.99 KB
/
top_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
#coding=utf-8
from __future__ import division
import os
import sys
# Filled by extract_feature:
#   tokens_dict[file_name][token] = [frequency, left, other_include, other_exclude]
# where the first two values are copied from doc_dict and the last two are
# accumulated over every other loaded file.
tokens_dict = {}
# Filled by read_tokens: doc_dict[file_name][word] = [frequency, left]
doc_dict = {}
# Filled by read_tokens: doc_num[file_name] = frequency + left taken from the
# last line of that file (0 for an empty file).
doc_num = {}
############################################################################
# read tokens from file_name into doc_dict
############################################################################
def read_tokens(file_name):
    """Load one tab-separated token file into the module-level tables.

    Every line must contain exactly three tab-separated fields: ``word``,
    ``frequency`` and ``left``.  The last two are parsed with ``int``
    (presumably the number of documents with / without the word -- confirm
    against whatever produces these files).

    Side effects:
        ``doc_dict[file_name]`` is set to ``{word: [frequency, left]}`` and
        ``doc_num[file_name]`` to ``frequency + left`` of the last line read
        (0 when the file is empty).

    A line that does not split into three fields prints the file name, the
    1-based line number and the fields, then terminates the process through
    ``sys.exit(-1)``.  Non-integer counts raise ``ValueError``.
    """
    global doc_dict
    global doc_num
    doc_tokens = {}
    total_doc = 0
    # The context manager closes the file on every path, including the
    # SystemExit raised for a malformed line.
    with open(file_name, 'r') as f:
        for line_num, line in enumerate(f, 1):
            terms = line.strip().split('\t')
            if len(terms) != 3:
                print("%s %s %s" % (file_name, line_num, terms))
                sys.exit(-1)
            word, frequency, left = terms
            frequency = int(frequency)
            left = int(left)
            doc_tokens[word] = [frequency, left]
            # Overwritten on every line: only the last row's sum is kept as
            # the file's total.
            total_doc = frequency + left
    doc_dict[file_name] = doc_tokens
    doc_num[file_name] = total_doc
############################################################################
# read every file under a directory, recursively
############################################################################
def fetch_all_file(dir_name):
    """Visit every entry below ``dir_name`` and load the non-directories.

    Sub-directories are descended into recursively; every other entry is
    passed to ``read_tokens``.  Paths are built as
    ``dir_name + os.sep + entry``.
    """
    for entry in os.listdir(dir_name):
        path = dir_name + os.sep + entry
        # Directories recurse, anything else is treated as a token file.
        handler = fetch_all_file if os.path.isdir(path) else read_tokens
        handler(path)
############################################################################
# score each token with the chi-square (X^2) statistic
############################################################################
def _chi_square(in_with, in_without, out_with, out_without, total):
    """Return the chi-square statistic of a 2x2 table, or 0.0 if degenerate.

    A zero marginal (for example a single input file, or a token found in
    every document) makes the denominator zero; such a token is scored 0.0
    instead of raising ZeroDivisionError.
    """
    denominator = ((in_with + in_without) * (in_with + out_with)
                   * (out_with + out_without) * (in_without + out_without))
    if denominator == 0:
        return 0.0
    delta = in_with * out_without - in_without * out_with
    return (delta * delta * total) / denominator


def extract_feature(dir_name):
    """Score every token of every loaded file and write the rankings.

    Pass 1 fills ``tokens_dict[doc][token]`` with
    ``[frequency, left, other_include, other_exclude]``: the file's own two
    counts from ``doc_dict`` followed by the sums over all *other* files.  A
    file that does not list the token contributes its whole ``doc_num`` to
    ``other_exclude``.

    Pass 2 maps each token to ``[x2, weight]`` where ``x2`` is the chi-square
    statistic of those four counts and ``weight`` is
    ``N / (frequency + other_include)`` with ``N`` the sum of ``doc_num``.
    Both values fall back to 0.0 when their denominator is zero.  The result
    for each file is handed to ``output(dir_name, doc, scores)``.

    Args:
        dir_name: directory passed through to ``output`` as its ``out_dir``.

    Exits through ``sys.exit(-1)`` if an entry of ``tokens_dict`` does not
    hold exactly four counts.
    """
    global doc_dict
    global doc_num
    global tokens_dict

    # Pass 1: build the per-token contingency counts for every file.
    for doc in doc_dict:
        counts = {}
        for token, own in doc_dict[doc].items():
            other_include_doc = 0
            other_exclude_doc = 0
            for other_doc in doc_dict:
                if other_doc == doc:
                    continue
                other = doc_dict[other_doc].get(token)
                if other:
                    other_include_doc += other[0]
                    other_exclude_doc += other[1]
                else:
                    # Token absent from that file: all of its documents
                    # count as "without the token".
                    other_exclude_doc += doc_num[other_doc]
            # Copy so the rows stored in doc_dict are never mutated.
            counts[token] = list(own) + [other_include_doc, other_exclude_doc]
        tokens_dict[doc] = counts

    # Pass 2: score each token; output() decides how many are kept.
    total_docs = sum(doc_num.values())
    for doc in tokens_dict:
        scores = {}
        for token, cell in tokens_dict[doc].items():
            if len(cell) != 4:
                print("Wrong in tokens_dict: %s %s %s" % (doc, token, cell))
                sys.exit(-1)
            x2 = _chi_square(cell[0], cell[1], cell[2], cell[3], total_docs)
            containing = cell[0] + cell[2]
            # A token that no document contains has no meaningful ratio.
            weight = total_docs / containing if containing else 0.0
            scores[token] = [x2, weight]
        output(dir_name, doc, scores)
############################################################################
# output the result
############################################################################
def output(out_dir, source_file, tokens, top_n=3000):
    """Write the best-scoring tokens of one source file into ``out_dir``.

    The output file is named after the last ``os.sep``-separated component
    of ``source_file``.  Each line is ``token<TAB>weight``; lines are ordered
    by descending score.

    Args:
        out_dir: existing directory that receives the result file.
        source_file: path of the input file the tokens came from.
        tokens: mapping ``token -> [score, weight]``.
        top_n: maximum number of lines to write (non-negative).  Defaults to
            3000, the limit that used to be hard-coded.

    When fewer than ``top_n`` tokens are available, all of them are written;
    indexing a fixed range here used to raise IndexError.
    """
    file_name = source_file.strip().split(os.sep)[-1]
    ranked = sorted(tokens.items(), key=lambda item: item[1][0], reverse=True)
    # The context manager closes the file even if a write fails.
    with open(out_dir + os.sep + file_name, 'w') as out:
        for token, values in ranked[:top_n]:
            out.write(token + '\t' + str(values[1]) + '\n')
############################################################################
# main module
############################################################################
if __name__ == "__main__":
    # Usage: python top_words.py dir_name result_dir
    #   dir_name   -- directory tree of tab-separated token files
    #   result_dir -- directory that receives one ranking file per input
    if len(sys.argv) != 3:
        print("Usage: python top_words.py dir_name result_dir")
        sys.exit(-1)
    source_dir, result_dir = sys.argv[1], sys.argv[2]
    if not os.path.isdir(source_dir):
        print("Invalid dir_name")
        sys.exit(-1)
    # Make sure the destination exists before the expensive scoring pass
    # rather than failing on the first open() afterwards.
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    fetch_all_file(source_dir)
    extract_feature(result_dir)