-
Notifications
You must be signed in to change notification settings - Fork 1
/
tf_idf.py
executable file
·83 lines (75 loc) · 2.69 KB
/
tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
#coding=utf-8
from __future__ import division
import os
import sys
import math
# Feature table shared by the functions below: maps each feature word to
# [1-based sequence number, math.log of the value in its second column].
# Filled in place by feature_word_init().
feature_word = dict()
#############################################################################
# init feature word set
#############################################################################
def feature_word_init(feature_word_file):
    """Load the feature-word table from a tab-separated file.

    Every non-blank line is expected to hold a word and a positive number
    separated by a tab.  The word is stored in the module-level
    ``feature_word`` dict as ``[sequence, math.log(number)]``, where
    ``sequence`` is the 1-based position of the entry among the non-blank
    lines.  A word that appears more than once keeps its last entry.

    Args:
        feature_word_file: Path of the feature-word file to read.

    Raises:
        IOError: If the file cannot be opened.
        IndexError: If a non-blank line has no second tab-separated column.
        ValueError: If the second column is not a number or is not positive.
    """
    global feature_word
    sequence = 0
    # ``with`` guarantees the file is closed even when a malformed line raises.
    with open(feature_word_file, 'r') as f:
        for line in f:
            line = line.strip().rstrip(os.sep)
            if not line:
                # Blank lines (typically a trailing empty line) carry no
                # entry and must not consume a sequence number.
                continue
            fields = line.split('\t')
            sequence += 1
            feature_word[fields[0]] = [sequence, math.log(float(fields[1]))]
############################################################################
# compute per-document term frequencies and write the weighted feature vector
############################################################################
def statistic_frequence(file_name, out):
    """Write one document's weighted feature vector as a single line of ``out``.

    The document is read as space-separated tokens; for each token only the
    part before the first ``/`` is used as the word.  Words found in the
    module-level ``feature_word`` table are counted, and the emitted line is::

        <label> <index>:<weight> <index>:<weight> ...

    where ``label`` is the last two characters of the name of the directory
    containing the file, ``index`` is the word's sequence number from
    ``feature_word`` (ascending), and ``weight`` is
    ``(occurrences / total tokens) * feature_word[word][1]``.

    Args:
        file_name: Path of the document to process.
        out: Writable text file-like object that receives the line.

    Raises:
        IOError: If the document cannot be opened.
    """
    global feature_word
    doc_term = {}
    total_term = 0
    # ``with`` closes the document even if reading fails part-way.
    with open(file_name, 'r') as f:
        for line in f:
            for token in line.strip().rstrip(os.sep).split(' '):
                token = token.strip()
                if not token:
                    # Blank lines and runs of spaces yield empty tokens;
                    # they are not terms and must not dilute the frequency.
                    continue
                total_term += 1
                word = token.split('/')[0].strip()
                if word in feature_word:
                    doc_term[word] = doc_term.get(word, 0) + 1

    # Resolve the parent directory through os.path so that a bare file name
    # (no directory component) or a doubled separator cannot break the label.
    parent_dir = os.path.dirname(os.path.abspath(file_name.strip()))
    label = os.path.basename(parent_dir)

    fields = [label[-2:]]
    # Only words present in this document are emitted, so sort just those by
    # their sequence number instead of sorting the whole feature table.
    for word in sorted(doc_term, key=lambda w: feature_word[w][0]):
        entry = feature_word[word]
        # True division: the file enables it via ``from __future__ import
        # division`` on Python 2, and it is the default on Python 3.
        tf = doc_term[word] / total_term
        fields.append(str(entry[0]) + ':' + str(tf * entry[1]))
    out.write(' '.join(fields) + '\n')
############################################################################
# process all files under a directory recursively
############################################################################
def fetch_all_file(dir_name, out):
    """Recursively process every file found below ``dir_name``.

    Sub-directories are descended into; every other entry is handed to
    ``statistic_frequence`` together with ``out``.

    Args:
        dir_name: Directory to walk.
        out: Writable file-like object passed through to
            ``statistic_frequence``.

    Raises:
        OSError: If a directory cannot be listed.
    """
    # Sorted so the order of the output lines does not depend on the
    # arbitrary order in which os.listdir() returns entries.
    for entry in sorted(os.listdir(dir_name)):
        # os.path.join avoids the doubled separator (e.g. 'dir//file') that
        # plain concatenation produces when dir_name already ends with one,
        # keeping the paths handed to statistic_frequence() well-formed.
        path = os.path.join(dir_name, entry)
        if os.path.isdir(path):
            fetch_all_file(path, out)
        else:
            statistic_frequence(path, out)
############################################################################
# main module
############################################################################
def main(argv=None):
    """Run the command-line tool.

    Expects ``argv`` to be ``[program, dir_name, result_file_name,
    feature_word_file]``.  Loads the feature-word table, then writes one
    feature-vector line per document to the result file.  ``dir_name`` may be
    a directory (processed recursively) or a single document.

    Args:
        argv: Argument list; defaults to ``sys.argv``.

    Returns:
        0 on success, -1 when the argument count is wrong (after printing the
        usage message).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 4:
        print("Usage: python tf_idf.py dir_name result_file_name feature_word_file")
        return -1
    feature_word_init(argv[3])
    # ``with`` flushes and closes the result file even if processing raises.
    with open(argv[2], 'w') as out:
        if os.path.isdir(argv[1]):
            fetch_all_file(argv[1], out)
        else:
            statistic_frequence(argv[1], out)
    return 0


if __name__ == "__main__":
    sys.exit(main())