compute_idf.py
'''
Compute the Inverse Document Frequency (IDF) of Wikipedia
articles using the vocabulary defined in <vocab_path>.
'''
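# Overview of what the script below does: it streams a Wikipedia XML dump
# (prm.dump_path) line by line, splits each article into sections, tokenizes
# them with wordpunct_tokenize, and counts in how many documents each
# vocabulary word occurs. The IDF of a word w is
#   idf(w) = log(m / (1 + df(w)))
# where m is the number of documents and df(w) the document frequency; the
# values are then divided by the maximum IDF and pickled to prm.idf_path.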
import cPickle as pkl
import numpy as np
import random
import utils
from collections import OrderedDict
from nltk.tokenize import wordpunct_tokenize
import re
import parameters as prm
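# parameters.py is expected to provide vocab_path, n_words, dump_path and
# idf_path; all of them are referenced below.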
print 'loading vocabulary'
vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
textbegin = False
pagebegin = False
title = ''
text = ''
n = 0
f = open(prm.dump_path, "rb")
print 'creating IDF'
m = 0 # number of documents
df = {} # word-document frequency
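# pagebegin/textbegin track whether the current line is inside a <page> element
# and inside its <text xml:space="preserve"> payload, respectively; `sections`
# collects the text of the abstract and of each "== ... ==" section of a page.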
while True:
    line = f.readline()
    if (line == ''):
        break
    line = line.lower()
    if ("<page>" in line):
        pagebegin = True
        title = ""
        sections = []
    if ("</page>" in line):
        pagebegin = False
        title = ""
    if ("<title>" in line) and ("</title>" in line) and pagebegin:
        title = line.replace(" <title>","").replace("</title>\n","")
        if n % 100000 == 0:
            print n
        n += 1
        #if n > 1000000:
        #    break
    if line.strip()[:2] == "==" and line.strip()[-2:] == "==" and line.strip()[:3] != "===" and line.strip()[-3:] != "===": # another section begins...
        sections.append({"text": ""})
    if textbegin:
        if "[[category:" != line[:11]: # skip the categories
            line_clean = line.replace("</text>", "")
            text += line_clean
            sections[-1]["text"] += line_clean
if ("<text xml:space=\"preserve\">" in line) and pagebegin:
textbegin = True
line_clean = line.replace(" <text xml:space=\"preserve\">","")
sections.append({"text": line_clean}) #add a section, it will be the abstract
text = line_clean
if ("</text>" in line) and pagebegin:
textbegin = False
if "[[category:" != line[:11]: # skip the categories
text += line.replace("</text>","")
m += 1
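        # The whole article has now been read; tokenize each of its sections
        # and update the per-word document counts below.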
        words_sections = []
        # Clean each section's text before tokenizing
        for section in sections:
            text = section["text"]
            text = text.replace('\n', ' ')
            text = re.sub(r'\&lt\;ref.*?\&lt\;\/ref\&gt\;', '', text) # remove <ref>...</ref> markup (HTML-escaped inside the dump)
            text = re.sub(r'\&lt\;ref.*?\/\&gt\;', '', text) # remove self-closing <ref ... /> markup
            text = re.sub(r'\[\[file\:.*?\]\]', '', text)
            text = re.sub(r'\[\[image\:.*?\]\]', '', text)
            words = wordpunct_tokenize(text)
            words_sections += words
        # compute document frequency per word in the training set
        wb = dict.fromkeys(words_sections, 0) # remove duplicated words
        for w in wb.keys():
            if w in vocab: # key df by the word's vocabulary index so it matches the idf table below
                if vocab[w] not in df:
                    df[vocab[w]] = 0
                df[vocab[w]] += 1
f.close()

# compute inverse document frequency:
idf = dict.fromkeys(range(len(vocab)), 0) # initialize dict keyed by vocabulary index, all values equal to 0
for wi, fr in df.items():
    idf[wi] = np.log(float(m) / (1. + float(fr))) # log of (total number of documents / number of documents the word appears in); add 1 to the denominator to avoid division by zero
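
# Example with hypothetical numbers: with m = 1,000,000 documents, a word seen
# in 10 documents gets idf = log(1000000 / 11) ~= 11.4, while a word seen in
# 500,000 documents gets idf = log(1000000 / 500001) ~= 0.69, so rarer words
# score higher.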
# normalize by the maximum idf value
maxidf = np.asarray(idf.values()).max()
for wi, fr in idf.items():
    idf[wi] = fr / maxidf
with open(prm.idf_path, "wb") as f:
    pkl.dump(idf, f)

print 'done'
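
# A minimal sketch of how the pickled table can be read back (assuming the same
# Python 2 environment; 'network' is just an illustrative word):
#   vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
#   with open(prm.idf_path, "rb") as fin:
#       idf = pkl.load(fin)
#   print idf[vocab['network']] # normalized idf, close to 1.0 for rare words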