-
Notifications
You must be signed in to change notification settings - Fork 1
/
Wikitriever.py
executable file
·86 lines (71 loc) · 3.35 KB
/
Wikitriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python
import sys
sys.dont_write_bytecode = True
import math
from TermSiphon import passArgsCheck, getTextFile
from termExtraction import extract_term_list
import wikiExtraction
def get_sorted_article_titles(txtFilePath):
print "\nTERM EXTRACTION:"
terms_counts_dict, terms_weights_dict = \
extract_term_list(txtFilePath,verbose=True,method='wiki',\
returnType='dictionary')
search_terms_list = sorted(terms_weights_dict, \
key=terms_weights_dict.get, \
reverse = True)
print
print "WIKIPEDIA ARTICLE RETRIEVAL:"
print "searching for articles..."
# dict of term, results(list)
retrieved_wiki_titles_dict = \
wikiExtraction.get_wiki_titles(search_terms_list)
print "sorting articles by relevance weighting..."
all_results = {}
for term in search_terms_list:
if term in retrieved_wiki_titles_dict:
results = retrieved_wiki_titles_dict[term]
num_results = len(results)
for title in results:
weight = math.log(num_results - results.index(title)+1)*(math.log(terms_counts_dict[term]+1))
if title in all_results:
all_results[title] = all_results[title] + weight
else:
all_results[title] = weight
relevance_weighting_list = sorted(all_results, key = all_results.get, reverse = True)
print "sorting articles by cosine similarity..."
cosine_similarity_list = wikiExtraction.vector_sort(relevance_weighting_list,terms_weights_dict)
return relevance_weighting_list, cosine_similarity_list
if __name__ == '__main__':
if not passArgsCheck(sys.argv):
print "Arguments should be a path to a .txt or .pdf file and\noptionally an integer number of Wikipedia article titles to be returned. \nFor example:\n./Wikitriever path/to/file.pdf 15"
sys.exit()
if len(sys.argv) > 2:
returnSize = int(sys.argv[2])
else:
returnSize = 10
path = sys.argv[1]
txtFilePath = getTextFile(path)
relevance_weighting_list, cosine_similarity_list = \
get_sorted_article_titles(txtFilePath)
maxTermStringLength = max(len(title) for title in relevance_weighting_list[:returnSize]+cosine_similarity_list[:returnSize])
print "\n\n"+'-'*(maxTermStringLength*2+1)
print ' '*(max(maxTermStringLength-15,0))+"Extracted Wikipedia Articles"
print '-'*(maxTermStringLength*2+1)
maxTermStringLength = max([maxTermStringLength,len("Relevance Weighting"),\
len("Cosine Similarity")])
print "Relevance Weighting".ljust(maxTermStringLength), "| Cosine Similarity".ljust(maxTermStringLength)
print '-'*maxTermStringLength, "| "+'-'*(maxTermStringLength-2)
rwLength = len(relevance_weighting_list)
csLength = len(cosine_similarity_list)
for i in xrange(0,returnSize):
if i < rwLength or i < csLength:
if i < rwLength:
print relevance_weighting_list[i].ljust(maxTermStringLength),
else:
print ' '*maxTermStringLength,
if i < csLength:
print "| "+ cosine_similarity_list[i].ljust(maxTermStringLength)
else:
print "|"
print '-'*(maxTermStringLength*2+1)
print