-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathquery_results.py
executable file
·155 lines (132 loc) · 5.39 KB
/
query_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
import argparse
import sqlite3
import collections
import map_reduce_utils as mru
from math import log
from create_db import * # just for config/locations
def main():
    """
    Command-line entry point.

    Parses the arguments, opens the sqlite3 database and then either
    ranks the corpus documents against a user-supplied file (--file) or
    prints the top entries of the precomputed 'similarities' table.
    The number of results is controlled by --top and the database path
    by --database.
    """
    description = '''
        given a sqlite3 database file produced using create_db.py,
        either display the top k most similar documents in the corpus
        or, if given a new file to analyze, calculate the k most
        similar documents to it in the original corpus. This designed
        to be run on smaller datasets just to gain insight/intuiton.
    '''
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    top_help = "the number of similar documents to display"
    parser.add_argument('--top', '-t', help=top_help, type=int, default=5)
    file_help = '''find the k most similar documents to file instead of the
                   entire corpus'''
    parser.add_argument('--file', '-f', dest='file', help=file_help)
    db_name_help = "the location of the sqlite database file to query"
    parser.add_argument('--database', '-d', dest='db_location',
                        help=db_name_help, default=DEFAULT_DB_LOCATION)
    args = parser.parse_args()

    # A sqlite3 connection used as a context manager only scopes a
    # transaction; it does not close the connection.  Everything done here
    # is a SELECT, so there is nothing to commit -- what matters is that
    # the handle is released, even if a query raises.
    conn = sqlite3.connect(args.db_location)
    try:
        db = conn.cursor()
        if args.file:
            compare_file(db, args.file, args.top)
        else:
            find_top_k(db, args.top)
    finally:
        conn.close()
def find_top_k(db, k):
    """
    Print the k highest-scoring rows of the 'similarities' table.

    db is a sqlite3 cursor and k the maximum number of rows to show.
    Rows are ordered by the 'similarity' column, largest first, and each
    one is printed as its first two columns followed by the third column
    formatted with ten decimal places.  Nothing is returned.
    """
    results = db.execute('''SELECT *
                            FROM similarities
                            ORDER BY similarity DESC
                            LIMIT ?;''', (k,))
    lines = ['{} {} {:.10f}'.format(row[0], row[1], row[2])
             for row in results.fetchall()]
    # A single-argument print(...) call behaves the same on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print('\n'.join(lines))
def compare_file(db, filename, k):
    """
    Print the k corpus documents that score highest against a new file.

    db is a sqlite3 cursor over a database holding a 'tfidf' table,
    filename is the path of the file to analyse and k is the number of
    results to print.  Each output line holds the repr of a document name
    and the repr of its score, separated by ':\\t'.  Nothing is returned.

    NOTE(review): the score is the plain dot product of the two tf-idf
    vectors.  That equals cosine similarity only if both vectors are
    unit-normalised, which is not done here -- confirm against how the
    'tfidf' table is produced.
    """
    with open(filename, 'r') as f:
        # presumably clean_text returns a list of word tokens (it is
        # iterated, measured with len() and turned into a set below)
        # -- verify against map_reduce_utils.
        contents = mru.clean_text(f.read())

    counts = collections.Counter(contents)
    # Built once and reused; it is needed for the document-frequency
    # lookup and again for every document in the scoring loop.
    vocabulary = set(contents)
    docs_containing = {w: num_docs_containing(db, w) for w in vocabulary}

    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {word: tfidf(count, doc_size, corp_size, docs_containing[word])
              for word, count in counts.items()}

    # now, calculate the similarity metric with each document in the
    # database.  The rows are fetched up front because the same cursor is
    # handed to get_tfidf inside the loop.
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    similarities = {}
    for row in documents:
        doc = row[0]
        similarities[doc] = sum(tfidfs[word] * get_tfidf(db, doc, word)
                                for word in vocabulary)

    top_k = collections.Counter(similarities).most_common(k)
    # Single-argument print(...) works identically on Python 2 and 3.
    print('\n'.join(':\t'.join([repr(i) for i in pair]) for pair in top_k))
def tfidf(n, N, D, m):
    """
    Return the tf-idf weight of a word within one document.

    n is the number of times the word occurs in the document, N the
    total length of the document, D the number of documents in the
    corpus and m the number of corpus documents containing the word.
    The result is (n / N) * log10(D / m), or 0.0 when n or D is zero.
    """
    # The counts come from the original corpus, so a word that only
    # occurs in the document being analysed has m == 0; treat it as
    # appearing in one document to keep the ratio defined.
    docs_with_word = m if m != 0 else 1
    if n != 0 and D != 0:
        term_frequency = float(n) / float(N)
        inverse_doc_frequency = log(float(D) / float(docs_with_word), 10)
        return term_frequency * inverse_doc_frequency
    return 0.0
def get_tfidf(db, doc, word):
    """
    Look up the stored tf-idf score of word in the document named doc.

    db is a sqlite3 cursor over a database with a 'tfidf' table.
    Returns the value of the 'tfidf' column for the matching row, or 0.0
    when the table has no row for that word/document pair.
    """
    query = '''SELECT tfidf
               FROM tfidf
               WHERE word = ?
               AND document = ?
               LIMIT 1'''
    rows = db.execute(query, (word, doc)).fetchall()
    # A missing pair contributes nothing to a similarity sum, hence 0.0.
    return rows[0][0] if rows else 0.0
def get_corpus_size(db):
    """
    Count the distinct documents recorded in the 'tfidf' table.

    db is a sqlite3 cursor; the count is returned as an int.
    """
    query = 'SELECT COUNT(distinct document) FROM tfidf;'
    rows = db.execute(query).fetchall()
    # An aggregate COUNT yields a single one-column row.
    first_row = rows[0]
    return int(first_row[0])
def num_docs_containing(db, word):
    """
    Count the distinct documents in the 'tfidf' table that contain word.

    db is a sqlite3 cursor; the count is returned as an int and is 0 when
    the word does not appear in the table at all.
    """
    # The filter must be on the 'word' column.  Matching the word against
    # 'document' would count documents *named* like the word instead of
    # documents containing it.
    result = db.execute('''SELECT COUNT(distinct document)
                           FROM tfidf
                           WHERE word = ?;
                        ''', (word,))
    return int(result.fetchall()[0][0])
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()