-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added create_db.py and query_results.py. create_db.py uses the output from mapred_tfidf.py to produce a sqlite3 database, which query_results.py can use to display the documents most similar to a new document, or the most similar documents in the original corpus.
- Loading branch information
Showing
2 changed files
with
270 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function

import argparse
import os
import re
import sqlite3
import sys
|
||
# matches the mapred output data files (part-00000, part-00001, ...) so we
# can skip the "Success"/CRC bookkeeping files that mapred also writes
IS_MAP_REDUCE_OUTPUT = re.compile('^part-.*$')

# default filename for the sqlite3 database file this script creates
DEFAULT_DB_LOCATION = 'tfidf.db'
|
||
|
||
def main(database_name, tfidf_location, similarities_location):
    """
    populates tables in a sqlite3 database named database_name containing
    the similarities and tfidf scores produced by mapred_tfidf.py. note that
    this is designed for smaller datasets, not the entire patent dataset.
    """
    connection = sqlite3.connect(database_name)
    with connection:
        cursor = connection.cursor()
        # both tables are deliberately denormalized to keep querying simple
        populate_tfidf(cursor, tfidf_location)
        populate_simimlarities(cursor, similarities_location)
        connection.commit()
|
||
|
||
def populate_tfidf(db, input_dir):
    """
    creates and populates a table named tfidf (word, document, tfidf) in
    the database referenced by the cursor db using the content of
    input_dir, which should contain the output from the tfidf mapper
    from mapred_tfidf.py
    """
    db.execute('CREATE TABLE tfidf (word text, document text, tfidf real);')
    sql = 'INSERT INTO tfidf VALUES (?, ?, ?);'
    for subdir, dirs, files in os.walk(input_dir):
        for filename in files:
            # ignore "Success" and CRC files from mapred
            if IS_MAP_REDUCE_OUTPUT.match(filename):
                # bug fix: join against subdir (the directory os.walk is
                # currently visiting), not input_dir, so files found in
                # nested directories resolve to their real path
                path = os.path.join(subdir, filename)
                # with-statement guarantees the file handle is closed
                with open(path, 'r') as part_file:
                    for line in part_file:
                        vals = line.strip().split()
                        db.execute(sql, (vals[0], vals[1], vals[2]))
|
||
|
||
def populate_simimlarities(db, input_dir):
    """
    creates and populates a table named similarities (doc1, doc2,
    similarity) in the database referenced by the cursor db using the
    content of input_dir, which should contain the output from the
    similarity reducer from mapred_tfidf.py

    NOTE(review): the function name is misspelled ("simimlarities") but is
    kept as-is because main() calls it by this name.
    """
    # Since as of right now, we're only going to use this to query
    # top k similar documents, it doesn't matter which order
    # the documents are in.
    db.execute('''CREATE TABLE similarities
                  (doc1 text, doc2 text, similarity real);''')
    sql = 'INSERT INTO similarities VALUES (?, ?, ?);'
    for subdir, dirs, files in os.walk(input_dir):
        for filename in files:
            # skip "Success"/CRC bookkeeping files from mapred
            if IS_MAP_REDUCE_OUTPUT.match(filename):
                # bug fix: join against subdir, not input_dir, so files in
                # nested directories resolve to their real path; the
                # with-statement guarantees the handle is closed
                with open(os.path.join(subdir, filename), 'r') as part_file:
                    for line in part_file:
                        vals = line.strip().split()
                        db.execute(sql, (vals[0], vals[1], vals[2]))
|
||
|
||
if __name__ == '__main__':
    description = ''' uses the output of mapred_tfidf to produce a sqlite3
                      database which can be used to query the most similar
                      documents in a corpus and compare a new text to the
                      documents already processed with mapred_tfidf.
                  '''

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    db_name_help = 'the name of the sqlite3 db file to create'
    parser.add_argument('--output', '-o', default=DEFAULT_DB_LOCATION,
                        dest='db_name', help=db_name_help)

    tfidf_location_help = 'the location of the tf-idf output from mapred_tfidf'
    parser.add_argument('--tfidf', '-t', dest='tfidf_loc',
                        default='tfidf', help=tfidf_location_help)

    sim_loc_help = 'the location of the similarities output from mapred_tfidf'
    parser.add_argument('--similarities', '-s', dest='similarities_location',
                        default='similarities', help=sim_loc_help)

    force_help = 'if provided, automatically overwrite db_name if it exists'
    parser.add_argument('--force', '-f', dest='force', default=False,
                        help=force_help, action='store_true')

    args = parser.parse_args()
    db_name = args.db_name
    tfidf_location = args.tfidf_loc
    similarities_location = args.similarities_location
    force = args.force

    # don't clobber the db location unless user asks to
    if os.path.exists(db_name):
        if not force:
            # raw_input was renamed to input in python 3; pick whichever
            # exists so the prompt works under both interpreters
            try:
                get_response = raw_input  # python 2
            except NameError:
                get_response = input  # python 3
            prompt = 'Overwrite db file {} [y/n]?'.format(db_name)
            response = get_response(prompt)
            if response not in ['y', 'yes', 'Y', 'Yes']:
                sys.exit()

        try:
            os.remove(db_name)
        except OSError as ose:
            # bug fix: os.stderr does not exist -- the stream lives in the
            # sys module. also abort here: main() would otherwise fail on
            # CREATE TABLE against the still-existing database file.
            err_msg = 'unable to rm db file. is it a dir? chmod?'
            print(err_msg, file=sys.stderr)
            sys.exit(1)

    main(db_name, tfidf_location, similarities_location)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import sqlite3 | ||
import collections | ||
import map_reduce_utils as mru | ||
from math import log | ||
from create_db import * # just for config/locations | ||
|
||
def main():
    """
    parses command-line arguments, connects to the database produced by
    create_db.py, and either prints the top k most similar corpus pairs
    or compares a new file against the corpus.
    """
    description = '''
                  given a sqlite3 database file produced using create_db.py,
                  either display the top k most similar documents in the corpus
                  or, if given a new file to analyze, calculate the k most
                  similar documents to it in the original corpus. This is
                  designed to be run on smaller datasets just to gain
                  insight/intuition.
                  '''
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    top_help = "the number of similar documents to display"
    parser.add_argument('--top', '-t', help=top_help, type=int, default=5)

    file_help = '''find the k most similar documents to file instead of the
                   entire corpus'''
    parser.add_argument('--file', '-f', dest='file', help=file_help)

    db_name_help = "the location of the sqlite database file to query"
    parser.add_argument('--database', '-d', dest='db_location',
                        help=db_name_help, default=DEFAULT_DB_LOCATION)

    args = parser.parse_args()
    with sqlite3.connect(args.db_location) as conn:
        db = conn.cursor()
        if args.file:
            compare_file(db, args.file, args.top)
        else:
            find_top_k(db, args.top)
        # the queries here are read-only, but committing is harmless and
        # mirrors the connection handling in create_db.py
        conn.commit()
|
||
|
||
def find_top_k(db, k):
    """
    prints the top k most similar document pairs from the 'similarities'
    table of the database referenced by the sqlite3 cursor db, one
    "doc1 doc2 similarity" line per pair, highest similarity first
    """
    results = db.execute('''SELECT *
                            FROM similarities
                            ORDER BY similarity DESC
                            LIMIT ?;''', (k,))
    # bug fix: the python 2 print statement is a SyntaxError under python 3;
    # a single-argument print() call behaves identically on both
    print('\n'.join('{} {} {:.10f}'.format(doc1, doc2, similarity)
                    for doc1, doc2, similarity in results.fetchall()))
|
||
|
||
def compare_file(db, filename, k):
    """
    finds the k most similar documents to filename in the database referenced
    by the sqlite3 cursor db and prints them along with the cosine similarity
    metric between filename and each of the k documents
    """
    with open(filename, 'r') as f:
        contents = f.read()
    # assumes mru.clean_text returns an iterable of word tokens -- the code
    # below counts and set()s it -- TODO confirm against map_reduce_utils
    contents = mru.clean_text(contents)
    # term frequency of each word in the new document
    counts = collections.Counter(contents)
    unique_words = set(contents)
    docs_containing = {w: num_docs_containing(db, w) for w in unique_words}

    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {word: tfidf(count, doc_size, corp_size, docs_containing[word])
              for word, count in counts.items()}

    # now, calculate the similarity metric with each document in the database
    similarities = {}
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    for (doc,) in documents:
        # dot product of the two tf-idf vectors over this document's words
        similarities[doc] = sum(tfidfs[word] * get_tfidf(db, doc, word)
                                for word in unique_words)
    top_k = collections.Counter(similarities).most_common(k)
    # bug fix: python 2 print statement is a SyntaxError under python 3;
    # a single-argument print() call behaves identically on both
    print('\n'.join(':\t'.join(repr(i) for i in pair) for pair in top_k))
|
||
|
||
def tfidf(n, N, D, m):
    """
    returns the tf-idf score for a word given n, its count in the document
    being analyzed, the document length N, the corpus size D, and m, the
    number of corpus documents containing the word
    """
    # Since we're using counts from the original corpus, it could be that
    # this word is unique to the document we are analyzing now, in which
    # case we need to do some (sort of) smoothing so idf stays finite
    if m == 0:
        m = 1

    # a word that never occurs, or an empty corpus, contributes nothing
    if n == 0 or D == 0:
        return 0.0

    term_frequency = float(n) / float(N)
    inverse_document_frequency = log(float(D) / float(m), 10)
    return term_frequency * inverse_document_frequency
|
||
|
||
def get_tfidf(db, doc, word):
    """
    returns the tfidf score of word in the document named doc in the
    database referenced by the sqlite3 cursor db, or 0.0 if there is
    no row for that (word, document) pair
    """
    query = '''SELECT tfidf
               FROM tfidf
               WHERE word = ?
               AND document = ?
               LIMIT 1'''
    rows = db.execute(query, (word, doc)).fetchall()
    # no row means the word never appeared in doc; contribute 0 so that
    # nothing is added to the similarity for this word
    return rows[0][0] if rows else 0.0
|
||
|
||
def get_corpus_size(db):
    """
    returns the number of unique documents in the 'tfidf' table in
    the database referenced by the sqlite3 cursor db
    """
    row = db.execute('SELECT COUNT(distinct document) FROM tfidf;').fetchall()[0]
    return int(row[0])
|
||
|
||
def num_docs_containing(db, word):
    """
    returns the number of documents which contain word by querying
    the 'tfidf' table in the database referenced by the sqlite3
    cursor db
    """
    # bug fix: the original filtered on "document = ?" while binding the
    # word, so it counted documents *named* like the word (almost always
    # 0) instead of documents containing the word
    result = db.execute('''SELECT COUNT(distinct document)
                           FROM tfidf
                           WHERE word = ?;
                        ''', (word,))
    return int(result.fetchall()[0][0])
|
||
|
||
if __name__ == '__main__': | ||
main() |