Skip to content

Commit

Permalink
added scripts to write/query db
Browse files Browse the repository at this point in the history
added create_db.py and query_results.py

create_db.py uses the output from mapred_tfidf.py to produce a sqlite3
database which query_results.py can use to display documents most
similar to a new document, or the most similar documents in the original
corpus
  • Loading branch information
zbsimon committed Jan 15, 2015
1 parent fd85fc5 commit 9c1441c
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 0 deletions.
115 changes: 115 additions & 0 deletions create_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python

from __future__ import print_function
import argparse
import os
import re
import sqlite3
import sys

IS_MAP_REDUCE_OUTPUT = re.compile('^part-.*$')

DEFAULT_DB_LOCATION = 'tfidf.db'


def main(database_name, tfidf_location, similarities_location):
    """
    populates tables in a sqlite3 database named database_name containing
    the similarities and tfidf scores produced by mapred_tfidf.py. note that
    this is designed for smaller datasets, not the entire patent dataset.
    """
    conn = sqlite3.connect(database_name)
    try:
        db = conn.cursor()
        # yay denormalization: one flat table per mapred output keeps the
        # queries in query_results.py trivial
        populate_tfidf(db, tfidf_location)
        populate_simimlarities(db, similarities_location)
        conn.commit()
    finally:
        # sqlite3's `with connection` only commits/rolls back -- it does
        # NOT close the connection, so close it explicitly here
        conn.close()


def populate_tfidf(db, input_dir):
    """
    creates and populates a table named tfidf (word, document, tfidf)
    in the database referenced by the cursor db using the content of
    input_dir, which should contain the output from the tfidf mapper
    from mapred_tfidf.py
    """
    db.execute('CREATE TABLE tfidf (word text, document text, tfidf real);')
    sql = 'INSERT INTO tfidf VALUES (?, ?, ?);'
    for subdir, dirs, files in os.walk(input_dir):
        for filename in files:
            # ignore "_SUCCESS" and CRC files from mapred; only the
            # part-* files hold records
            if not filename.startswith('part-'):
                continue
            # join against subdir (not input_dir) so records in nested
            # directories are found at their real path
            path = os.path.join(subdir, filename)
            with open(path, 'r') as part_file:
                for line in part_file:
                    vals = line.strip().split()
                    db.execute(sql, (vals[0], vals[1], vals[2]))


def populate_simimlarities(db, input_dir):
    """
    creates and populates a table named similarities (doc1, doc2,
    similarity) in the database referenced by the cursor db using the
    content of input_dir, which should contain the output from the
    similarity reducer from mapred_tfidf.py
    """
    # Since as of right now, we're only going to use this to query
    # top k similar documents, it doesn't matter which order
    # the documents are in.
    db.execute('''CREATE TABLE similarities
                  (doc1 text, doc2 text, similarity real);''')
    sql = 'INSERT INTO similarities VALUES (?, ?, ?);'
    for subdir, dirs, files in os.walk(input_dir):
        for filename in files:
            # only part-* files hold records; skip _SUCCESS/CRC files
            if not filename.startswith('part-'):
                continue
            # join against subdir (not input_dir) so records in nested
            # directories are found at their real path
            path = os.path.join(subdir, filename)
            with open(path, 'r') as part_file:
                for line in part_file:
                    vals = line.strip().split()
                    db.execute(sql, (vals[0], vals[1], vals[2]))


if __name__ == '__main__':
    description = ''' uses the output of mapred_tfidf to produce a sqlite3
                      database which can be used to query the most similar
                      documents in a corpus and compare a new text to the
                      documents already processed with mapred_tfidf.
                  '''

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    db_name_help = 'the name of the sqlite3 db file to create'
    parser.add_argument('--output', '-o', default=DEFAULT_DB_LOCATION,
                        dest='db_name', help=db_name_help)

    tfidf_location_help = 'the location of the tf-idf output from mapred_tfidf'
    parser.add_argument('--tfidf', '-t', dest='tfidf_loc',
                        default='tfidf', help=tfidf_location_help)

    sim_loc_help = 'the location of the similarities output from mapred_tfidf'
    parser.add_argument('--similarities', '-s', dest='similarities_location',
                        default='similarities', help=sim_loc_help)

    force_help = 'if provided, automatically overwrite db_name if it exists'
    parser.add_argument('--force', '-f', dest='force', default=False,
                        help=force_help, action='store_true')

    args = parser.parse_args()
    db_name = args.db_name
    tfidf_location = args.tfidf_loc
    similarities_location = args.similarities_location
    force = args.force

    # don't clobber the db location unless user asks to
    if os.path.exists(db_name):
        if not force:
            # raw_input was renamed to input in python 3
            try:
                prompt_user = raw_input
            except NameError:
                prompt_user = input
            prompt = 'Overwrite db file {} [y/n]?'.format(db_name)
            response = prompt_user(prompt)
            if response.strip().lower() not in ('y', 'yes'):
                exit()

        try:
            os.remove(db_name)
        except OSError:
            # the error stream lives in sys, not os. if the stale db
            # can't be removed, the CREATE TABLEs below would fail
            # anyway, so bail out now instead of crashing later.
            err_msg = 'unable to rm db file. is it a dir? chmod?'
            print(err_msg, file=sys.stderr)
            exit(1)

    main(db_name, tfidf_location, similarities_location)
155 changes: 155 additions & 0 deletions query_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env python

import argparse
import sqlite3
import collections
import map_reduce_utils as mru
from math import log
from create_db import * # just for config/locations

def main():
    """
    parses command line arguments, then either prints the top k most
    similar document pairs in the corpus or, given a new file, ranks
    every document in the database produced by create_db.py against it.
    """
    description = '''
                  given a sqlite3 database file produced using create_db.py,
                  either display the top k most similar documents in the corpus
                  or, if given a new file to analyze, calculate the k most
                  similar documents to it in the original corpus. This is
                  designed to be run on smaller datasets just to gain
                  insight/intuition.
                  '''
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    top_help = "the number of similar documents to display"
    parser.add_argument('--top', '-t', help=top_help, type=int, default=5)

    file_help = '''find the k most similar documents to file instead of the
                   entire corpus'''
    parser.add_argument('--file', '-f', dest='file', help=file_help)

    db_name_help = "the location of the sqlite database file to query"
    parser.add_argument('--database', '-d', dest='db_location',
                        help=db_name_help, default=DEFAULT_DB_LOCATION)

    args = parser.parse_args()
    conn = sqlite3.connect(args.db_location)
    try:
        db = conn.cursor()
        if args.file:
            compare_file(db, args.file, args.top)
        else:
            find_top_k(db, args.top)
        conn.commit()
    finally:
        # sqlite3's `with connection` only commits/rolls back -- it does
        # NOT close the connection, so close it explicitly here
        conn.close()


def find_top_k(db, k):
    """
    prints the top k most similar document pairs in the 'similarities'
    table of the database referenced by the sqlite3 cursor db, most
    similar first, one 'doc1 doc2 similarity' line each
    """
    results = db.execute('''SELECT *
                            FROM similarities
                            ORDER BY similarity DESC
                            LIMIT ?;''', (k,))
    # use the print() function (not the python-2-only print statement)
    # so the module stays importable under python 3 as well; create_db.py
    # already imports print_function for the same reason
    lines = ['{} {} {:.10f}'.format(doc1, doc2, sim)
             for doc1, doc2, sim in results.fetchall()]
    print('\n'.join(lines))


def compare_file(db, filename, k):
    """
    finds the k most similar documents to filename in the database referenced
    by the sqlite3 cursor db and prints them along with the cosine similarity
    metric between filename and each of the k documents
    """
    with open(filename, 'r') as f:
        contents = f.read()
    contents = mru.clean_text(contents)
    # term frequencies of the new document; clean_text presumably yields
    # an iterable of word tokens -- TODO confirm in map_reduce_utils
    counts = collections.Counter(contents)
    unique_words = set(contents)
    docs_containing = {w: num_docs_containing(db, w) for w in unique_words}

    # we're going to use the number of documents in the original
    # corpus to calculate tfidf, not including the file we are now
    # analyzing, since the tfidf scores we have in the database were
    # calculated with this number
    corp_size = get_corpus_size(db)
    doc_size = len(contents)
    tfidfs = {word: tfidf(count, doc_size, corp_size, docs_containing[word])
              for word, count in counts.items()}

    # now, calculate the similarity metric with each document in the database
    similarities = {}
    documents = db.execute('SELECT DISTINCT document FROM tfidf;').fetchall()
    for (doc,) in documents:
        # dot product of the two tfidf vectors over this file's words;
        # get_tfidf returns 0.0 for words absent from doc
        similarities[doc] = sum(tfidfs[word] * get_tfidf(db, doc, word)
                                for word in unique_words)
    top_k = collections.Counter(similarities).most_common(k)
    # print() keeps this runnable under python 3 as well; the original
    # python 2 print statement is a syntax error there
    print('\n'.join(':\t'.join(repr(i) for i in x) for x in top_k))


def tfidf(n, N, D, m):
    """
    given the document frequency n, the document length N, corpus size D
    and corpus frequency m, returns the tfidf score for this word and document
    """
    # a word unique to the document being analyzed never appeared in the
    # original corpus (m == 0), so smooth it up to 1 to keep the log finite
    if m == 0:
        m = 1

    # no occurrences or an empty corpus contributes nothing
    if n == 0 or D == 0:
        return 0.0

    term_frequency = float(n) / float(N)
    inverse_document_frequency = log(float(D) / float(m), 10)
    return term_frequency * inverse_document_frequency


def get_tfidf(db, doc, word):
    """
    returns the tfidf score of word in the document named doc in the
    database referenced by the sqlite3 cursor db, or 0.0 when no row
    exists for that (word, document) pair
    """
    query = '''SELECT tfidf
               FROM tfidf
               WHERE word = ?
               AND document = ?
               LIMIT 1'''
    row = db.execute(query, (word, doc)).fetchone()
    # a missing pair simply contributes nothing to the similarity sum
    return 0.0 if row is None else row[0]


def get_corpus_size(db):
    """
    returns the number of unique documents in the 'tfidf' table in
    the database referenced by the sqlite3 cursor db
    """
    query = 'SELECT COUNT(distinct document) FROM tfidf;'
    (count,) = db.execute(query).fetchone()
    return int(count)


def num_docs_containing(db, word):
    """
    returns the number of documents which contain word by querying
    the 'tfidf' table in the database referenced by the sqlite3
    cursor db
    """
    # filter on the word column; the original compared the document name
    # against word (WHERE document = ?), which always returned 0 and made
    # every unseen word look unique to the file being analyzed
    result = db.execute('''SELECT COUNT(distinct document)
                           FROM tfidf
                           WHERE word = ?;
                           ''', (word,))
    return int(result.fetchall()[0][0])


# entry point when run as a script (no-op when imported, e.g. by tests)
if __name__ == '__main__':
    main()

0 comments on commit 9c1441c

Please sign in to comment.