Commit

Merge branch 'master' of https://github.com/uwsampa/nltk-hadoop
zbsimon committed May 24, 2015
2 parents cf99cc2 + a06131c commit 7964e30
Showing 44 changed files with 1,783 additions and 269 deletions.
11 changes: 11 additions & 0 deletions .travis.yml
@@ -0,0 +1,11 @@
language: python
python:
- "2.7"
virtualenv:
system_site_packages: true
before_install:
- sudo apt-get install -qq python-numpy python-scipy
install:
- "pip install nltk"
- "pip install scikit-learn"
script: nosetests
28 changes: 27 additions & 1 deletion README.md
@@ -1,3 +1,9 @@
[![Build Status](https://travis-ci.org/uwsampa/nltk-hadoop.svg?branch=master)](https://travis-ci.org/uwsampa/nltk-hadoop)

# Setup

- [install scikit-learn and its dependencies](http://scikit-learn.org/stable/install.html)

# Materialize an nltk corpus

- find a corpus at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml, e.g. `inaugural`
@@ -35,6 +41,26 @@ ls $OUTPUT_DIR/part-*
```

See the tfidf metrics for each document/word pair:
-```
+```sh
ls $tfidf/part-*
```

# Run the test suite

with `nose` installed,
```sh
nosetests
```


# The TFIDF Metric
After cleaning and stemming a document, we obtain a list of words, `d`, for that document. The tfidf score of a word `w` in `d` is defined as follows:
* let `n` be the number of times `w` appears in `d`
* let `N` be the length of `d`
* let `D` be the number of documents in the corpus
* let `m` be the number of documents in which the word `w` appears at least once
* `tf = n / N` (tf is the 'term frequency' of the word)
* `idf = log(D / m)` (idf is the 'inverse document frequency' of the word)
* `tfidf = tf*idf`

These naming conventions are used in certain places in the codebase, for example in the docstrings for many mapper and reducer functions.
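
As a quick sanity check of the definitions above, here is a minimal, self-contained Python sketch of the same computation. It is not part of the repository (the pipeline computes `n`, `N`, `D`, and `m` incrementally across the mappers and reducers below); the `tfidf` helper and the toy corpus are invented for illustration.

```python
from __future__ import division, print_function
from math import log


def tfidf(w, d, corpus):
    """Return the tfidf score of word w in document d.

    d is a cleaned, stemmed word list; corpus is a list of such lists.
    """
    n = d.count(w)                            # times w appears in d
    N = len(d)                                # length of d
    D = len(corpus)                           # number of documents in the corpus
    m = sum(1 for doc in corpus if w in doc)  # documents containing w
    tf = n / N
    idf = log(D / m)
    return tf * idf


# toy corpus of three already-cleaned documents (invented for this example)
corpus = [['citizens', 'of', 'the', 'union'],
          ['the', 'union', 'endures'],
          ['citizens', 'vote']]
print(tfidf('union', corpus[0], corpus))      # tf = 1/4, idf = log(3/2)
```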
30 changes: 19 additions & 11 deletions contents_mapper.py
@@ -1,19 +1,27 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import os
 import sys
 from map_reduce_utils import clean_text
 
-"""
-(file_contents) --> (file_name) (file_contents)
-
-for each line from stdin consisting of a document in the corpus, emits
-a key-value pair to stdout with a key of the corresponding filename
-and a value of the file contents cleaned with
-map_reduce_utils.clean_text
-"""
-
-for line in sys.stdin:
-    docname = os.environ['mapreduce_map_input_file']
-    contents = clean_text(line)
-    print docname, '\t', ' '.join(map(str, contents))
+
+def map_contents(input=sys.stdin, output=sys.stdout):
+    """
+    (file_contents) --> (file_name) (file_contents)
+
+    for each line from stdin consisting of a document in the corpus, emits
+    a key-value pair to stdout with a key of the corresponding filename
+    and a value of the file contents cleaned with
+    map_reduce_utils.clean_text
+    """
+    template = '{}\t{}'
+    for line in input:
+        docname = os.environ['mapreduce_map_input_file']
+        contents = clean_text(line)
+        result = template.format(docname, ' '.join(map(str, contents)))
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_contents()
27 changes: 17 additions & 10 deletions corp_freq_map.py
@@ -1,16 +1,23 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
 
-"""
-(word file_name) (n N) --> (word) (file_name n N 1)
-
-emits a line for each unique word in each file to be consumed
-by corp_freq_red to find the number of occurrences of each
-unique word throughout the entire corpus.
-"""
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word, docname = key.strip().split()
-    print '%s\t%s %s %s' % (word, docname, value, 1)
+
+def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
+    """
+    (word file_name) (n N) --> (word) (file_name n N 1)
+
+    emits a line for each unique word in each file to be consumed
+    by corp_freq_red to find the number of occurrences of each
+    unique word throughout the entire corpus.
+    """
+    for line in input:
+        key, value = line.strip().split('\t')
+        word, docname = key.strip().split()
+        result = '{0}\t{1} {2} {3}'.format(word, docname, value, 1)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_corpus_frequency()
75 changes: 35 additions & 40 deletions corp_freq_red.py
@@ -1,44 +1,39 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
+from map_reduce_utils import reducer_stream
 
-"""
-(word) (file_name n N 1) --> (word file_name) (n N m)
-
-sums up the number of occurrences of each unique word throughout
-the corpus and emits this sum for each document that the word
-occurs in.
-"""
-
-
-def print_results(count, files):
-    for string in files:
-        print '%s %s' % (string, count)
-
-processed_files = []
-cur_word = None
-cur_count = 0
-
-word = None
-
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word = key.strip()
-    docname, word_count, doc_count, count = value.strip().split()
-    count = int(count)
-    # add document/word combo to processed files
-    processed_combo = '%s %s\t%s %s' % (word, docname, word_count, doc_count)
-    if cur_word == word:
-        cur_count += count
-        processed_files.append(processed_combo)
-    else:
-        if cur_word is not None:
-            print_results(cur_count, processed_files)
-        cur_word = word
-        cur_count = count
-        processed_files = []
-        processed_files.append(processed_combo)
-
-if cur_word is not None:
-    print_results(cur_count, processed_files)
+
+KEYS = ['word']
+VALUES = ['filename', 'freq', 'size', 'count']
+
+
+def reduce_corpus_frequency(input=reducer_stream(KEYS, VALUES),
+                            output=sys.stdout):
+    """
+    (word) (file_name n N 1) --> (word file_name) (n N m)
+
+    sums up the number of occurrences of each unique word throughout
+    the corpus and emits this sum for each document that the word
+    occurs in.
+    """
+    for key, key_stream in input:
+        count = 0
+        values = []
+        for value in key_stream:
+            count += int(value['count'])
+            values.append(value)
+        print_results(values, key['word'], count, output)
+
+
+def print_results(values, word, count, output):
+    template = '{0} {1}\t{2} {3} {4}'
+    for value in values:
+        result = template.format(word, value['filename'],
+                                 value['freq'], value['size'], count)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    reduce_corpus_frequency()
37 changes: 23 additions & 14 deletions cos_sim_map.py
@@ -1,21 +1,30 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
 
-"""
-(word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
-
-for each word common to two documents, removes the word from the
-key/value pair and replaces it with the two filenames so that we can
-sum up the values for each pair of documents in the reducer.
-"""
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2, product = value.strip().split()
-
-    # we want to ensure that (doc1 doc2) and (doc2 doc1) get
-    # sent to the same reducer, so we order them alphabetically
-    if doc1 > doc2:
-        doc1, doc2 = doc2, doc1
-
-    print '%s %s\t%s' % (doc1, doc2, product)
+
+def map_cosine_similarity(input=sys.stdin, output=sys.stdout):
+    """
+    (word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
+
+    for each word common to two documents, removes the word from the
+    key/value pair and replaces it with the two filenames so that we can
+    sum up the values for each pair of documents in the reducer.
+    """
+    template = '{} {}\t{}'
+    for line in input:
+        key, value = line.strip().split('\t')
+        doc1, doc2, product = value.strip().split()
+
+        # we want to ensure that (doc1 doc2) and (doc2 doc1) get
+        # sent to the same reducer, so we order them alphabetically
+        if doc1 > doc2:
+            doc1, doc2 = doc2, doc1
+
+        result = template.format(doc1, doc2, product)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_cosine_similarity()
53 changes: 31 additions & 22 deletions cos_sim_red.py
@@ -1,31 +1,40 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
+import argparse
+from map_reduce_utils import reducer_stream
 
-"""
-(file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
-
-sums up the products of the tfidf values of words common between every
-pair of documents to produce the cosine similarity of the two documents
-"""
-
-cur_sum = 0
-cur_docs = (None, None)  # will become (doc1, doc2)
-
-
-def print_result(doc1, doc2, sum_for_docs):
-    print '%s %s\t%.16f' % (doc1, doc2, sum_for_docs)
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2 = key.strip().split()
-    product = float(value)
-    if (doc1, doc2) == cur_docs:
-        cur_sum += product
-    else:
-        if cur_docs[0] is not None and cur_docs[1] is not None:
-            print_result(cur_docs[0], cur_docs[1], cur_sum)
-        cur_docs = (doc1, doc2)
-        cur_sum = 0
-
-if cur_docs[0] is not None and cur_docs[1] is not None:
-    print_result(cur_docs[0], cur_docs[1], cur_sum)
+
+KEYS = ['file1', 'file2']
+VALUES = ['term']
+
+
+def reduce_cosine_similarity(precision,
+                             input=reducer_stream(KEYS, VALUES),
+                             output=sys.stdout):
+    """
+    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
+
+    sums up the products of the tfidf values of words common between every
+    pair of documents to produce the cosine similarity of the two documents
+    """
+    for key, key_stream in input:
+        sum_for_docs = 0
+        for value in key_stream:
+            term = value['term']
+            sum_for_docs += float(term)
+        print_result(key['file1'], key['file2'],
+                     sum_for_docs, precision, output)
+
+
+def print_result(doc1, doc2, sum_for_docs, precision, output):
+    template = '{0} {1}\t{2:.{3}f}'
+    print(template.format(doc1, doc2, sum_for_docs, precision), file=output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--precision', '-p', dest='precision')
+    precision = int(parser.parse_args().precision)
+    reduce_cosine_similarity(precision)