Commit

Merge branch 'master' of https://github.com/uwsampa/nltk-hadoop
zbsimon committed May 24, 2015
2 parents cf99cc2 + a06131c commit 7964e30
Showing 44 changed files with 1,783 additions and 269 deletions.
11 changes: 11 additions & 0 deletions .travis.yml
@@ -0,0 +1,11 @@
language: python
python:
- "2.7"
virtualenv:
system_site_packages: true
before_install:
- sudo apt-get install -qq python-numpy python-scipy
install:
- "pip install nltk"
- "pip install scikit-learn"
script: nosetests
28 changes: 27 additions & 1 deletion README.md
@@ -1,3 +1,9 @@
[![Build Status](https://travis-ci.org/uwsampa/nltk-hadoop.svg?branch=master)](https://travis-ci.org/uwsampa/nltk-hadoop)

# Setup

- [install scikit-learn and its dependencies](http://scikit-learn.org/stable/install.html)

# Materialize an nltk corpus

- find a corpus at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml, e.g. `inaugural`
@@ -35,6 +41,26 @@ ls $OUTPUT_DIR/part-*
```

See the tfidf metrics for each document/word pair:
-```
+```sh
ls $tfidf/part-*
```

# Run the test suite

with `nose` installed,
```sh
nosetests
```


# The TFIDF Metric
After cleaning and stemming a document, we obtain a list of words, `d`, for that document. The tfidf score of a word `w` in `d` is defined as follows:
* let `n` be the number of times `w` appears in `d`
* let `N` be the length of `d`
* let `D` be the number of documents in the corpus
* let `m` be the number of documents in which the word `w` appears at least once
* `tf = n / N` (tf is the 'term frequency' of the word)
* `idf = log(D / m)` (idf is the 'inverse document frequency' of the word)
* `tfidf = tf*idf`

These naming conventions are used in certain places in the codebase, for example in the docstrings for many mapper and reducer functions.
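
As a quick sanity check of the definitions above, here is a minimal, self-contained Python sketch of the same computation. It is not part of the repository (the pipeline computes `n`, `N`, `D`, and `m` incrementally across the mappers and reducers below); the `tfidf` helper and the toy corpus are invented for illustration.

```python
from __future__ import division, print_function
from math import log


def tfidf(w, d, corpus):
    """Return the tfidf score of word w in document d.

    d is a cleaned, stemmed word list; corpus is a list of such lists.
    """
    n = d.count(w)                            # times w appears in d
    N = len(d)                                # length of d
    D = len(corpus)                           # number of documents in the corpus
    m = sum(1 for doc in corpus if w in doc)  # documents containing w
    tf = n / N
    idf = log(D / m)
    return tf * idf


# toy corpus of three already-cleaned documents (invented for this example)
corpus = [['citizens', 'of', 'the', 'union'],
          ['the', 'union', 'endures'],
          ['citizens', 'vote']]
print(tfidf('union', corpus[0], corpus))      # tf = 1/4, idf = log(3/2)
```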
30 changes: 19 additions & 11 deletions contents_mapper.py
@@ -1,19 +1,27 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import os
 import sys
 from map_reduce_utils import clean_text
 
-"""
-(file_contents) --> (file_name) (file_contents)
-
-for each line from stdin consisting of a document in the corpus, emits
-a key-value pair to stdout with a key of the corresponding filename
-and a value of the file contents cleaned with
-map_reduce_utils.clean_text
-"""
-
-for line in sys.stdin:
-    docname = os.environ['mapreduce_map_input_file']
-    contents = clean_text(line)
-    print docname, '\t', ' '.join(map(str, contents))
+
+def map_contents(input=sys.stdin, output=sys.stdout):
+    """
+    (file_contents) --> (file_name) (file_contents)
+
+    for each line from stdin consisting of a document in the corpus, emits
+    a key-value pair to stdout with a key of the corresponding filename
+    and a value of the file contents cleaned with
+    map_reduce_utils.clean_text
+    """
+    template = '{}\t{}'
+    for line in input:
+        docname = os.environ['mapreduce_map_input_file']
+        contents = clean_text(line)
+        result = template.format(docname, ' '.join(map(str, contents)))
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_contents()
27 changes: 17 additions & 10 deletions corp_freq_map.py
@@ -1,16 +1,23 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
 
-"""
-(word file_name) (n N) --> (word) (file_name n N 1)
-
-emits a line for each unique word in each file to be consumed
-by corp_freq_red to find the number of occurrences of each
-unique word throughout the entire corpus.
-"""
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word, docname = key.strip().split()
-    print '%s\t%s %s %s' % (word, docname, value, 1)
+
+def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
+    """
+    (word file_name) (n N) --> (word) (file_name n N 1)
+
+    emits a line for each unique word in each file to be consumed
+    by corp_freq_red to find the number of occurrences of each
+    unique word throughout the entire corpus.
+    """
+    for line in input:
+        key, value = line.strip().split('\t')
+        word, docname = key.strip().split()
+        result = '{0}\t{1} {2} {3}'.format(word, docname, value, 1)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_corpus_frequency()
75 changes: 35 additions & 40 deletions corp_freq_red.py
@@ -1,44 +1,39 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
+from map_reduce_utils import reducer_stream
 
-"""
-(word) (file_name n N 1) --> (word file_name) (n N m)
-
-sums up the number of occurrences of each unique word throughout
-the corpus and emits this sum for each document that the word
-occurs in.
-"""
-
-
-def print_results(count, files):
-    for string in files:
-        print '%s %s' % (string, count)
-
-processed_files = []
-cur_word = None
-cur_count = 0
-
-word = None
-
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word = key.strip()
-    docname, word_count, doc_count, count = value.strip().split()
-    count = int(count)
-    # add document/word combo to processed files
-    processed_combo = '%s %s\t%s %s' % (word, docname, word_count, doc_count)
-    if cur_word == word:
-        cur_count += count
-        processed_files.append(processed_combo)
-    else:
-        if cur_word is not None:
-            print_results(cur_count, processed_files)
-        cur_word = word
-        cur_count = count
-        processed_files = []
-        processed_files.append(processed_combo)
-
-if cur_word is not None:
-    print_results(cur_count, processed_files)
+
+KEYS = ['word']
+VALUES = ['filename', 'freq', 'size', 'count']
+
+
+def reduce_corpus_frequency(input=reducer_stream(KEYS, VALUES),
+                            output=sys.stdout):
+    """
+    (word) (file_name n N 1) --> (word file_name) (n N m)
+
+    sums up the number of occurrences of each unique word throughout
+    the corpus and emits this sum for each document that the word
+    occurs in.
+    """
+    for key, key_stream in input:
+        count = 0
+        values = []
+        for value in key_stream:
+            count += int(value['count'])
+            values.append(value)
+        print_results(values, key['word'], count, output)
+
+
+def print_results(values, word, count, output):
+    template = '{0} {1}\t{2} {3} {4}'
+    for value in values:
+        result = template.format(word, value['filename'],
+                                 value['freq'], value['size'], count)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    reduce_corpus_frequency()
37 changes: 23 additions & 14 deletions cos_sim_map.py
@@ -1,21 +1,30 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
 
-"""
-(word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
-
-for each word common to two documents, removes the word from the
-key/value pair and replaces it with the two filenames so that we can
-sum up the values for each pair of documents in the reducer.
-"""
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2, product = value.strip().split()
-
-    # we want to ensure that (doc1 doc2) and (doc2 doc1) get
-    # sent to the same reducer, so we order them alphabetically
-    if doc1 > doc2:
-        doc1, doc2 = doc2, doc1
-
-    print '%s %s\t%s' % (doc1, doc2, product)
+
+def map_cosine_similarity(input=sys.stdin, output=sys.stdout):
+    """
+    (word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
+
+    for each word common to two documents, removes the word from the
+    key/value pair and replaces it with the two filenames so that we can
+    sum up the values for each pair of documents in the reducer.
+    """
+    template = '{} {}\t{}'
+    for line in input:
+        key, value = line.strip().split('\t')
+        doc1, doc2, product = value.strip().split()
+
+        # we want to ensure that (doc1 doc2) and (doc2 doc1) get
+        # sent to the same reducer, so we order them alphabetically
+        if doc1 > doc2:
+            doc1, doc2 = doc2, doc1
+
+        result = template.format(doc1, doc2, product)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_cosine_similarity()
53 changes: 31 additions & 22 deletions cos_sim_red.py
@@ -1,31 +1,40 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import sys
+import argparse
+from map_reduce_utils import reducer_stream
 
-"""
-(file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
-
-sums up the products of the tfidf values of words common between every
-pair of documents to produce the cosine similarity of the two documents
-"""
-
-cur_sum = 0
-cur_docs = (None, None)  # will become (doc1, doc2)
-
-
-def print_result(doc1, doc2, sum_for_docs):
-    print '%s %s\t%.16f' % (doc1, doc2, sum_for_docs)
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2 = key.strip().split()
-    product = float(value)
-    if (doc1, doc2) == cur_docs:
-        cur_sum += product
-    else:
-        if cur_docs[0] is not None and cur_docs[1] is not None:
-            print_result(cur_docs[0], cur_docs[1], cur_sum)
-        cur_docs = (doc1, doc2)
-        cur_sum = 0
-
-if cur_docs[0] is not None and cur_docs[1] is not None:
-    print_result(cur_docs[0], cur_docs[1], cur_sum)
+
+KEYS = ['file1', 'file2']
+VALUES = ['term']
+
+
+def reduce_cosine_similarity(precision,
+                             input=reducer_stream(KEYS, VALUES),
+                             output=sys.stdout):
+    """
+    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
+
+    sums up the products of the tfidf values of words common between every
+    pair of documents to produce the cosine similarity of the two documents
+    """
+    for key, key_stream in input:
+        sum_for_docs = 0
+        for value in key_stream:
+            term = value['term']
+            sum_for_docs += float(term)
+        print_result(key['file1'], key['file2'],
+                     sum_for_docs, precision, output)
+
+
+def print_result(doc1, doc2, sum_for_docs, precision, output):
+    template = '{0} {1}\t{2:.{3}f}'
+    print(template.format(doc1, doc2, sum_for_docs, precision), file=output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--precision', '-p', dest='precision')
+    precision = int(parser.parse_args().precision)
+    reduce_cosine_similarity(precision)