Merge branch 'master' of https://github.com/uwsampa/nltk-hadoop
Showing 44 changed files with 1,783 additions and 269 deletions.
The first new file is a Travis CI configuration (the diff view here drops filenames, but the format is that of .travis.yml). It installs the numpy/scipy system packages plus nltk and scikit-learn, then runs the test suite with nosetests:

@@ -0,0 +1,11 @@
+language: python
+python:
+    - "2.7"
+virtualenv:
+    system_site_packages: true
+before_install:
+    - sudo apt-get install -qq python-numpy python-scipy
+install:
+    - "pip install nltk"
+    - "pip install scikit-learn"
+script: nosetests
Next, the mapper that pairs each document with its cleaned contents. The module-level loop with Python 2 print statements becomes a map_contents() function taking explicit input and output streams:

@@ -1,19 +1,27 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import os
 import sys
 from map_reduce_utils import clean_text

-"""
-(file_contents) --> (file_name) (file_contents)
-
-for each line from stdin consisting of a document in the corpus, emits
-a key-value pair to stdout with a key of the corresponding filename
-and a value of the file contents cleaned with
-map_reduce_utils.clean_text
-"""
-
-for line in sys.stdin:
-    docname = os.environ['mapreduce_map_input_file']
-    contents = clean_text(line)
-    print docname, '\t', ' '.join(map(str, contents))
+
+def map_contents(input=sys.stdin, output=sys.stdout):
+    """
+    (file_contents) --> (file_name) (file_contents)
+
+    for each line from stdin consisting of a document in the corpus, emits
+    a key-value pair to stdout with a key of the corresponding filename
+    and a value of the file contents cleaned with
+    map_reduce_utils.clean_text
+    """
+    template = '{}\t{}'
+    for line in input:
+        docname = os.environ['mapreduce_map_input_file']
+        contents = clean_text(line)
+        result = template.format(docname, ' '.join(map(str, contents)))
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_contents()
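Threading input and output through map_contents is what makes the mapper testable off-cluster, which is presumably why the Travis configuration above appears in the same merge. A minimal sketch of exercising it directly; the document path is made up, and clean_text is assumed to return a list of tokens:

import io
import os

# Hadoop streaming sets this variable for each map task; a test has to fake it.
os.environ['mapreduce_map_input_file'] = 'corpus/doc1.txt'  # hypothetical path

fake_in = io.StringIO(u'Some raw document text.\n')
fake_out = io.StringIO()
map_contents(input=fake_in, output=fake_out)
print(fake_out.getvalue())  # one 'filename<TAB>cleaned tokens' line per input line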
The corpus-frequency mapper gets the same treatment, becoming map_corpus_frequency() (the docstring's "occurences" is corrected to "occurrences" here):

@@ -1,16 +1,23 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import sys

-"""
-(word file_name) (n N) --> (word) (file_name n N 1)
-
-emits a line for each unique word in each file to be consumed
-by corp_freq_red to find the number of occurrences of each
-unique word throughout the entire corpus.
-"""
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word, docname = key.strip().split()
-    print '%s\t%s %s %s' % (word, docname, value, 1)
+
+def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
+    """
+    (word file_name) (n N) --> (word) (file_name n N 1)
+
+    emits a line for each unique word in each file to be consumed
+    by corp_freq_red to find the number of occurrences of each
+    unique word throughout the entire corpus.
+    """
+    for line in input:
+        key, value = line.strip().split('\t')
+        word, docname = key.strip().split()
+        result = '{0}\t{1} {2} {3}'.format(word, docname, value, 1)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_corpus_frequency()
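Because the mapper is now a pure function of its streams, the key reshaping is easy to verify in isolation. A quick sketch with a fabricated record (the word 'hello' appearing 3 times in a 120-word document):

import io

fake_in = io.StringIO(u'hello doc1.txt\t3 120\n')
fake_out = io.StringIO()
map_corpus_frequency(input=fake_in, output=fake_out)

# The word alone becomes the key, and a count of 1 is appended for the
# corpus-frequency reducer to sum:
assert fake_out.getvalue() == u'hello\tdoc1.txt 3 120 1\n'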
The corpus-frequency reducer drops its hand-rolled key-grouping loop over sys.stdin in favour of the new reducer_stream helper from map_reduce_utils (the same "occurrences" typo is fixed):

@@ -1,44 +1,39 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import sys
+from map_reduce_utils import reducer_stream

-"""
-(word) (file_name n N 1) --> (word file_name) (n N m)
-
-sums up the number of occurrences of each unique word throughout
-the corpus and emits this sum for each document that the word
-occurs in.
-"""
-
-
-def print_results(count, files):
-    for string in files:
-        print '%s %s' % (string, count)
-
-processed_files = []
-cur_word = None
-cur_count = 0
-
-word = None
-
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    word = key.strip()
-    docname, word_count, doc_count, count = value.strip().split()
-    count = int(count)
-    # add document/word combo to processed files
-    processed_combo = '%s %s\t%s %s' % (word, docname, word_count, doc_count)
-    if cur_word == word:
-        cur_count += count
-        processed_files.append(processed_combo)
-    else:
-        if cur_word is not None:
-            print_results(cur_count, processed_files)
-        cur_word = word
-        cur_count = count
-        processed_files = []
-        processed_files.append(processed_combo)
-
-if cur_word is not None:
-    print_results(cur_count, processed_files)
+
+KEYS = ['word']
+VALUES = ['filename', 'freq', 'size', 'count']
+
+
+def reduce_corpus_frequency(input=reducer_stream(KEYS, VALUES),
+                            output=sys.stdout):
+    """
+    (word) (file_name n N 1) --> (word file_name) (n N m)
+
+    sums up the number of occurrences of each unique word throughout
+    the corpus and emits this sum for each document that the word
+    occurs in.
+    """
+    for key, key_stream in input:
+        count = 0
+        values = []
+        for value in key_stream:
+            count += int(value['count'])
+            values.append(value)
+        print_results(values, key['word'], count, output)
+
+
+def print_results(values, word, count, output):
+    template = '{0} {1}\t{2} {3} {4}'
+    for value in values:
+        result = template.format(word, value['filename'],
+                                 value['freq'], value['size'], count)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    reduce_corpus_frequency()
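reducer_stream itself is not visible in this view of the commit, but the two call sites pin down its contract: it reads sorted mapper output and yields (key_dict, value_iterator) pairs, with fields named by the KEYS and VALUES lists. A speculative sketch of such a helper, for orientation only; the real implementation in map_reduce_utils may differ:

import sys
from itertools import groupby


def reducer_stream(keys, values, source=sys.stdin):
    # Assumes Hadoop streaming has already sorted lines so that equal keys
    # are adjacent, which is what the grouping below relies on.
    def parse(line):
        key_part, value_part = line.rstrip('\n').split('\t')
        return (key_part,
                dict(zip(keys, key_part.split())),
                dict(zip(values, value_part.split())))

    parsed = (parse(line) for line in source)
    for _, group in groupby(parsed, key=lambda item: item[0]):
        rows = list(group)  # simplified; a true stream would avoid this
        yield rows[0][1], (row[2] for row in rows)

One side note on the call sites: input=reducer_stream(KEYS, VALUES) as a default argument is evaluated once at import time, which is harmless for a run-once streaming script but worth knowing when importing these modules in tests.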
The cosine-similarity mapper, likewise wrapped in a map_cosine_similarity() function:

@@ -1,21 +1,30 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import sys

-"""
-(word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
-
-for each word common to two documents, removes the word from the
-key/value pair and replaces it with the two filenames so that we can
-sum up the values for each pair of documents in the reducer.
-"""
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2, product = value.strip().split()
-
-    # we want to ensure that (doc1 doc2) and (doc2 doc1) get
-    # sent to the same reducer, so we order them alphabetically
-    if doc1 > doc2:
-        doc1, doc2 = doc2, doc1
-
-    print '%s %s\t%s' % (doc1, doc2, product)
+
+def map_cosine_similarity(input=sys.stdin, output=sys.stdout):
+    """
+    (word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)
+
+    for each word common to two documents, removes the word from the
+    key/value pair and replaces it with the two filenames so that we can
+    sum up the values for each pair of documents in the reducer.
+    """
+    template = '{} {}\t{}'
+    for line in input:
+        key, value = line.strip().split('\t')
+        doc1, doc2, product = value.strip().split()
+
+        # we want to ensure that (doc1 doc2) and (doc2 doc1) get
+        # sent to the same reducer, so we order them alphabetically
+        if doc1 > doc2:
+            doc1, doc2 = doc2, doc1
+
+        result = template.format(doc1, doc2, product)
+        print(result, file=output)
+
+
+if __name__ == '__main__':
+    map_cosine_similarity()
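The alphabetical swap is what keeps the pair key stable: (doc1, doc2) and (doc2, doc1) must land on the same reducer. A small check with fabricated records:

import io

fake_in = io.StringIO(u'word1\tb.txt a.txt 0.125\n'
                      u'word2\ta.txt b.txt 0.250\n')
fake_out = io.StringIO()
map_cosine_similarity(input=fake_in, output=fake_out)

# Both products emerge under the same, alphabetically ordered key:
assert fake_out.getvalue() == u'a.txt b.txt\t0.125\na.txt b.txt\t0.250\n'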
Finally, the cosine-similarity reducer moves to reducer_stream as well, and the output precision, previously hard-coded as %.16f, becomes a --precision command-line argument:

@@ -1,31 +1,40 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import sys
+import argparse
+from map_reduce_utils import reducer_stream

-"""
-(file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
-
-sums up the products of the tfidf values of words common between every
-pair of documents to produce the cosine similarity of the two documents
-"""
-
-cur_sum = 0
-cur_docs = (None, None)  # will become (doc1, doc2)
-
-
-def print_result(doc1, doc2, sum_for_docs):
-    print '%s %s\t%.16f' % (doc1, doc2, sum_for_docs)
-
-for line in sys.stdin:
-    key, value = line.strip().split('\t')
-    doc1, doc2 = key.strip().split()
-    product = float(value)
-    if (doc1, doc2) == cur_docs:
-        cur_sum += product
-    else:
-        if cur_docs[0] is not None and cur_docs[1] is not None:
-            print_result(cur_docs[0], cur_docs[1], cur_sum)
-        cur_docs = (doc1, doc2)
-        cur_sum = 0
-
-if cur_docs[0] is not None and cur_docs[1] is not None:
-    print_result(cur_docs[0], cur_docs[1], cur_sum)
+
+KEYS = ['file1', 'file2']
+VALUES = ['term']
+
+
+def reduce_cosine_similarity(precision,
+                             input=reducer_stream(KEYS, VALUES),
+                             output=sys.stdout):
+    """
+    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))
+
+    sums up the products of the tfidf values of words common between every
+    pair of documents to produce the cosine similarity of the two documents
+    """
+    for key, key_stream in input:
+        sum_for_docs = 0
+        for value in key_stream:
+            term = value['term']
+            sum_for_docs += float(term)
+        print_result(key['file1'], key['file2'],
+                     sum_for_docs, precision, output)
+
+
+def print_result(doc1, doc2, sum_for_docs, precision, output):
+    template = '{0} {1}\t{2:.{3}f}'
+    print(template.format(doc1, doc2, sum_for_docs, precision), file=output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--precision', '-p', dest='precision')
+    precision = int(parser.parse_args().precision)
+    reduce_cosine_similarity(precision)
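Making precision an explicit argument also makes the reducer callable from tests; on the command line, passing something like --precision 16 would reproduce the old hard-coded %.16f output. A sketch with a hand-built key stream standing in for reducer_stream (all data hypothetical):

import sys

def fake_stream():
    # One (key, values) group, shaped like reducer_stream's output.
    key = {'file1': 'a.txt', 'file2': 'b.txt'}
    values = iter([{'term': '0.125'}, {'term': '0.250'}])
    yield key, values

reduce_cosine_similarity(3, input=fake_stream(), output=sys.stdout)
# prints 'a.txt b.txt\t0.375'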