-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
466 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
language: python | ||
python: | ||
- "2.7" | ||
- "3.3" | ||
script: nosetests |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/usr/bin/env python | ||
|
||
import tests.test_utils as tu | ||
import contents_mapper | ||
import os | ||
import re | ||
|
||
|
||
class TestContentsMapper(tu.MapTestCase):
    """
    Tests the contents mapper against tests/fixtures/contents_mapper.txt.

    Each test runs with the mapreduce_map_input_file environment variable
    set to 'filename' (presumably what the mapper reads to learn the name of
    its input file -- confirm against contents_mapper). The variable is
    installed per test in setUp and restored afterwards, so it does not leak
    into the rest of the test run.
    """

    # environment variable the tests set before the mapper runs
    _INPUT_FILE_VAR = 'mapreduce_map_input_file'

    def __init__(self, *args, **kwargs):
        super(TestContentsMapper, self).__init__(
            contents_mapper.map_contents,
            'tests/fixtures/contents_mapper.txt',
            *args, **kwargs
        )

    def setUp(self):
        """
        Points the input-file variable at 'filename' for the duration of
        one test and registers a cleanup that puts the old value back.
        """
        super(TestContentsMapper, self).setUp()
        previous = os.environ.get(self._INPUT_FILE_VAR)
        os.environ[self._INPUT_FILE_VAR] = 'filename'
        self.addCleanup(self._restore_input_file, previous)

    def _restore_input_file(self, previous):
        """
        Restores the input-file variable to `previous`; None means it was
        unset before the test, so it is removed again.
        """
        if previous is None:
            os.environ.pop(self._INPUT_FILE_VAR, None)
        else:
            os.environ[self._INPUT_FILE_VAR] = previous

    def test_has_correct_number_of_keys_and_values(self):
        """
        ensures that the correct number of keys and values are emitted
        """
        self.has_single_delimiter()
        self.has_n_keys(1)

    def test_only_lowercase_alphabetic(self):
        """
        ensures all content is lowercase alphabetic characters
        """
        # NOTE: besides a-z, space and tab, this character class also
        # admits a literal comma.
        lowercase_only = re.compile('^[a-z, ,\t]+$')
        self.are_all_matches(lowercase_only)

    def test_prepends_filename(self):
        """
        ensures that the filename is prepended to each line emitted
        """
        starts_with_filename = re.compile('^filename')
        self.are_all_matches(starts_with_filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#!/usr/bin/env python | ||
|
||
import tests.test_utils as tu | ||
import corp_freq_map as map | ||
import corp_freq_red as red | ||
import re | ||
|
||
|
||
class TestCorpusFrequencyMapper(tu.MapTestCase):
    """
    Exercises the corpus frequency mapper against its fixture file.
    """

    def __init__(self, *args, **kwargs):
        mapper = map.map_corpus_frequency
        fixture = 'tests/fixtures/corpus_frequency_mapper.txt'
        super(TestCorpusFrequencyMapper, self).__init__(
            mapper, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 1 and 4 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(1, 4)

    def test_appends_one(self):
        """
        checks that every line examined by are_all_matches ends in a 1
        """
        self.are_all_matches(re.compile('.*1$'))

    def test_lines_out_equals_lines_in(self):
        """
        checks that the mapper emits as many lines as it was given
        """
        self.lines_out_equals_lines_in()
|
||
|
||
class TestCorpusFrequencyReducer(tu.ReduceTestCase):
    """
    Exercises the corpus frequency reducer against its fixture file.
    """

    def __init__(self, *args, **kwargs):
        reducer = red.reduce_corpus_frequency
        fixture = 'tests/fixtures/corpus_frequency_reducer.txt'
        super(TestCorpusFrequencyReducer, self).__init__(
            red.KEYS, red.VALUES, reducer, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 3 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(2, 3)

    def test_sums_correct_values(self):
        """
        checks that the emitted sum is the number of documents a word
        occurs in, not the number of occurrences of the word.
        """
        # expected document count for every word the output may contain
        expected_by_word = {'word1': 2, 'word2': 2, 'word3': 1}
        for record in self.run_reducer_tokenize():
            word = record[0][0]
            if word not in expected_by_word:
                self.fail('unknown word in output')
            # the count under test is the third value of the record
            self.assertEqual(int(record[1][2]), expected_by_word[word])

    def test_lines_out_equals_lines_in(self):
        """
        checks that the reducer emits as many lines as it was given
        """
        self.lines_out_equals_lines_in()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/usr/bin/env python | ||
|
||
import tests.test_utils as tu | ||
import cos_sim_map as map | ||
import cos_sim_red as red | ||
|
||
|
||
class TestCosineSimilarityMapper(tu.MapTestCase):
    """ exercises the cosine similarity mapper against its fixture file """

    def __init__(self, *args, **kwargs):
        mapper = map.map_cosine_similarity
        fixture = 'tests/fixtures/cosine_similarity_mapper.txt'
        super(TestCosineSimilarityMapper, self).__init__(
            mapper, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 1 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(2, 1)

    def test_emits_line_for_each_word_in_each_file(self):
        """
        checks that the mapper emits as many lines as it was given
        """
        self.lines_out_equals_lines_in()
|
||
|
||
class TestCosineSimilarityReducer(tu.ReduceTestCase):
    """ exercises the cosine similarity reducer against its fixture file """

    def __init__(self, *args, **kwargs):
        reducer = red.reduce_cosine_similarity
        fixture = 'tests/fixtures/cosine_similarity_reducer.txt'
        super(TestCosineSimilarityReducer, self).__init__(
            red.KEYS, red.VALUES, reducer, fixture, *args, **kwargs
        )
        # arguments passed to every reducer run in this class
        self.default_args = {'precision': 6}

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 1 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        reducer_args = self.default_args
        self.has_correct_number_of_keys_and_values(2, 1, args=reducer_args)

    def test_sums_up_correct_values(self):
        """
        checks the similarity emitted for each pair: '1.000000' for the
        (file1, file2) pair and '1.500000' for every other pair
        """
        records = self.run_reducer_tokenize(args=self.default_args)
        for key, value in records:
            is_first_pair = key[0] == 'file1' and key[1] == 'file2'
            expected = '1.000000' if is_first_pair else '1.500000'
            self.assertEqual(value[0], expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
|
||
from math import log | ||
import tests.test_utils as tu | ||
import map_reduce_utils as mru | ||
import tf_idf_map as map | ||
|
||
|
||
class TestTfIdfMapper(tu.MapTestCase):
    """ exercises the tf-idf mapper against its fixture file """

    def __init__(self, *args, **kwargs):
        mapper = map.map_tf_idf
        fixture = 'tests/fixtures/tf_idf_mapper.txt'
        super(TestTfIdfMapper, self).__init__(
            mapper, fixture, *args, **kwargs
        )
        # arguments passed to the mapper unless a test overrides them
        self.default_args = {'corpus_size': 3, 'precision': 5}

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 1 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        mapper_args = self.default_args
        self.has_correct_number_of_keys_and_values(2, 1, args=mapper_args)

    def test_computes_correct_tfidf_score(self):
        """
        checks the tf-idf value on the first emitted line against a value
        computed here by hand
        """
        mapper_args = self.default_args
        first_line = self.run_mapper(args=mapper_args)[0]
        first_pair = mru.tokenize_key_value_pair(first_line)
        computed_tfidf = float(first_pair[1][0])

        # the constants 7/12 and 2 presumably mirror the first fixture
        # line (term count / document length, documents containing the
        # term) -- confirm against the fixture
        term_frequency = (7.0 / 12.0)
        inverse_doc_frequency = log(
            (float(mapper_args['corpus_size']) / 2.0), 10
        )
        expected_tfidf = round(
            term_frequency * inverse_doc_frequency,
            mapper_args['precision']
        )
        self.assertEqual(expected_tfidf, computed_tfidf)

    def test_has_correcct_precision(self):
        """
        checks that the precision argument is honoured by the mapper
        """
        precision_to_test = 8
        mapper_args = {'precision': precision_to_test, 'corpus_size': 9}
        first_line = self.run_mapper(args=mapper_args)[0]
        emitted_score = mru.tokenize_key_value_pair(first_line)[1][0]
        # the digits after the decimal point are the emitted precision
        fractional_digits = emitted_score.strip().split('.')[1]
        self.assertEqual(precision_to_test, len(fractional_digits))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/usr/bin/env python | ||
|
||
import tests.test_utils as tu | ||
import word_count_map as map | ||
import word_count_red as red | ||
|
||
|
||
class TestWordCountMapper(tu.MapTestCase):
    """ exercises the word count mapper against its fixture file """

    def __init__(self, *args, **kwargs):
        mapper = map.map_word_count
        fixture = 'tests/fixtures/word_count_mapper.txt'
        super(TestWordCountMapper, self).__init__(
            mapper, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 1 and 2 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(1, 2)

    def test_lines_out_equals_lines_in(self):
        """
        checks that a line is emitted for each line of input
        """
        self.lines_out_equals_lines_in()
|
||
|
||
class TestWordCountReducer(tu.ReduceTestCase):
    """ exercises the word count reducer against its fixture file """

    def __init__(self, *args, **kwargs):
        reducer = red.reduce_word_count
        fixture = 'tests/fixtures/word_count_reducer.txt'
        super(TestWordCountReducer, self).__init__(
            red.KEYS, red.VALUES, reducer, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 2 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(2, 2)

    def test_sums_up_correct_values(self):
        """
        checks the sum emitted for each file named in the output
        """
        # expected sum for every filename the output may contain
        expected_by_file = {'file1': 11, 'file2': 16, 'file3': 26}
        for record in self.run_reducer_tokenize():
            filename = record[0][1]
            if filename not in expected_by_file:
                self.fail('unknown filename')
            # the sum under test is the second value of the record
            self.assertEqual(int(record[1][1]), expected_by_file[filename])

    def test_lines_out_equals_lines_in(self):
        """
        checks that the number of lines of output equals the number of
        lines of input
        """
        self.lines_out_equals_lines_in()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/bin/env python | ||
|
||
import re | ||
import tests.test_utils as tu | ||
import word_freq_map as map | ||
import word_freq_red as red | ||
|
||
|
||
class TestWordFrequencyMapper(tu.MapTestCase):
    """
    tests basic functionality of the word frequency mapper
    """

    def __init__(self, *args, **kwargs):
        super(TestWordFrequencyMapper, self).__init__(
            map.map_word_frequency,
            'tests/fixtures/word_frequency_mapper.txt',
            *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        ensures that the correct number of keys and values are emitted
        """
        self.has_correct_number_of_keys_and_values(2, 1)

    def test_emits_line_for_each_word_in_each_file(self):
        """
        ensures that a line is emitted for each word in each file
        """
        # read the fixture and close it before running the mapper; the
        # file does not need to stay open while the mapper runs
        with open(self.default_fixture) as fixture:
            input_lines = fixture.readlines()
        output = self.run_mapper()

        total_tokens = sum(len(line.strip().split()) for line in input_lines)
        # one token per input line is not counted as a word (presumably
        # the leading filename field -- confirm against the fixture)
        expected_lines = total_tokens - len(input_lines)
        self.assertEqual(len(output), expected_lines)

    def test_appends_one(self):
        """
        tests that 1 is appended to each line of output
        """
        ends_with_one = '.*1$'
        self.are_all_matches(re.compile(ends_with_one))
|
||
|
||
class TestWordFrequencyReducer(tu.ReduceTestCase):
    """ exercises the word frequency reducer against its fixture file """

    def __init__(self, *args, **kwargs):
        reducer = red.reduce_word_frequency
        fixture = 'tests/fixtures/word_frequency_reducer.txt'
        super(TestWordFrequencyReducer, self).__init__(
            red.KEYS, red.VALUES, reducer, fixture, *args, **kwargs
        )

    def test_has_correct_number_of_keys_and_values(self):
        """
        checks the key/value layout of the emitted lines: 2 and 1 are
        handed to the shared helper (presumably key count, then value
        count -- confirm against tests.test_utils)
        """
        self.has_correct_number_of_keys_and_values(2, 1)

    def test_sum_of_output_equals_length_of_input(self):
        """
        checks that the sums emitted by the reducer add up to the number
        of lines in the fixture, i.e. every input line is accounted for
        """
        # the first value of each emitted record is its sum
        emitted_total = sum(
            int(record[1][0]) for record in self.run_reducer_tokenize()
        )
        with open(self.default_fixture) as fixture:
            input_line_count = len(fixture.readlines())
        self.assertEqual(emitted_total, input_line_count)
Oops, something went wrong.