diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..7e21d46 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: python +python: + - "2.7" + - "3.3" +script: nosetests diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/contents_mapper_test.py b/tests/unit_tests/contents_mapper_test.py new file mode 100644 index 0000000..858e901 --- /dev/null +++ b/tests/unit_tests/contents_mapper_test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +import tests.test_utils as tu +import contents_mapper +import os +import re + + +class TestContentsMapper(tu.MapTestCase): + """ tests contents mapper """ + + def __init__(self, *args, **kwargs): + super(TestContentsMapper, self).__init__( + contents_mapper.map_contents, + 'tests/fixtures/contents_mapper.txt', + *args, **kwargs + ) + os.environ['mapreduce_map_input_file'] = 'filename' + + def test_has_correct_number_of_keys_and_values(self): + "ensures that the correct number of keys and values are emitted" + self.has_single_delimiter() + self.has_n_keys(1) + + def test_only_lowercase_alphabetic(self): + "ensures all content is lowercase alphabetic characters" + numerals = re.compile('^[a-z, ,\t]+$') + self.are_all_matches(numerals) + + def test_prepends_filename(self): + "ensures that the filename is prepended to each line emitted" + contains_filename = re.compile('^filename') + self.are_all_matches(contains_filename) diff --git a/tests/unit_tests/corpus_frequency_test.py b/tests/unit_tests/corpus_frequency_test.py new file mode 100644 index 0000000..341e387 --- /dev/null +++ b/tests/unit_tests/corpus_frequency_test.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python + +import tests.test_utils as tu +import corp_freq_map as map +import corp_freq_red as red +import re + + +class TestCorpusFrequencyMapper(tu.MapTestCase): + """ + tests corpus frequency mapper + """ + + def __init__(self, *args, **kwargs): + 
super(TestCorpusFrequencyMapper, self).__init__( + map.map_corpus_frequency, + 'tests/fixtures/corpus_frequency_mapper.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + tests that values emitted have correct number of keys and values + """ + self.has_correct_number_of_keys_and_values(1, 4) + + def test_appends_one(self): + """ + tests that 1 is appended to each line of input + """ + ends_with_one = '.*1$' + self.are_all_matches(re.compile(ends_with_one)) + + def test_lines_out_equals_lines_in(self): + """ + tests that the number of lines of input and output are equal + """ + self.lines_out_equals_lines_in() + + +class TestCorpusFrequencyReducer(tu.ReduceTestCase): + """ + tests basic functionality of the corpus frequency reducer + """ + + def __init__(self, *args, **kwargs): + super(TestCorpusFrequencyReducer, self).__init__( + red.KEYS, red.VALUES, + red.reduce_corpus_frequency, + 'tests/fixtures/corpus_frequency_reducer.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 3) + + def test_sums_correct_values(self): + """ + ensure that the sum is the number of documents a word occurs in, + not the number of occurrences of the word. 
+ """ + for line in self.run_reducer_tokenize(): + if line[0][0] == 'word1': + self.assertEqual(int(line[1][2]), 2) + elif line[0][0] == 'word2': + self.assertEqual(int(line[1][2]), 2) + elif line[0][0] == 'word3': + self.assertEqual(int(line[1][2]), 1) + else: + self.fail('unknown word in output') + + def test_lines_out_equals_lines_in(self): + """ + tests that the number of lines of input and output are equal + """ + self.lines_out_equals_lines_in() diff --git a/tests/unit_tests/cosine_similarity_test.py b/tests/unit_tests/cosine_similarity_test.py new file mode 100644 index 0000000..b097104 --- /dev/null +++ b/tests/unit_tests/cosine_similarity_test.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +import tests.test_utils as tu +import cos_sim_map as map +import cos_sim_red as red + + +class TestCosineSimilarityMapper(tu.MapTestCase): + """ tests the cosine similarity mapper """ + + def __init__(self, *args, **kwargs): + super(TestCosineSimilarityMapper, self).__init__( + map.map_cosine_similarity, + 'tests/fixtures/cosine_similarity_mapper.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the same number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 1) + + def test_emits_line_for_each_word_in_each_file(self): + """ + ensures that the same number of lines of input and output are emitted + """ + self.lines_out_equals_lines_in() + + +class TestCosineSimilarityReducer(tu.ReduceTestCase): + """ tests the cosine similarity reducer """ + + def __init__(self, *args, **kwargs): + super(TestCosineSimilarityReducer, self).__init__( + red.KEYS, red.VALUES, + red.reduce_cosine_similarity, + 'tests/fixtures/cosine_similarity_reducer.txt', + *args, **kwargs + ) + self.default_args = {'precision': 6} + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the same number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 1, 
args=self.default_args) + + def test_sums_up_correct_values(self): + """ + ensures that the correct sum is computed + """ + for key, value in self.run_reducer_tokenize(args=self.default_args): + if key[0] == 'file1' and key[1] == 'file2': + self.assertEqual(value[0], '1.000000') + else: + self.assertEqual(value[0], '1.500000') diff --git a/tests/unit_tests/tf_idf_test.py b/tests/unit_tests/tf_idf_test.py new file mode 100644 index 0000000..73f9784 --- /dev/null +++ b/tests/unit_tests/tf_idf_test.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +from math import log +import tests.test_utils as tu +import map_reduce_utils as mru +import tf_idf_map as map + + +class TestTfIdfMapper(tu.MapTestCase): + """ tests the tf-idf mapper """ + + def __init__(self, *args, **kwargs): + super(TestTfIdfMapper, self).__init__( + map.map_tf_idf, + 'tests/fixtures/tf_idf_mapper.txt', + *args, **kwargs + ) + self.default_args = {'corpus_size': 3, 'precision': 5} + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 1, args=self.default_args) + + def test_computes_correct_tfidf_score(self): + """ + tests that the correct tfidf value is emitted + """ + results = self.run_mapper(args=self.default_args) + results = mru.tokenize_key_value_pair(results[0]) + computed_tfidf = float(results[1][0]) + expected_tf = (7.0 / 12.0) + expected_idf = log((float(self.default_args['corpus_size']) / 2.0), 10) + expected_tfidf = expected_tf * expected_idf + expected_tfidf = round(expected_tfidf, self.default_args['precision']) + self.assertEqual(expected_tfidf, computed_tfidf) + + def test_has_correcct_precision(self): + """ + tests to ensure that the precision argument is respected. + """ + # we split on '.' 
and then check the length of the string after + # the period + precision_to_test = 8 + results = self.run_mapper(args={'precision': precision_to_test, + 'corpus_size': 9}) + result = mru.tokenize_key_value_pair(results[0]) + computed_tfidf = result[1][0] + computed_precision = len(computed_tfidf.strip().split('.')[1]) + self.assertEqual(precision_to_test, computed_precision) diff --git a/tests/unit_tests/word_count_test.py b/tests/unit_tests/word_count_test.py new file mode 100644 index 0000000..55e9e1f --- /dev/null +++ b/tests/unit_tests/word_count_test.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +import tests.test_utils as tu +import word_count_map as map +import word_count_red as red + + +class TestWordCountMapper(tu.MapTestCase): + """ tests the word count mapper """ + + def __init__(self, *args, **kwargs): + super(TestWordCountMapper, self).__init__( + map.map_word_count, + 'tests/fixtures/word_count_mapper.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are returned + """ + self.has_correct_number_of_keys_and_values(1, 2) + + def test_lines_out_equals_lines_in(self): + """ + ensures that a line is emitted for each line of input + """ + self.lines_out_equals_lines_in() + + +class TestWordCountReducer(tu.ReduceTestCase): + """ tests the word count reducer """ + + def __init__(self, *args, **kwargs): + super(TestWordCountReducer, self).__init__( + red.KEYS, red.VALUES, + red.reduce_word_count, + 'tests/fixtures/word_count_reducer.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 2) + + def test_sums_up_correct_values(self): + """ + ensures that the correct sum is calculated + """ + for result in self.run_reducer_tokenize(): + if result[0][1] == 'file1': + self.assertEqual(int(result[1][1]), 11) + elif 
result[0][1] == 'file2': + self.assertEqual(int(result[1][1]), 16) + elif result[0][1] == 'file3': + self.assertEqual(int(result[1][1]), 26) + else: + self.fail('unknown filename') + + def test_lines_out_equals_lines_in(self): + """ + test to ensure that the number of lines of input is equal to + the number of lines of output. + """ + self.lines_out_equals_lines_in() diff --git a/tests/unit_tests/word_frequency_test.py b/tests/unit_tests/word_frequency_test.py new file mode 100644 index 0000000..610eab6 --- /dev/null +++ b/tests/unit_tests/word_frequency_test.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +import re +import tests.test_utils as tu +import word_freq_map as map +import word_freq_red as red + + +class TestWordFrequencyMapper(tu.MapTestCase): + """ + tests basic functionality of the word frequency mapper + """ + + def __init__(self, *args, **kwargs): + super(TestWordFrequencyMapper, self).__init__( + map.map_word_frequency, + 'tests/fixtures/word_frequency_mapper.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 1) + + def test_emits_line_for_each_word_in_each_file(self): + """ + ensures that a line is emitted for each word in each file + """ + with open(self.default_fixture) as f: + output = self.run_mapper() + input = f.readlines() + num_files = len(input) + total_words = 0 + for line in input: + total_words += len(line.strip().split()) + self.assertEqual(len(output), total_words - num_files) + + def test_appends_one(self): + """ + tests that 1 is appended to each line of output + """ + ends_with_one = '.*1$' + self.are_all_matches(re.compile(ends_with_one)) + + +class TestWordFrequencyReducer(tu.ReduceTestCase): + """ tests the word frequency reducer """ + + def __init__(self, *args, **kwargs): + super(TestWordFrequencyReducer, self).__init__( + red.KEYS, red.VALUES, + 
red.reduce_word_frequency, + 'tests/fixtures/word_frequency_reducer.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(2, 1) + + def test_sum_of_output_equals_length_of_input(self): + """ + tests that each line is accounted for in the sum produced as output + """ + output_total_sum = 0 + for line in self.run_reducer_tokenize(): + output_total_sum += int(line[1][0]) + with open(self.default_fixture) as f: + input_len = len(f.readlines()) + self.assertEqual(output_total_sum, input_len) diff --git a/tests/unit_tests/word_join_test.py b/tests/unit_tests/word_join_test.py new file mode 100644 index 0000000..a5ca8e8 --- /dev/null +++ b/tests/unit_tests/word_join_test.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +import tests.test_utils as tu +import word_join_map as map +import word_join_red as red + + +class TestWordJoinMapper(tu.MapTestCase): + """ tests the word join mapper """ + + def __init__(self, *args, **kwargs): + super(TestWordJoinMapper, self).__init__( + map.map_word_join, + 'tests/fixtures/word_join_mapper.txt', + *args, **kwargs + ) + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and values are emitted + """ + self.has_correct_number_of_keys_and_values(1, 2) + + def test_lines_out_equals_lines_in(self): + """ + ensures that the number of lines of input and output are equal + """ + self.lines_out_equals_lines_in() + + +class TestWordJoinReducer(tu.ReduceTestCase): + """ tests the word join reducer """ + + def __init__(self, *args, **kwargs): + super(TestWordJoinReducer, self).__init__( + red.KEYS, red.VALUES, + red.reduce_word_join, + 'tests/fixtures/word_join_reducer.txt', + *args, **kwargs + ) + self.default_args = {'precision': 6} + + def test_has_correct_number_of_keys_and_values(self): + """ + ensures that the correct number of keys and 
values are emitted + """ + self.has_correct_number_of_keys_and_values(1, 3, args=self.default_args) + + def test_emits_correct_number_of_lines(self): + """ + ensures that the correct number of lines are emitted + """ + output_size = len(self.run_reducer(args=self.default_args)) + self.assertEqual(output_size, 8) + + def test_correct_values_are_computed(self): + """ + ensures that the correct values are calculated for a small + sample data set. + """ + for result in self.run_reducer_tokenize(args=self.default_args): + if result[0][0] == 'word1': + if result[1][0] == 'file1': + if result[1][1] == 'file2': + self.assertEqual(float(result[1][2]), .5) + else: + self.fail('wrong filename') + elif result[1][0] == 'file2': + if result[1][1] == 'file1': + self.assertEqual(float(result[1][2]), .5) + else: + self.fail('wrong filename') + else: + self.fail('wrong filename') + elif result[0][0] == 'word2': + if result[1][0] == 'file1': + if result[1][1] == 'file2': + self.assertEqual(float(result[1][2]), .25) + elif result[1][1] == 'file3': + self.assertEqual(float(result[1][2]), .05) + else: + self.fail('wrong filename') + elif result[1][0] == 'file2': + if result[1][1] == 'file1': + self.assertEqual(float(result[1][2]), .25) + elif result[1][1] == 'file3': + self.assertEqual(float(result[1][2]), .05) + else: + self.fail('wrong filename') + elif result[1][0] == 'file3': + if result[1][1] == 'file1': + self.assertEqual(float(result[1][2]), .05) + elif result[1][1] == 'file2': + self.assertEqual(float(result[1][2]), .05) + else: + self.fail('wrong filename') + else: + self.fail('wrong filename') + else: + self.fail('wrong word in output')