added unit tests

uwsampa · Jan 15, 2015 · fd85fc5 · fd85fc5
1 parent 78a0c73
commit fd85fc5
Show file tree

Hide file tree

Showing 9 changed files with 466 additions and 0 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,5 @@
+language: python
+python:
+  - "2.7"
+  - "3.3"
+script: nosetests
diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py
diff --git a/tests/unit_tests/contents_mapper_test.py b/tests/unit_tests/contents_mapper_test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+import tests.test_utils as tu
+import contents_mapper
+import os
+import re
+
+
+class TestContentsMapper(tu.MapTestCase):
+    """
+    tests contents mapper
+
+    hands contents_mapper.map_contents and the fixture
+    tests/fixtures/contents_mapper.txt to the base test case.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """ binds the mapper under test and its fixture to the base class """
+        super(TestContentsMapper, self).__init__(
+            contents_mapper.map_contents,
+            'tests/fixtures/contents_mapper.txt',
+            *args, **kwargs
+        )
+        # NOTE(review): presumably the mapper reads this variable to learn
+        # the name of its input file -- confirm against contents_mapper.
+        # test_prepends_filename expects the value 'filename' as a prefix.
+        os.environ['mapreduce_map_input_file'] = 'filename'
+
+    def test_has_correct_number_of_keys_and_values(self):
+        "ensures that the correct number of keys and values are emitted"
+        self.has_single_delimiter()
+        self.has_n_keys(1)
+
+    def test_only_lowercase_alphabetic(self):
+        "ensures all content is lowercase alphabetic characters"
+        # Commas are literal characters inside a character class, not
+        # separators, so the earlier pattern '[a-z, ,\t]' also accepted
+        # ','. Only lowercase letters, space and tab are allowed here.
+        lowercase_only = re.compile(r'^[a-z \t]+$')
+        self.are_all_matches(lowercase_only)
+
+    def test_prepends_filename(self):
+        "ensures that the filename is prepended to each line emitted"
+        contains_filename = re.compile('^filename')
+        self.are_all_matches(contains_filename)
diff --git a/tests/unit_tests/corpus_frequency_test.py b/tests/unit_tests/corpus_frequency_test.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+import tests.test_utils as tu
+import corp_freq_map as map
+import corp_freq_red as red
+import re
+
+
+class TestCorpusFrequencyMapper(tu.MapTestCase):
+    """
+    checks the output of the corpus frequency mapper
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the mapper under test and its fixture to the base class
+        """
+        mapper = map.map_corpus_frequency
+        fixture = 'tests/fixtures/corpus_frequency_mapper.txt'
+        super(TestCorpusFrequencyMapper, self).__init__(
+            mapper, fixture, *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (1, 4)
+        """
+        self.has_correct_number_of_keys_and_values(1, 4)
+
+    def test_appends_one(self):
+        """
+        checks that a 1 has been appended to every line
+        """
+        trailing_one = re.compile('.*1$')
+        self.are_all_matches(trailing_one)
+
+    def test_lines_out_equals_lines_in(self):
+        """
+        checks that as many lines come out as went in
+        """
+        self.lines_out_equals_lines_in()
+
+
+class TestCorpusFrequencyReducer(tu.ReduceTestCase):
+    """
+    checks the output of the corpus frequency reducer
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the reducer under test, its key/value layout and its
+        fixture to the base class
+        """
+        fixture = 'tests/fixtures/corpus_frequency_reducer.txt'
+        super(TestCorpusFrequencyReducer, self).__init__(
+            red.KEYS, red.VALUES, red.reduce_corpus_frequency, fixture,
+            *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 3)
+        """
+        self.has_correct_number_of_keys_and_values(2, 3)
+
+    def test_sums_correct_values(self):
+        """
+        the emitted sum for a word must be the number of documents the
+        word occurs in, not the number of occurrences of the word.
+        """
+        # expected sum for each word that may appear in the output
+        expected_sums = {'word1': 2, 'word2': 2, 'word3': 1}
+        for record in self.run_reducer_tokenize():
+            word = record[0][0]
+            if word not in expected_sums:
+                self.fail('unknown word in output')
+            self.assertEqual(int(record[1][2]), expected_sums[word])
+
+    def test_lines_out_equals_lines_in(self):
+        """
+        checks that as many lines come out as went in
+        """
+        self.lines_out_equals_lines_in()
diff --git a/tests/unit_tests/cosine_similarity_test.py b/tests/unit_tests/cosine_similarity_test.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import tests.test_utils as tu
+import cos_sim_map as map
+import cos_sim_red as red
+
+
+class TestCosineSimilarityMapper(tu.MapTestCase):
+    """ checks the output of the cosine similarity mapper """
+
+    def __init__(self, *args, **kwargs):
+        """ binds the mapper under test and its fixture to the base class """
+        mapper = map.map_cosine_similarity
+        fixture = 'tests/fixtures/cosine_similarity_mapper.txt'
+        super(TestCosineSimilarityMapper, self).__init__(
+            mapper, fixture, *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 1)
+        """
+        self.has_correct_number_of_keys_and_values(2, 1)
+
+    def test_emits_line_for_each_word_in_each_file(self):
+        """
+        checks that as many lines come out as went in
+        """
+        self.lines_out_equals_lines_in()
+
+
+class TestCosineSimilarityReducer(tu.ReduceTestCase):
+    """ checks the output of the cosine similarity reducer """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the reducer under test, its key/value layout and its
+        fixture to the base class
+        """
+        fixture = 'tests/fixtures/cosine_similarity_reducer.txt'
+        super(TestCosineSimilarityReducer, self).__init__(
+            red.KEYS, red.VALUES, red.reduce_cosine_similarity, fixture,
+            *args, **kwargs
+        )
+        # arguments handed to the reducer by every test in this class
+        self.default_args = {'precision': 6}
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 1)
+        """
+        self.has_correct_number_of_keys_and_values(2, 1, args=self.default_args)
+
+    def test_sums_up_correct_values(self):
+        """
+        checks the emitted sum for every pair of files
+        """
+        for key, value in self.run_reducer_tokenize(args=self.default_args):
+            # the (file1, file2) pair has its own expected value; every
+            # other pair in the output is held to the same second value
+            is_file1_file2 = key[0] == 'file1' and key[1] == 'file2'
+            expected = '1.000000' if is_file1_file2 else '1.500000'
+            self.assertEqual(value[0], expected)
diff --git a/tests/unit_tests/tf_idf_test.py b/tests/unit_tests/tf_idf_test.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+from math import log
+import tests.test_utils as tu
+import map_reduce_utils as mru
+import tf_idf_map as map
+
+
+class TestTfIdfMapper(tu.MapTestCase):
+    """ checks the output of the tf-idf mapper """
+
+    def __init__(self, *args, **kwargs):
+        """ binds the mapper under test and its fixture to the base class """
+        mapper = map.map_tf_idf
+        fixture = 'tests/fixtures/tf_idf_mapper.txt'
+        super(TestTfIdfMapper, self).__init__(
+            mapper, fixture, *args, **kwargs
+        )
+        # arguments handed to the mapper unless a test supplies its own
+        self.default_args = {'corpus_size': 3, 'precision': 5}
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 1)
+        """
+        self.has_correct_number_of_keys_and_values(2, 1, args=self.default_args)
+
+    def test_computes_correct_tfidf_score(self):
+        """
+        checks the tf-idf value on the first emitted line
+        """
+        first_line = self.run_mapper(args=self.default_args)[0]
+        key_value = mru.tokenize_key_value_pair(first_line)
+        actual = float(key_value[1][0])
+
+        corpus_size = float(self.default_args['corpus_size'])
+        precision = self.default_args['precision']
+        # NOTE(review): 7/12 and 2 presumably come from the first line of
+        # the fixture -- confirm against tests/fixtures/tf_idf_mapper.txt
+        tf = 7.0 / 12.0
+        idf = log(corpus_size / 2.0, 10)
+        expected = round(tf * idf, precision)
+        self.assertEqual(expected, actual)
+
+    def test_has_correcct_precision(self):
+        """
+        checks that the precision argument is honored.
+        """
+        wanted_precision = 8
+        mapper_args = {'precision': wanted_precision, 'corpus_size': 9}
+        first_line = self.run_mapper(args=mapper_args)[0]
+        tfidf_text = mru.tokenize_key_value_pair(first_line)[1][0]
+        # the digits after the '.' are what the precision controls, so
+        # their number is compared with the requested precision
+        fraction_digits = tfidf_text.strip().split('.')[1]
+        self.assertEqual(wanted_precision, len(fraction_digits))
diff --git a/tests/unit_tests/word_count_test.py b/tests/unit_tests/word_count_test.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import tests.test_utils as tu
+import word_count_map as map
+import word_count_red as red
+
+
+class TestWordCountMapper(tu.MapTestCase):
+    """ checks the output of the word count mapper """
+
+    def __init__(self, *args, **kwargs):
+        """ binds the mapper under test and its fixture to the base class """
+        mapper = map.map_word_count
+        fixture = 'tests/fixtures/word_count_mapper.txt'
+        super(TestWordCountMapper, self).__init__(
+            mapper, fixture, *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (1, 2)
+        """
+        self.has_correct_number_of_keys_and_values(1, 2)
+
+    def test_lines_out_equals_lines_in(self):
+        """
+        checks that a line is emitted for each line of input
+        """
+        self.lines_out_equals_lines_in()
+
+
+class TestWordCountReducer(tu.ReduceTestCase):
+    """ checks the output of the word count reducer """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the reducer under test, its key/value layout and its
+        fixture to the base class
+        """
+        fixture = 'tests/fixtures/word_count_reducer.txt'
+        super(TestWordCountReducer, self).__init__(
+            red.KEYS, red.VALUES, red.reduce_word_count, fixture,
+            *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 2)
+        """
+        self.has_correct_number_of_keys_and_values(2, 2)
+
+    def test_sums_up_correct_values(self):
+        """
+        checks the sum emitted for each file
+        """
+        # expected sum for each filename that may appear in the output
+        expected_sums = {'file1': 11, 'file2': 16, 'file3': 26}
+        for record in self.run_reducer_tokenize():
+            filename = record[0][1]
+            if filename not in expected_sums:
+                self.fail('unknown filename')
+            self.assertEqual(int(record[1][1]), expected_sums[filename])
+
+    def test_lines_out_equals_lines_in(self):
+        """
+        checks that the number of lines of output equals the number of
+        lines of input.
+        """
+        self.lines_out_equals_lines_in()
diff --git a/tests/unit_tests/word_frequency_test.py b/tests/unit_tests/word_frequency_test.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+import re
+import tests.test_utils as tu
+import word_freq_map as map
+import word_freq_red as red
+
+
+class TestWordFrequencyMapper(tu.MapTestCase):
+    """
+    tests basic functionality of the word frequency mapper
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the mapper under test and its fixture to the base class
+        """
+        super(TestWordFrequencyMapper, self).__init__(
+            map.map_word_frequency,
+            'tests/fixtures/word_frequency_mapper.txt',
+            *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        ensures that the correct number of keys and values are emitted
+        """
+        self.has_correct_number_of_keys_and_values(2, 1)
+
+    def test_emits_line_for_each_word_in_each_file(self):
+        """
+        ensures that a line is emitted for each word in each file
+        """
+        # read the fixture up front so the file is closed again before
+        # the mapper runs; the name avoids shadowing the builtin input()
+        with open(self.default_fixture) as f:
+            fixture_lines = f.readlines()
+        output = self.run_mapper()
+        total_tokens = sum(
+            len(line.strip().split()) for line in fixture_lines
+        )
+        # one token per fixture line is not counted as a word
+        # NOTE(review): presumably the leading filename -- confirm
+        # against the fixture
+        self.assertEqual(len(output), total_tokens - len(fixture_lines))
+
+    def test_appends_one(self):
+        """
+        tests that 1 is appended to each line of output
+        """
+        ends_with_one = '.*1$'
+        self.are_all_matches(re.compile(ends_with_one))
+
+
+class TestWordFrequencyReducer(tu.ReduceTestCase):
+    """ checks the output of the word frequency reducer """
+
+    def __init__(self, *args, **kwargs):
+        """
+        binds the reducer under test, its key/value layout and its
+        fixture to the base class
+        """
+        fixture = 'tests/fixtures/word_frequency_reducer.txt'
+        super(TestWordFrequencyReducer, self).__init__(
+            red.KEYS, red.VALUES, red.reduce_word_frequency, fixture,
+            *args, **kwargs
+        )
+
+    def test_has_correct_number_of_keys_and_values(self):
+        """
+        checks the emitted key and value counts against (2, 1)
+        """
+        self.has_correct_number_of_keys_and_values(2, 1)
+
+    def test_sum_of_output_equals_length_of_input(self):
+        """
+        checks that the emitted sums add up to the number of input lines
+        """
+        emitted = self.run_reducer_tokenize()
+        output_sum = sum(int(record[1][0]) for record in emitted)
+        with open(self.default_fixture) as fixture:
+            line_count = len(fixture.readlines())
+        self.assertEqual(output_sum, line_count)