Removed the dropping of extra rows/columns; added more test coverage

ldhtnp · Oct 9, 2024 · 2a4fabf
1 parent cc99682
commit 2a4fabf
Show file tree

Hide file tree

Showing 17 changed files with 224 additions and 119 deletions.
diff --git a/pvaccompare/compare_tools/comparison_router.py b/pvaccompare/compare_tools/comparison_router.py
@@ -16,7 +16,9 @@ def find_file(results_folder, subfolder, pattern):
     return files[0] if files else None
 
 
-def write_header(output_file, aggregated_columns, unaggregated_columns, reference_match_columns):
+def write_header(
+    output_file, aggregated_columns, unaggregated_columns, reference_match_columns
+):
     """
     Purpose:    Writes the report generation date and time to the top of the output file
     Modifies:   Nothing
@@ -44,7 +46,9 @@ def run_comparison(
     Returns:    None
     """
     output_file = output_file + "_" + prefix.replace("/", "_") + ".tsv"
-    write_header(output_file, aggregated_columns, unaggregated_columns, reference_match_columns)
+    write_header(
+        output_file, aggregated_columns, unaggregated_columns, reference_match_columns
+    )
 
     if "pVACseq" not in prefix:
         yml1_path = find_file(results_folder1, prefix + "/log", "inputs.yml")

diff --git a/pvaccompare/comparisons/compare_aggregated_tsv.py b/pvaccompare/comparisons/compare_aggregated_tsv.py
@@ -1,4 +1,5 @@
 from run_utils import *
+import logging
 
 
 class CompareAggregatedTSV:
@@ -12,23 +13,21 @@ def __init__(self, input_file1, input_file2, output_file, columns_to_compare):
         self.ID_replacement_cols = ["Gene", "AA Change"]
         self.columns_to_compare = columns_to_compare
 
-    def check_id(self, cols1_to_drop, cols2_to_drop):
+    def check_id(self):
         """
         Purpose:    Replace ID with Gene-AA_change if needed
         Modifies:   self.contains_id, self.replaced_id
         Returns:    None
         """
-        if "ID" in cols1_to_drop or "ID" in cols2_to_drop:
+        if "ID" not in self.df1.columns or "ID" not in self.df2.columns:
             self.contains_id = False
-
-        if not self.contains_id:
             can_replace = True
             for col in self.ID_replacement_cols:
                 if col not in self.df1.columns or col not in self.df2.columns:
                     can_replace = False
             if can_replace:
                 self.combine_gene_and_AA_change()
-                print("\u2022", "Replaced ID with Gene and AA Change")
+                logging.info("\u2022 Replaced ID with Gene and AA Change")
                 self.replaced_id = True
 
     def combine_gene_and_AA_change(self):

diff --git a/pvaccompare/comparisons/compare_reference_matches_tsv.py b/pvaccompare/comparisons/compare_reference_matches_tsv.py
@@ -1,4 +1,5 @@
 from run_utils import *
+import logging
 
 
 class CompareReferenceMatchesTSV:
@@ -53,15 +54,15 @@ def check_duplicate_ids(self):
 
         if max_hits_file1 > 1 or max_hits_file2 > 1:
             if max_hits_file1 > 1 and max_hits_file2 > 1:
-                print(
+                logging.error(
                     "ERROR: Duplicate unique records were found in both files. Writing number of hits only."
                 )
             elif max_hits_file1 > 1:
-                print(
+                logging.error(
                     "ERROR: Duplicate unique records were found in file 1. Writing number of hits only."
                 )
             else:
-                print(
+                logging.error(
                     "ERROR: Duplicate unique records were found in file 2. Writing number of hits only."
                 )
             return True

diff --git a/pvaccompare/run.py b/pvaccompare/run.py
@@ -5,7 +5,7 @@
 logging.basicConfig(level=logging.DEBUG, format="%(message)s")
 
 # TODO: Speed up identical file comparison
-# TODO: Implement tests
+# TODO: Add line numbers to output differences
 
 
 def define_parser():

diff --git a/pvaccompare/run_utils.py b/pvaccompare/run_utils.py
@@ -37,35 +37,30 @@ def check_column_formatting(df1, df2):
                 break
 
 
-def output_dropped_cols(cols1_to_drop, cols2_to_drop):
+def output_dropped_cols(df1, df2, original_columns):
     """
     Purpose:    Outputs the dropped comparison columns to the terminal and creates a columns dropped message for the generated report
     Modifies:   Nothing
     Returns:    String columns_dropped_message
     """
     columns_dropped_message = ""
-    for col in cols1_to_drop:
-        if col in cols2_to_drop:
+    for col in original_columns:
+        if col not in df1.columns and col not in df2.columns:
             logging.info(
-                "\u2022 Comparison dropped: '%s' is not present in either file", col
+                "\u2022 Column dropped: '%s' is not present in either file", col
             )
             columns_dropped_message += (
-                f"Comparison dropped: '{col}' is not present in either file\n"
-            )
-        else:
-            logging.info(
-                "\u2022 Comparison dropped: '%s' is only present in file 1", col
+                f"Column dropped: '{col}' is not present in either file\n"
             )
+        elif col not in df1.columns:
+            logging.info("\u2022 Column dropped: '%s' is only present in file 2", col)
             columns_dropped_message += (
-                f"Comparison dropped: '{col}' is only present in file 1\n"
-            )
-    for col in cols2_to_drop:
-        if col not in cols1_to_drop:
-            logging.info(
-                "\u2022 Comparison dropped: '%s' is only present in file 2", col
+                f"Column dropped: '{col}' is only present in file 2\n"
             )
+        elif col not in df2.columns:
+            logging.info("\u2022 Column dropped: '%s' is only present in file 1", col)
             columns_dropped_message += (
-                f"Comparison dropped: '{col}' is only present in file 2\n"
+                f"Column dropped: '{col}' is only present in file 1\n"
             )
     return columns_dropped_message
 
@@ -93,63 +88,6 @@ def load_tsv_files(input_file1, input_file2):
     return df1, df2
 
 
-def make_rows_equal(df1, df2):
-    """
-    Purpose:    Add 'dummy data' to make the two dataframes have an equal number of rows
-    Modifies:   One of the two dataframes depending on which is smaller
-    Returns:    Two dataframes
-    """
-    num_rows_to_add = abs(df1.shape[0] - df2.shape[0])
-    if df1.shape[0] > df2.shape[0]:
-        dummy_data = pd.DataFrame(
-            np.nan, index=range(num_rows_to_add), columns=df2.columns
-        )
-        df2 = pd.concat([df2, dummy_data], ignore_index=True)
-    else:
-        dummy_data = pd.DataFrame(
-            np.nan, index=range(num_rows_to_add), columns=df1.columns
-        )
-        df1 = pd.concat([df1, dummy_data], ignore_index=True)
-    return df1, df2
-
-
-def drop_useless_columns(df1, df2, columns_to_compare):
-    """
-    Purpose:    First removes columns that are not included in the comparison, excluding 'ID', then removes columns not present
-                in both files
-    Modifies:   df1 and df2
-    Returns:    Two lists containing the columns dropped in the corresponding dataframes
-    """
-    columns_to_keep = set(["ID"])
-    if "ID" not in df1.columns or "ID" not in df2.columns:
-        columns_to_keep.update(["Gene", "AA Change"])
-
-    # Drop columns that are not in columns_to_compare and not 'ID'
-    cols1_to_drop = [
-        col
-        for col in df1.columns
-        if (col not in columns_to_compare) and (col not in columns_to_keep)
-    ]
-    cols2_to_drop = [
-        col
-        for col in df2.columns
-        if (col not in columns_to_compare) and (col not in columns_to_keep)
-    ]
-
-    df1.drop(columns=cols1_to_drop, inplace=True)
-    df2.drop(columns=cols2_to_drop, inplace=True)
-
-    # Drop columns that are not present in both dataframes
-    common_cols = set(df1.columns).intersection(set(df2.columns))
-    cols1_to_drop = [col for col in df1.columns if col not in common_cols]
-    cols2_to_drop = [col for col in df2.columns if col not in common_cols]
-
-    df1.drop(columns=cols1_to_drop, inplace=True)
-    df2.drop(columns=cols2_to_drop, inplace=True)
-
-    return cols1_to_drop, cols2_to_drop
-
-
 def check_columns_to_compare(df1, df2, columns_to_compare):
     """
     Purpose:    Add columns present in both dataframes to columns_to_keep
@@ -220,7 +158,12 @@ def get_file_differences(
     Modifies:   Nothing
     Returns:    Dictionary of differences and a dictionary of unique variants
     """
-    merged_df = pd.merge(df1, df2, on="ID", suffixes=("_file1", "_file2"))
+    df1_selected = df1[["ID"] + columns_to_compare]
+    df2_selected = df2[["ID"] + columns_to_compare]
+
+    merged_df = pd.merge(
+        df1_selected, df2_selected, on="ID", suffixes=("_file1", "_file2")
+    )
 
     differences = {}
     for col in columns_to_compare:
@@ -242,9 +185,7 @@ def get_file_differences(
             )
 
             # Mask for rows where one value is NaN and the other is not
-            nan_mask = (
-                merged_df[col_file1].isna() & ~merged_df[col_file2].isna()
-            ) | (
+            nan_mask = (merged_df[col_file1].isna() & ~merged_df[col_file2].isna()) | (
                 ~merged_df[col_file1].isna() & merged_df[col_file2].isna()
             )
 

diff --git a/pvaccompare/runners/run_compare_aggregated_tsv.py b/pvaccompare/runners/run_compare_aggregated_tsv.py
@@ -14,23 +14,20 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
         input_file1, input_file2, output_file, columns_to_compare
     )
     check_column_formatting(comparer.df1, comparer.df2)
+    comparer.check_id()
 
-    cols1_to_drop, cols2_to_drop = drop_useless_columns(
+    columns_dropped_message = output_dropped_cols(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
-    columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
     comparer.columns_to_compare = check_columns_to_compare(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
-    comparer.check_id(cols1_to_drop, cols2_to_drop)
 
     common_variants = get_common_variants(comparer.df1, comparer.df2)
     unique_variants_file1, unique_variants_file2 = get_unique_variants(
         comparer.df1, comparer.df2, common_variants
     )
 
-    if comparer.df1.shape != comparer.df2.shape:
-        comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
     differences, unique_variants = get_file_differences(
         comparer.df1,
         comparer.df2,

diff --git a/pvaccompare/runners/run_compare_reference_matches_tsv.py b/pvaccompare/runners/run_compare_reference_matches_tsv.py
@@ -14,23 +14,19 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
         input_file1, input_file2, output_file, columns_to_compare
     )
     check_column_formatting(comparer.df1, comparer.df2)
-
     comparer.create_id_column()
-    common_variants = get_common_variants(comparer.df1, comparer.df2)
-    unique_variants_file1, unique_variants_file2 = get_unique_variants(
-        comparer.df1, comparer.df2, common_variants
-    )
 
-    cols1_to_drop, cols2_to_drop = drop_useless_columns(
+    columns_dropped_message = output_dropped_cols(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
-    columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
     comparer.columns_to_compare = check_columns_to_compare(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
 
-    if comparer.df1.shape != comparer.df2.shape:
-        comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
+    common_variants = get_common_variants(comparer.df1, comparer.df2)
+    unique_variants_file1, unique_variants_file2 = get_unique_variants(
+        comparer.df1, comparer.df2, common_variants
+    )
 
     if comparer.check_duplicate_ids():
         differences_summary = generate_differences_summary(

diff --git a/pvaccompare/runners/run_compare_unaggregated_tsv.py b/pvaccompare/runners/run_compare_unaggregated_tsv.py
@@ -8,31 +8,26 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
     Modifies:   Nothing
     Returns:    None
     """
-    id_format = (
-        "Chromosome-Start-Stop-Reference-Variant-HLA_Allele-Sub_peptide_Position-Mt_Epitope_Seq-Index"
-    )
+    id_format = "Chromosome-Start-Stop-Reference-Variant-HLA_Allele-Sub_peptide_Position-Mt_Epitope_Seq-Index"
 
     comparer = CompareUnaggregatedTSV(
         input_file1, input_file2, output_file, columns_to_compare
     )
     check_column_formatting(comparer.df1, comparer.df2)
-
     comparer.create_id_column()
-    common_variants = get_common_variants(comparer.df1, comparer.df2)
-    unique_variants_file1, unique_variants_file2 = get_unique_variants(
-        comparer.df1, comparer.df2, common_variants
-    )
 
-    cols1_to_drop, cols2_to_drop = drop_useless_columns(
+    columns_dropped_message = output_dropped_cols(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
-    columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
     comparer.columns_to_compare = check_columns_to_compare(
         comparer.df1, comparer.df2, comparer.columns_to_compare
     )
 
-    if comparer.df1.shape != comparer.df2.shape:
-        comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
+    common_variants = get_common_variants(comparer.df1, comparer.df2)
+    unique_variants_file1, unique_variants_file2 = get_unique_variants(
+        comparer.df1, comparer.df2, common_variants
+    )
+
     differences, unique_variants = get_file_differences(
         comparer.df1,
         comparer.df2,

diff --git a/pvaccompare/tests/test_compare_aggregated_tsv.py b/pvaccompare/tests/test_compare_aggregated_tsv.py
@@ -6,6 +6,7 @@
 
 # To run the tests navigate to pvaccompare/ and run the following:
 # python -m unittest tests/test_compare_aggregated_tsv.py
+# python -m unittest discover -s tests
 class TestRunCompareAggregatedTSV(unittest.TestCase):
     def setUp(self):
         self.input_file1 = tempfile.NamedTemporaryFile(delete=False, suffix=".tsv")
@@ -74,3 +75,38 @@ def test_different_files(self):
         ) as expected_file:
             expected_output = expected_file.read().strip()
         self.assertEqual(sanitized_output.strip(), expected_output)
+
+    def test_missing_id(self):
+        with open("tests/test_data/aggregated_input1.tsv", "r") as f:
+            content1 = f.read()
+        with open("tests/test_data/aggregated_input3.tsv", "r") as f:
+            content2 = f.read()
+
+        self.input_file1.write(content1.encode())
+        self.input_file2.write(content2.encode())
+        self.input_file1.close()
+        self.input_file2.close()
+
+        with self.assertLogs(level="INFO") as log:
+            main(
+                self.input_file1.name,
+                self.input_file2.name,
+                self.output_file.name,
+                self.columns_to_compare,
+            )
+        self.assertIn("INFO:root:• Replaced ID with Gene and AA Change", log.output)
+
+        self.output_file.seek(0)
+        output_content = self.output_file.read().decode()
+        sanitized_output = "\n".join(
+            [
+                line
+                for line in output_content.splitlines()
+                if not line.startswith("File 1:") and not line.startswith("File 2:")
+            ]
+        )
+        with open(
+            "tests/test_data/aggregated_id_change_output.tsv", "r"
+        ) as expected_file:
+            expected_output = expected_file.read().strip()
+        self.assertEqual(sanitized_output.strip(), expected_output)
diff --git a/pvaccompare/tests/test_compare_json.py b/pvaccompare/tests/test_compare_json.py
@@ -6,6 +6,7 @@
 
 # To run the tests navigate to pvaccompare/ and run the following:
 # python -m unittest tests/test_compare_json.py
+# python -m unittest discover -s tests
 class TestRunCompareJSON(unittest.TestCase):
     def setUp(self):
         self.input_file1 = tempfile.NamedTemporaryFile(delete=False, suffix=".json")