Skip to content

Commit

Permalink
removed dropping extra rows/cols, added more test coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
ldhtnp committed Oct 9, 2024
1 parent cc99682 commit 2a4fabf
Show file tree
Hide file tree
Showing 17 changed files with 224 additions and 119 deletions.
8 changes: 6 additions & 2 deletions pvaccompare/compare_tools/comparison_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ def find_file(results_folder, subfolder, pattern):
return files[0] if files else None


def write_header(output_file, aggregated_columns, unaggregated_columns, reference_match_columns):
def write_header(
output_file, aggregated_columns, unaggregated_columns, reference_match_columns
):
"""
Purpose: Writes the report generation date and time to the top of the output file
Modifies: Nothing
Expand Down Expand Up @@ -44,7 +46,9 @@ def run_comparison(
Returns: None
"""
output_file = output_file + "_" + prefix.replace("/", "_") + ".tsv"
write_header(output_file, aggregated_columns, unaggregated_columns, reference_match_columns)
write_header(
output_file, aggregated_columns, unaggregated_columns, reference_match_columns
)

if "pVACseq" not in prefix:
yml1_path = find_file(results_folder1, prefix + "/log", "inputs.yml")
Expand Down
9 changes: 4 additions & 5 deletions pvaccompare/comparisons/compare_aggregated_tsv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from run_utils import *
import logging


class CompareAggregatedTSV:
Expand All @@ -12,23 +13,21 @@ def __init__(self, input_file1, input_file2, output_file, columns_to_compare):
self.ID_replacement_cols = ["Gene", "AA Change"]
self.columns_to_compare = columns_to_compare

def check_id(self, cols1_to_drop, cols2_to_drop):
def check_id(self):
"""
Purpose: Replace ID with Gene-AA_change if needed
Modifies: self.contains_id, self.replaced_id
Returns: None
"""
if "ID" in cols1_to_drop or "ID" in cols2_to_drop:
if "ID" not in self.df1.columns or "ID" not in self.df2.columns:
self.contains_id = False

if not self.contains_id:
can_replace = True
for col in self.ID_replacement_cols:
if col not in self.df1.columns or col not in self.df2.columns:
can_replace = False
if can_replace:
self.combine_gene_and_AA_change()
print("\u2022", "Replaced ID with Gene and AA Change")
logging.info("\u2022 Replaced ID with Gene and AA Change")
self.replaced_id = True

def combine_gene_and_AA_change(self):
Expand Down
7 changes: 4 additions & 3 deletions pvaccompare/comparisons/compare_reference_matches_tsv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from run_utils import *
import logging


class CompareReferenceMatchesTSV:
Expand Down Expand Up @@ -53,15 +54,15 @@ def check_duplicate_ids(self):

if max_hits_file1 > 1 or max_hits_file2 > 1:
if max_hits_file1 > 1 and max_hits_file2 > 1:
print(
logging.error(
"ERROR: Duplicate unique records were found in both files. Writing number of hits only."
)
elif max_hits_file1 > 1:
print(
logging.error(
"ERROR: Duplicate unique records were found in file 1. Writing number of hits only."
)
else:
print(
logging.error(
"ERROR: Duplicate unique records were found in file 2. Writing number of hits only."
)
return True
Expand Down
2 changes: 1 addition & 1 deletion pvaccompare/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
logging.basicConfig(level=logging.DEBUG, format="%(message)s")

# TODO: Speed up identical file comparison
# TODO: Implement tests
# TODO: Add line numbers to output differences


def define_parser():
Expand Down
95 changes: 18 additions & 77 deletions pvaccompare/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,35 +37,30 @@ def check_column_formatting(df1, df2):
break


def output_dropped_cols(cols1_to_drop, cols2_to_drop):
def output_dropped_cols(df1, df2, original_columns):
"""
Purpose: Outputs the dropped comparison columns to the terminal and creates a columns dropped message for the generated report
Modifies: Nothing
Returns: String columns_dropped_message
"""
columns_dropped_message = ""
for col in cols1_to_drop:
if col in cols2_to_drop:
for col in original_columns:
if col not in df1.columns and col not in df2.columns:
logging.info(
"\u2022 Comparison dropped: '%s' is not present in either file", col
"\u2022 Column dropped: '%s' is not present in either file", col
)
columns_dropped_message += (
f"Comparison dropped: '{col}' is not present in either file\n"
)
else:
logging.info(
"\u2022 Comparison dropped: '%s' is only present in file 1", col
f"Column dropped: '{col}' is not present in either file\n"
)
elif col not in df1.columns:
logging.info("\u2022 Column dropped: '%s' is only present in file 2", col)
columns_dropped_message += (
f"Comparison dropped: '{col}' is only present in file 1\n"
)
for col in cols2_to_drop:
if col not in cols1_to_drop:
logging.info(
"\u2022 Comparison dropped: '%s' is only present in file 2", col
f"Column dropped: '{col}' is only present in file 2\n"
)
elif col not in df2.columns:
logging.info("\u2022 Column dropped: '%s' is only present in file 1", col)
columns_dropped_message += (
f"Comparison dropped: '{col}' is only present in file 2\n"
f"Column dropped: '{col}' is only present in file 1\n"
)
return columns_dropped_message

Expand Down Expand Up @@ -93,63 +88,6 @@ def load_tsv_files(input_file1, input_file2):
return df1, df2


def make_rows_equal(df1, df2):
    """
    Purpose:  Pad the shorter of the two dataframes with all-NaN rows so that both
              end up with the same number of rows
    Modifies: Nothing (the inputs are never mutated; a padded copy is returned)
    Returns:  Two dataframes (df1, df2) with an equal number of rows
    """
    rows1, rows2 = len(df1), len(df2)

    # Already the same length: return the inputs untouched rather than
    # concatenating an empty frame (which copied df1 and reset its index).
    if rows1 == rows2:
        return df1, df2

    target_rows = max(rows1, rows2)

    # reset_index gives the shorter frame labels 0..k-1, so reindexing to
    # 0..target_rows-1 keeps the existing rows first and appends NaN rows.
    # This avoids concatenating an all-NaN frame, which recent pandas
    # versions deprecate for dtype-inference reasons.
    if rows1 < rows2:
        df1 = df1.reset_index(drop=True).reindex(range(target_rows))
    else:
        df2 = df2.reset_index(drop=True).reindex(range(target_rows))
    return df1, df2


def drop_useless_columns(df1, df2, columns_to_compare):
    """
    Purpose:  First removes columns that are not included in the comparison (always
              keeping 'ID', or 'Gene' and 'AA Change' as well when 'ID' is missing from
              either file), then removes columns that are not present in both files
    Modifies: df1 and df2 (columns are dropped in place)
    Returns:  Two lists containing the columns dropped from df1 and from df2 in the
              second step only, i.e. kept columns that the other file does not have
    """
    # Identifier columns must survive the first pass even if they were not
    # requested for comparison.
    protected = {"ID"}
    if "ID" not in df1.columns or "ID" not in df2.columns:
        protected.update(("Gene", "AA Change"))
    wanted = protected.union(columns_to_compare)

    # Pass 1: discard everything that is neither requested nor protected.
    for df in (df1, df2):
        unwanted = [col for col in df.columns if col not in wanted]
        df.drop(columns=unwanted, inplace=True)

    # Pass 2: discard what remains in only one of the two files. These are the
    # columns reported back to the caller.
    shared = set(df1.columns) & set(df2.columns)
    cols1_to_drop = [col for col in df1.columns if col not in shared]
    cols2_to_drop = [col for col in df2.columns if col not in shared]

    df1.drop(columns=cols1_to_drop, inplace=True)
    df2.drop(columns=cols2_to_drop, inplace=True)

    return cols1_to_drop, cols2_to_drop


def check_columns_to_compare(df1, df2, columns_to_compare):
"""
Purpose: Add columns present in both dataframes to columns_to_keep
Expand Down Expand Up @@ -220,7 +158,12 @@ def get_file_differences(
Modifies: Nothing
Returns: Dictionary of differences and a dictionary of unique variants
"""
merged_df = pd.merge(df1, df2, on="ID", suffixes=("_file1", "_file2"))
df1_selected = df1[["ID"] + columns_to_compare]
df2_selected = df2[["ID"] + columns_to_compare]

merged_df = pd.merge(
df1_selected, df2_selected, on="ID", suffixes=("_file1", "_file2")
)

differences = {}
for col in columns_to_compare:
Expand All @@ -242,9 +185,7 @@ def get_file_differences(
)

# Mask for rows where one value is NaN and the other is not
nan_mask = (
merged_df[col_file1].isna() & ~merged_df[col_file2].isna()
) | (
nan_mask = (merged_df[col_file1].isna() & ~merged_df[col_file2].isna()) | (
~merged_df[col_file1].isna() & merged_df[col_file2].isna()
)

Expand Down
7 changes: 2 additions & 5 deletions pvaccompare/runners/run_compare_aggregated_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,20 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
input_file1, input_file2, output_file, columns_to_compare
)
check_column_formatting(comparer.df1, comparer.df2)
comparer.check_id()

cols1_to_drop, cols2_to_drop = drop_useless_columns(
columns_dropped_message = output_dropped_cols(
comparer.df1, comparer.df2, comparer.columns_to_compare
)
columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
comparer.columns_to_compare = check_columns_to_compare(
comparer.df1, comparer.df2, comparer.columns_to_compare
)
comparer.check_id(cols1_to_drop, cols2_to_drop)

common_variants = get_common_variants(comparer.df1, comparer.df2)
unique_variants_file1, unique_variants_file2 = get_unique_variants(
comparer.df1, comparer.df2, common_variants
)

if comparer.df1.shape != comparer.df2.shape:
comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
differences, unique_variants = get_file_differences(
comparer.df1,
comparer.df2,
Expand Down
14 changes: 5 additions & 9 deletions pvaccompare/runners/run_compare_reference_matches_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,19 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
input_file1, input_file2, output_file, columns_to_compare
)
check_column_formatting(comparer.df1, comparer.df2)

comparer.create_id_column()
common_variants = get_common_variants(comparer.df1, comparer.df2)
unique_variants_file1, unique_variants_file2 = get_unique_variants(
comparer.df1, comparer.df2, common_variants
)

cols1_to_drop, cols2_to_drop = drop_useless_columns(
columns_dropped_message = output_dropped_cols(
comparer.df1, comparer.df2, comparer.columns_to_compare
)
columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
comparer.columns_to_compare = check_columns_to_compare(
comparer.df1, comparer.df2, comparer.columns_to_compare
)

if comparer.df1.shape != comparer.df2.shape:
comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
common_variants = get_common_variants(comparer.df1, comparer.df2)
unique_variants_file1, unique_variants_file2 = get_unique_variants(
comparer.df1, comparer.df2, common_variants
)

if comparer.check_duplicate_ids():
differences_summary = generate_differences_summary(
Expand Down
19 changes: 7 additions & 12 deletions pvaccompare/runners/run_compare_unaggregated_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,26 @@ def main(input_file1, input_file2, output_file, columns_to_compare):
Modifies: Nothing
Returns: None
"""
id_format = (
"Chromosome-Start-Stop-Reference-Variant-HLA_Allele-Sub_peptide_Position-Mt_Epitope_Seq-Index"
)
id_format = "Chromosome-Start-Stop-Reference-Variant-HLA_Allele-Sub_peptide_Position-Mt_Epitope_Seq-Index"

comparer = CompareUnaggregatedTSV(
input_file1, input_file2, output_file, columns_to_compare
)
check_column_formatting(comparer.df1, comparer.df2)

comparer.create_id_column()
common_variants = get_common_variants(comparer.df1, comparer.df2)
unique_variants_file1, unique_variants_file2 = get_unique_variants(
comparer.df1, comparer.df2, common_variants
)

cols1_to_drop, cols2_to_drop = drop_useless_columns(
columns_dropped_message = output_dropped_cols(
comparer.df1, comparer.df2, comparer.columns_to_compare
)
columns_dropped_message = output_dropped_cols(cols1_to_drop, cols2_to_drop)
comparer.columns_to_compare = check_columns_to_compare(
comparer.df1, comparer.df2, comparer.columns_to_compare
)

if comparer.df1.shape != comparer.df2.shape:
comparer.df1, comparer.df2 = make_rows_equal(comparer.df1, comparer.df2)
common_variants = get_common_variants(comparer.df1, comparer.df2)
unique_variants_file1, unique_variants_file2 = get_unique_variants(
comparer.df1, comparer.df2, common_variants
)

differences, unique_variants = get_file_differences(
comparer.df1,
comparer.df2,
Expand Down
36 changes: 36 additions & 0 deletions pvaccompare/tests/test_compare_aggregated_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

# To run the tests navigate to pvaccompare/ and run the following:
# python -m unittest tests/test_compare_aggregated_tsv.py
# python -m unittest discover -s tests
class TestRunCompareAggregatedTSV(unittest.TestCase):
def setUp(self):
self.input_file1 = tempfile.NamedTemporaryFile(delete=False, suffix=".tsv")
Expand Down Expand Up @@ -74,3 +75,38 @@ def test_different_files(self):
) as expected_file:
expected_output = expected_file.read().strip()
self.assertEqual(sanitized_output.strip(), expected_output)

def test_missing_id(self):
    """
    Runs the comparison on the aggregated_input1 / aggregated_input3 fixtures and
    checks that the 'Replaced ID with Gene and AA Change' message is logged and that
    the report (minus the 'File 1:' / 'File 2:' lines) matches the stored expected output.
    """
    fixture_text = []
    for fixture_path in (
        "tests/test_data/aggregated_input1.tsv",
        "tests/test_data/aggregated_input3.tsv",
    ):
        with open(fixture_path, "r") as fixture:
            fixture_text.append(fixture.read())

    # Copy the fixtures into the temporary input files created for this test case
    self.input_file1.write(fixture_text[0].encode())
    self.input_file2.write(fixture_text[1].encode())
    self.input_file1.close()
    self.input_file2.close()

    with self.assertLogs(level="INFO") as log:
        main(
            self.input_file1.name,
            self.input_file2.name,
            self.output_file.name,
            self.columns_to_compare,
        )
        self.assertIn("INFO:root:• Replaced ID with Gene and AA Change", log.output)

    self.output_file.seek(0)
    report_lines = self.output_file.read().decode().splitlines()
    # The 'File 1:' / 'File 2:' lines are filtered out before comparing
    # (presumably they carry the temp-file paths, which vary per run)
    sanitized_output = "\n".join(
        line for line in report_lines if not line.startswith(("File 1:", "File 2:"))
    )

    with open("tests/test_data/aggregated_id_change_output.tsv", "r") as expected_file:
        expected_output = expected_file.read().strip()
    self.assertEqual(sanitized_output.strip(), expected_output)
1 change: 1 addition & 0 deletions pvaccompare/tests/test_compare_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

# To run the tests navigate to pvaccompare/ and run the following:
# python -m unittest tests/test_compare_json.py
# python -m unittest discover -s tests
class TestRunCompareJSON(unittest.TestCase):
def setUp(self):
self.input_file1 = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
Expand Down
Loading

0 comments on commit 2a4fabf

Please sign in to comment.