Skip to content

Commit

Permalink
Add a test that scans the whole data frame for MONDO:MONDO bananas
Browse files (browse the repository at this point in the history)
  • Loading branch information
matentzn committed Nov 22, 2024
1 parent 1103f92 commit cd99892
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions src/scripts/post_process_externally_managed_content.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import click
import os
import re
import pandas as pd
import logging

Expand Down Expand Up @@ -95,6 +96,23 @@ def _remove_erroneous_values_from_externally_managed_content(external_content_fi
report.append(error_report)
df_external_content.at[index, erroneous_column] = ""

# Additional checks on the pandas dataframe that are not covered by SPARQL

# BANANA ERROR: Scan every cell of the external content for values that start with the doubled prefix 'MONDO:MONDO:'; rows containing such a value are reported and dropped
pattern = r"^MONDO:MONDO:.*$"
result = df_external_content.applymap(lambda x: bool(re.match(pattern, str(x))))
rows_to_drop = result.any(axis=1).index[result.any(axis=1)].tolist()
for row in rows_to_drop:
error_report = df_external_content.loc[row].to_dict()
error_report['Source'] = source
rule = "MONDO:MONDO_pattern"
property = "IRI"
error_report['Check'] = f"{rule} ({property})"
report.append(error_report)
df_external_content.drop(index=rows_to_drop, inplace=True)

# X ERROR: TBD

df_external_content.to_csv(external_content_file_out, sep="\t", index=False)
_write_nice_report(report, source)

Expand Down

0 comments on commit cd99892

Please sign in to comment.