From b734c786d7cb030aa22912cac0347876174b4d0c Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 18 Jan 2024 07:50:22 -0800 Subject: [PATCH] Refactor post-processing script to be specific to zika strain name fixes This commit refactors the generic post-processing script to better align with its specific purpose in Zika ingest. The purpose of this script is to fix Zika strain names based on historical modifications from the fauna repo. In summary, this commit makes the following changes: * Rename script to fix-zika-strain-names.py to match the purpose * Add a docstring to the script * Replace the accession argument with a strain field argument, which is the field that needs to be fixed --- ...post_process_metadata.py => fix-zika-strain-names.py} | 9 ++++----- ingest/rules/transform.smk | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) rename ingest/bin/{post_process_metadata.py => fix-zika-strain-names.py} (85%) diff --git a/ingest/bin/post_process_metadata.py b/ingest/bin/fix-zika-strain-names.py similarity index 85% rename from ingest/bin/post_process_metadata.py rename to ingest/bin/fix-zika-strain-names.py index 3c587e5..6c46bf5 100755 --- a/ingest/bin/post_process_metadata.py +++ b/ingest/bin/fix-zika-strain-names.py @@ -8,10 +8,10 @@ def parse_args(): parser = argparse.ArgumentParser( - description="Reformat a NCBI Virus metadata.tsv file for a pathogen build." + description="Modify zika strain names by referencing historical modifications from the fauna repo." 
) - parser.add_argument("--accession-field", default='accession', - help="Field from the records to use as the sequence ID in the FASTA file.") + parser.add_argument("--strain-field", default='strain', + help="Field from the records to use as the strain name to be fixed.") return parser.parse_args() @@ -48,8 +48,7 @@ def main(): for index, record in enumerate(stdin): record = json.loads(record) - record["strain"] = _set_strain_name(record) - record["authors"] = record["abbr_authors"] + record[args.strain_field] = _set_strain_name(record) stdout.write(json.dumps(record) + "\n") diff --git a/ingest/rules/transform.smk b/ingest/rules/transform.smk index a0891e5..cc4e917 100644 --- a/ingest/rules/transform.smk +++ b/ingest/rules/transform.smk @@ -85,8 +85,7 @@ rule transform: --abbr-authors-field {params.abbr_authors_field} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./bin/post_process_metadata.py \ - --accession-field {params.id_field} \ + | ./bin/fix-zika-strain-names.py \ | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \