diff --git a/ingest/bin/post_process_metadata.py b/ingest/bin/fix-zika-strain-names.py similarity index 85% rename from ingest/bin/post_process_metadata.py rename to ingest/bin/fix-zika-strain-names.py index 3c587e5..6c46bf5 100755 --- a/ingest/bin/post_process_metadata.py +++ b/ingest/bin/fix-zika-strain-names.py @@ -8,10 +8,10 @@ def parse_args(): parser = argparse.ArgumentParser( - description="Reformat a NCBI Virus metadata.tsv file for a pathogen build." + description="Modify zika strain names by referencing historical modifications from the fauna repo." ) - parser.add_argument("--accession-field", default='accession', - help="Field from the records to use as the sequence ID in the FASTA file.") + parser.add_argument("--strain-field", default='strain', + help="Field from the records to use as the strain name to be fixed.") return parser.parse_args() @@ -48,8 +48,7 @@ def main(): for index, record in enumerate(stdin): record = json.loads(record) - record["strain"] = _set_strain_name(record) - record["authors"] = record["abbr_authors"] + record[args.strain_field] = _set_strain_name(record) stdout.write(json.dumps(record) + "\n") diff --git a/ingest/rules/transform.smk b/ingest/rules/transform.smk index a0891e5..cc4e917 100644 --- a/ingest/rules/transform.smk +++ b/ingest/rules/transform.smk @@ -85,8 +85,7 @@ rule transform: --abbr-authors-field {params.abbr_authors_field} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./bin/post_process_metadata.py \ - --accession-field {params.id_field} \ + | ./bin/fix-zika-strain-names.py \ | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \