From 7d86b03f4c2767282c3fd914b87964f94490d930 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 7 Feb 2025 11:30:58 -0800 Subject: [PATCH] ingest/ncbi: Replace "invalid" characters from `strain` Replace what iqtree considers "invalid" characters with "_" in `strain` so that augur tree/iqtree does not change the strain name in the phylogenetic workflow and cause an error in augur refine. Similar to the changes made for the curate-andersen-lab-data script. --- ingest/build-configs/ncbi/bin/transform-to-match-fauna | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ingest/build-configs/ncbi/bin/transform-to-match-fauna b/ingest/build-configs/ncbi/bin/transform-to-match-fauna index 1fcec0f..12b163b 100755 --- a/ingest/build-configs/ncbi/bin/transform-to-match-fauna +++ b/ingest/build-configs/ncbi/bin/transform-to-match-fauna @@ -4,6 +4,7 @@ Transforms to specific fields in the NDJSON record to match the output metdata from fauna for easier downstream use in the phylogenetic workflow """ import json +import re from sys import stdin, stdout @@ -31,7 +32,12 @@ if __name__ == "__main__": # Keep a copy of the original strain name since we are editing it below record["original_strain"] = record["strain"] # Remove spaces from strain names since they are not allowed in our phylo workflow. - record["strain"] = record["original_strain"].replace(" ", "") + # Replace invalid characters with `_` to match iqtree so augur tree will not modify strain + # + # Similar to the changes made for the curate-andersen-lab-data script in + # . + strain = record["original_strain"].replace(" ", "") + record["strain"] = re.sub(r'[^\w\_\-\.\|\/]', '_', strain) json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') print()