From 2420273905f53d7b1c08464cb354f145c0fc65ac Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 18 Feb 2025 11:23:32 -0800
Subject: [PATCH] curate-andersen-lab-data: Use existing strain name if
 available

Noticed in debugging that certain records already have the strain
name in the `isolate` field in the raw data. Use the existing strain
name if it's available, otherwise fall back to constructing a strain
name from the metadata.
---
 .../ncbi/bin/curate-andersen-lab-data         | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data
index cd499e9..135e175 100755
--- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data
+++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data
@@ -108,8 +108,9 @@ def parse_center_name(center_name: str) -> str:
 
 def construct_strain_name(record: dict, sample_id: str) -> str:
     """
-    Construct a strain name for the *sample_id* using metadata from the *record*
-    to include host, country, and year.
+    Check if the *sample_id* already follows a strain name pattern, otherwise
+    construct a strain name for the *sample_id* using metadata from the
+    *record* to include host, country, and year.
 
     Removes all spaces in the constructed strain name because they are not
     allowed in the downstream phylogenetic workflow. Also replaces invalid
@@ -119,10 +120,16 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
     ¹ <https://github.com/iqtree/iqtree2/blob/74da454bbd98d6ecb8cb955975a50de59785fbde/utils/tools.cpp#L607>
     ² <https://github.com/nextstrain/avian-flu/issues/113>
     """
-    host = record['host']
-    country = record['country']
-    year = str(parse_date(record['date']).year)
-    strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
+    # A/<host>/<location>/<sample_id>/<year>
+    strain_pattern = r"(A\/.+\/.+\/.+\/[\d]{4})"
+    matches = re.search(strain_pattern, sample_id)
+    if matches:
+        strain = matches.group(1)
+    else:
+        host = record['host']
+        country = record['country']
+        year = str(parse_date(record['date']).year)
+        strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
     return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)