From 2420273905f53d7b1c08464cb354f145c0fc65ac Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 18 Feb 2025 11:23:32 -0800 Subject: [PATCH] curate-andersen-lab-data: Use existing strain name if available Noticed in debugging that certain records already have the strain name in the `isolate` field in the raw data. Use the existing strain name if it's available, otherwise fall back to constructing a strain name from the metadata. --- .../ncbi/bin/curate-andersen-lab-data | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index cd499e9..135e175 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -108,8 +108,9 @@ def parse_center_name(center_name: str) -> str: def construct_strain_name(record: dict, sample_id: str) -> str: """ - Construct a strain name for the *sample_id* using metadata from the *record* - to include host, country, and year. + Check if the *sample_id* follows a strain name pattern, otherwise + construct a strain name for the *sample_id* using metadata from the + *record* to include host, country, and year. Removes all spaces in the constructed strain name because they are not allowed in the downstream phylogenetic workflow. 
Also replaces invalid @@ -119,10 +120,16 @@ def construct_strain_name(record: dict, sample_id: str) -> str: ¹ ² """ - host = record['host'] - country = record['country'] - year = str(parse_date(record['date']).year) - strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") + # A/<host>/<country>/<sample_id>/<year> + strain_pattern = r"(A\/.+\/.+\/.+\/[\d]{4})" + matches = re.search(strain_pattern, sample_id) + if matches: + strain = matches.group(1) + else: + host = record['host'] + country = record['country'] + year = str(parse_date(record['date']).year) + strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)