[ingest] improve GenoFLU parsing

The previous csvtk approach would drop "Not assigned" records because they didn't have results for all 8 segments. We now report the genome result as "Not assigned (too divergent)" and report the segment results where available.
nextstrain · Feb 23, 2025 · 682df60 · 682df60
1 parent 956dd12
commit 682df60
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 5 deletions.
diff --git a/ingest/rules/genoflu.smk b/ingest/rules/genoflu.smk
@@ -58,11 +58,8 @@ rule parse_genoflu:
         r"""
         cat {input.genoflu} | \
             csvtk cut -t -F -f Strain,Genotype,'Genotype List Used*' | \
-            csvtk grep -t -F -f 'Genotype List Used*' -r -p "^PA:.+HA:.+PB1:.+MP:.+NA:.+PB2:.+NP:.+NS:.+$" -N | \
-            csvtk sep -t -n genoflu_PA,genoflu_HA,genoflu_PB1,genoflu_MP,genoflu_NA,genoflu_PB2,genoflu_NP,genoflu_NS -f 3 -s ", "  | \
-            csvtk replace -t -f 4-11 -p "^(.+):" | \
-            csvtk cut -t -f 1,2,4-11 | \
-            csvtk rename -t -f Strain,Genotype -n strain,genoflu \
+            csvtk rename -t -F -f Strain,Genotype,'Genotype List Used*' -n strain,genoflu,details | \
+            python scripts/parse_genoflu.py \
             > {output.genotypes}
         """
 

diff --git a/ingest/scripts/parse_genoflu.py b/ingest/scripts/parse_genoflu.py
@@ -0,0 +1,30 @@
+"""
+Takes a GenoFLU results TSV (already cut down and renamed to the three columns below) on STDIN and writes a TSV to STDOUT.
+The input TSV is expected to have three fields:
+* strain
+* genoflu
+* details
+The output TSV exports 10 fields:
+* strain
+* genoflu
+* genoflu_<SEGMENT> (8 fields)
+"""
+
+
+from augur.io.metadata import read_table_to_dict
+from sys import stdin, stdout
+from csv import DictWriter
+
+if __name__ == "__main__":
+    SEGMENTS = ["PB2", "PB1", "PA", "HA", "NP", "NA", "MP", "NS"]
+    HEADER = ['strain', 'genoflu', *[f"genoflu_{s}" for s in SEGMENTS]]
+    tsv_writer = DictWriter(stdout,HEADER,extrasaction='ignore',delimiter='\t',lineterminator='\n')
+    tsv_writer.writeheader()
+    for record in read_table_to_dict(stdin.buffer, ["\t"]):
+        if record['details']:
+            for segment_name, lineage in (parts.split(':') for parts in record['details'].split(", ")):
+                record[f"genoflu_{segment_name}"] = lineage
+        # Collapse all "not assigned" calls into a single metadata value
+        if record['genoflu'].startswith("Not assigned:"):
+            record['genoflu'] = "Not assigned (too divergent)"
+        tsv_writer.writerow(record)