Metadata and filter for N subtype in flu (#650)
* remove unused example_data_folder

* Adjust local data directory names

* add dev commands

* fix typo

* Add n subtype metadata col to config

* Extract N subtype from flu gisaid workflow

* Reduce local dev environment data

* Use local config file for dev environment

* Adjust flu dev environment, make coverage seeding a bit more robust to sparse dev environment data

* Pass selected group filter to metadata query

* Add N subtype in flu genbank workflow

* Pass fasta input files to manifest job to automatically trigger job - also add sequence manifest re-triggering to example commands to make sure this step isn't skipped
atc3 authored Jul 28, 2024
1 parent 45a6c7c commit 73fa89e
Showing 15 changed files with 75 additions and 32 deletions.
6 changes: 2 additions & 4 deletions .dockerignore
@@ -2,10 +2,8 @@ build
 node_modules
 data
 data_genbank
-data_flu
-data_flu_small
-data_flu_genbank
-data_gisaid_flu
+data_flu_gisaid_small
+data_flu_genbank_small
 data_gisaid_rsv
 data_genbank_rsv
 dist
6 changes: 2 additions & 4 deletions .gcloudignore
@@ -21,10 +21,8 @@ data_genbank
 data_genbank_rsv
 data_az
 data_ma
-data_flu
-data_flu_small
-data_flu_genbank
-data_gisaid_flu
+data_flu_gisaid_small
+data_flu_genbank_small
 data_gisaid_rsv
 dist
 example_data_genbank
5 changes: 2 additions & 3 deletions .gitignore
@@ -136,12 +136,11 @@ example_data_genbank/*/lineage_treetime/*.pdf

 data
 data_genbank
-data_flu_genbank
 example_data_genbank/rsv/**
 example_data_genbank/flu/**
 example_data_genbank/sars2/**
-data_flu_small/**
-data_flu_genbank
+data_flu_gisaid_small/**
+data_flu_genbank_small/**

 # scratch notebooks
 workflow_main/notebooks/**
2 changes: 2 additions & 0 deletions config/config_flu_genbank.yaml
@@ -69,6 +69,8 @@ metadata_cols:
   host:
     title: "Host"
   isolation_source:
     title: "Isolation source"
+  n_subtype:
+    title: "Neuraminidase subtype"
   authors:
     title: "Authors"
   publications:
8 changes: 5 additions & 3 deletions config/config_flu_genbank_dev.yaml
@@ -7,7 +7,7 @@ virus: "flu"

 # Path to folder with downloaded and processed data
 # This path is relative to the project root
-data_folder: "data_flu_small"
+data_folder: "data_flu_genbank_small"

 # Path to folder with genome information (reference.fasta, genes.json, proteins.json)
 # This path is relative to the project root
@@ -33,10 +33,10 @@ chunk_size: 10000

 # Don't process sequences prior to this date
 # Leave empty to ignore
-start_date_cutoff:
+start_date_cutoff: 2023-01-01
 # Don't process sequences after this date
 # Leave empty to ignore
-end_date_cutoff:
+end_date_cutoff: 2023-05-01

 # Don't process sequences after X days ago
 # Leave empty to ignore
@@ -69,6 +69,8 @@ metadata_cols:
   host:
     title: "Host"
   isolation_source:
     title: "Isolation source"
+  n_subtype:
+    title: "Neuraminidase subtype"
   authors:
     title: "Authors"
   publications:
2 changes: 2 additions & 0 deletions config/config_flu_gisaid.yaml
@@ -63,6 +63,8 @@ metadata_cols:
   clade:
     title: "Clade"
   lineage:
     title: "Lineage"
+  n_subtype:
+    title: "Neuraminidase subtype"
   passage:
     title: "Passage"
   host:
11 changes: 4 additions & 7 deletions config/config_flu_gisaid_dev.yaml
@@ -7,17 +7,12 @@ virus: "flu"

 # Path to folder with downloaded and processed data
 # This path is relative to the project root
-data_folder: "data_flu_small"
+data_folder: "data_flu_gisaid_small"

 # Path to folder with genome information (reference.fasta, genes.json, proteins.json)
 # This path is relative to the project root
 static_data_folder: "static_data/flu"

-# Path to folder with data to use in development
-# This path is relative to the project root
-# Only used for database seeding in development
-example_data_folder: "data_flu_small"
-
 # Database for this virus
 postgres_db: "flu_gisaid_dev"

@@ -37,7 +32,7 @@ chunk_size: 10000
 start_date_cutoff:
 # Don't process sequences after this date
 # Leave empty to ignore
-end_date_cutoff: 2021-03-01
+end_date_cutoff: 2024-06-01

 # Don't process sequences after X days ago
 # Leave empty to ignore
@@ -68,6 +63,8 @@ metadata_cols:
   clade:
     title: "Clade"
   lineage:
     title: "Lineage"
+  n_subtype:
+    title: "Neuraminidase subtype"
   passage:
     title: "Passage"
   host:
2 changes: 1 addition & 1 deletion docker-compose.flu.genbank.yml
@@ -30,7 +30,7 @@ services:
     working_dir: /app
     volumes:
       - ./services/server:/app:cached # Mount the server python code at run-time, so that the flask development server can refresh on changes
-      - ./data_flu_genbank:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
+      - ./data_flu_genbank_small:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
       - ./src/constants:/opt/constants:cached
       - ./config:/opt/config:cached
       - ./static_data/flu:/opt/static_data:cached
6 changes: 3 additions & 3 deletions docker-compose.flu.gisaid.yml
@@ -12,7 +12,7 @@ services:
       - LOGINS=user1:pass1,user2:pass2
       - FLASK_APP=cg_server/main.py
       - FLASK_ENV=development
-      - CONFIGFILE=/opt/config/config_flu_gisaid.yaml
+      - CONFIGFILE=/opt/config/config_flu_gisaid_dev.yaml
       - CONSTANTSFILE=/opt/constants/defs.json
       - DATA_PATH=/data
      - STATIC_DATA_PATH=/opt/static_data
@@ -30,7 +30,7 @@
     working_dir: /app
     volumes:
       - ./services/server:/app:cached # Mount the server python code at run-time, so that the flask development server can refresh on changes
-      - ./data_flu_small:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
+      - ./data_flu_gisaid_small:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
       - ./src/constants:/opt/constants:cached
       - ./config:/opt/config:cached
       - ./static_data/flu:/opt/static_data:cached
@@ -57,7 +57,7 @@
       context: ./
       dockerfile: ./services/frontend/Dockerfile
     environment:
-      CONFIGFILE: /app/config/config_flu_gisaid.yaml
+      CONFIGFILE: /app/config/config_flu_gisaid_dev.yaml
     working_dir: /app
     volumes:
       - ./src:/app/src:cached # Mount the JS code at run-time, so the babel server can recompile the app on file changes
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -145,7 +145,7 @@ services:
     working_dir: /app
     volumes:
       - ./services/server:/app:cached # Mount the server python code at run-time, so that the flask development server can refresh on changes
-      - ./data_flu_small:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
+      - ./data_flu_gisaid_small:/data:cached # Mount the data at run-time (for database seeding only). Should prevent sending all the data over unnecessarily at build-time
       - ./src/constants:/opt/constants:cached
       - ./config:/opt/config:cached
       - ./static_data/flu:/opt/static_data:cached
14 changes: 12 additions & 2 deletions services/server/cg_server/db_seed/seed.py
@@ -738,6 +738,16 @@ def seed_database(conn, schema="public"):
         result_type="expand",
     )

+    # Subset start_df and end_df columns to only include
+    # feature_cols_start and feature_cols_end
+    # Just in case any weird columns sneak through
+    # Allow for missing columns - only useful for dev environments where
+    # truncated data may be missing some segments
+    start_df = start_df[
+        [col for col in feature_cols_start if col in start_df.columns]
+    ]
+    end_df = end_df[[col for col in feature_cols_end if col in end_df.columns]]
+
     df_to_sql(
         cur,
         pd.concat(
@@ -754,8 +764,8 @@ def seed_database(conn, schema="public"):
                 + grouping_cols
                 + ["reference"]
             ],
-            start_df[feature_cols_start].astype(pd.Int64Dtype()),
-            end_df[feature_cols_end].astype(pd.Int64Dtype()),
+            start_df.astype(pd.Int64Dtype()),
+            end_df.astype(pd.Int64Dtype()),
         ],
         axis=1,
     ),
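The seed.py change above guards against truncated dev data: the start/end frames are filtered to an allowlist of expected columns, tolerating absent ones, before the nullable-integer cast. A minimal standalone sketch of that pattern (column names and values here are invented for illustration):

```python
import pandas as pd

# Hypothetical stand-ins for feature_cols_start / start_df in seed.py:
# a truncated dev dataset may be missing some segment columns entirely.
feature_cols_start = ["seg1_start", "seg2_start", "seg3_start"]
start_df = pd.DataFrame({
    "seg1_start": [0, 12],
    "seg3_start": [None, 40],  # seg2_start is absent in this truncated data
})

# Keep only the expected columns that are actually present, in allowlist order
start_df = start_df[[c for c in feature_cols_start if c in start_df.columns]]

# Nullable Int64 keeps missing values as <NA> instead of forcing floats
start_df = start_df.astype(pd.Int64Dtype())

print(list(start_df.columns))   # ['seg1_start', 'seg3_start']
print(start_df.dtypes.iloc[0])  # Int64
```

Indexing with `feature_cols_start` directly would raise a `KeyError` on the missing column; the comprehension is what makes the sparse dev environment work.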
3 changes: 3 additions & 0 deletions src/stores/metadataStore.js
@@ -39,6 +39,9 @@ export class MetadataStore {
         ...rootStoreInstance.configStore.getSelectedLocations(),
         selected_metadata_fields:
           rootStoreInstance.configStore.getSelectedMetadataFields(),
+        selected_group_fields: toJS(
+          rootStoreInstance.configStore.selectedGroupFields
+        ),
       }),
     })
       .then((res) => {
15 changes: 14 additions & 1 deletion workflow_flu_genbank_ingest/scripts/clean_metadata.py
@@ -452,6 +452,8 @@ def main():
         'Yamagata', 'Yamagata-like',
         'unidentified', 'unknown'
     ]
+    # TODO: if serotypes not in the prior list, then throw out an error so we can detect it and
+    # add it to the list in the future
     """
     serotype_rename_map = {
@@ -499,8 +501,18 @@
         "Yamagata": "B-yam",
         "Victoria": "B-vic",
     }
+    df["original_serotype"] = df["serotype"]
     df["serotype"] = df["serotype"].replace(serotype_rename_map)

+    # Extract N subtype
+    df["n_subtype"] = (
+        df["original_serotype"]
+        .str.extract(r".*N(\d+)$", expand=False)
+        .fillna("Unknown")
+    )
+    df.loc[df["serotype"].isin(["B-yam", "B-vic"]), "n_subtype"] = "NA"
+    df.drop(columns=["original_serotype"], inplace=True)
+
     # Filter on serotypes
     # Only do this for alpha, leave beta alone
     valid_serotypes = ["H1N1", "H3N2", "H5NX", "H7NX", "H9NX", "B-yam", "B-vic"]
@@ -509,7 +521,7 @@

     # Segment extraction
     # Clean segments
-    df["genome_coverage"] = df["genome_coverage"].fillna('[]').apply(json.loads)
+    df["genome_coverage"] = df["genome_coverage"].fillna("[]").apply(json.loads)
     df["proteins"] = df["genome_coverage"].apply(
         lambda x: (
             [p["name"] for p in x[0]["proteins"]]
@@ -626,6 +638,7 @@ def datetime_to_date(x):
         segments=("segment", list),
         genus=("genus", "first"),
         serotype=("serotype", "first"),
+        n_subtype=("n_subtype", "first"),
         region=("region", "first"),
         country=("country", "first"),
         division=("division", "first"),
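The extraction added to clean_metadata.py hinges on a single regex: capture the digits after the final "N" in the serotype, fall back to "Unknown", and mark influenza B as "NA". A small sketch of the same idea on made-up serotype strings (not the workflow's real data):

```python
import pandas as pd

# Invented serotype values covering the three cases the workflow handles
serotypes = pd.Series(["H1N1", "H3N2", "H5N8", "H7N9", "B-yam", "H5NX"])

# r".*N(\d+)$" captures the digits after the last "N"; serotypes without a
# numeric N subtype at the end (e.g. "H5NX") yield NaN, mapped to "Unknown"
n_subtype = serotypes.str.extract(r".*N(\d+)$", expand=False).fillna("Unknown")

# Influenza B lineages have no N subtype, so they are marked "NA" explicitly
n_subtype[serotypes.isin(["B-yam", "B-vic"])] = "NA"

print(n_subtype.tolist())  # ['1', '2', '8', '9', 'NA', 'Unknown']
```

Note that `str.extract` returns strings, so the resulting column mixes digit strings with the "NA"/"Unknown" sentinels, matching the metadata-column treatment in the config files.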
15 changes: 14 additions & 1 deletion workflow_flu_gisaid_ingest/scripts/clean_metadata.py
@@ -233,6 +233,15 @@ def clean_df(df):
     df.loc[df["serotype"].str.startswith("H9"), "serotype"] = "H9NX"
     df.loc[df["serotype"].str.startswith("H10"), "serotype"] = "H10NX"

+    # Extract N subtype
+    df["n_subtype"] = "NA"
+    df.loc[~b_serotype, "n_subtype"] = (
+        df.loc[~b_serotype, "original_serotype"]
+        .str.extract(r".*N(\d+)$", expand=False)
+        .fillna("Unknown")
+    )
+    df.loc[b_serotype, "n_subtype"] = "NA"
+
     # Remove rows without segments
     df = df.loc[df["segments"].apply(len) > 0, :]

@@ -316,6 +325,7 @@ def clean_df(df):
             "isolate_id",
             "virus_name",
             "serotype",
+            "n_subtype",
             "lineage",
             "clade",
             "passage",
@@ -402,7 +412,10 @@ def main():

     # Expand by Accession ID
     df = dfs.explode(["accession_ids", "segments"]).rename(
-        columns={"accession_ids": "Accession ID", "segments": "segment",}
+        columns={
+            "accession_ids": "Accession ID",
+            "segments": "segment",
+        }
     )
     df.to_csv(args.metadata_out)
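The reformatted rename call above sits on top of a multi-column explode: the GISAID workflow stores parallel lists of accession IDs and segments per isolate, then expands them to one row per segment. A toy sketch of how `DataFrame.explode` keeps parallel list columns aligned (isolate IDs, accessions, and segment numbers are invented; multi-column explode needs pandas >= 1.3):

```python
import pandas as pd

# Toy stand-in for the per-isolate frame: column names mirror the script,
# data is invented for illustration
dfs = pd.DataFrame({
    "isolate_id": ["EPI_ISL_1", "EPI_ISL_2"],
    "accession_ids": [["A1", "A2"], ["B1"]],
    "segments": [[4, 6], [4]],
})

# Exploding both list columns in one call keeps them aligned row-by-row;
# two separate explode() calls would produce a cross product instead
df = dfs.explode(["accession_ids", "segments"]).rename(
    columns={"accession_ids": "Accession ID", "segments": "segment"}
)

print(df["Accession ID"].tolist())  # ['A1', 'A2', 'B1']
print(df["segment"].tolist())       # [4, 6, 4]
```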
10 changes: 8 additions & 2 deletions workflow_main/Snakefile
@@ -7,7 +7,9 @@ $ snakemake --configfile ../config/config_sars2_gisaid_6month.yaml -j6 -R sequence_manifest
 $ snakemake --configfile ../config/config_sars2_genbank_dev.yaml -j6
 $ snakemake --configfile ../config/config_rsv_genbank.yaml -j6
 $ snakemake --configfile ../config/config_flu_genbank.yaml -j6
+$ snakemake --configfile ../config/config_flu_genbank_dev.yaml -j6 -R sequence_manifest
 $ snakemake --configfile ../config/config_flu_gisaid.yaml -j6
+$ snakemake --configfile ../config/config_flu_gisaid_dev.yaml -j6 -R sequence_manifest

 This DAG building gets kind of complicated...
 First, sequences are split up by submission date and subtype (this is
@@ -237,7 +239,11 @@ rule sequence_manifest:
     input:
         # Include metadata so this triggers every run
         metadata = os.path.join(data_folder, "metadata.csv"),
-        reference = rules.write_reference_files.output.reference
+        reference = rules.write_reference_files.output.reference,
+        fasta = expand(
+            os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz"),
+            chunk=CHUNKS
+        ),
     output:
         manifest = os.path.join(data_folder, "sequence_manifest.csv")
     params:
@@ -250,7 +256,7 @@
             --reference {input.reference} \
             --fasta {params.processed_fasta_files} \
             --out {output.manifest} \
-            {params.start_date_cutoff}
+            {params.start_date_cutoff} \
             {params.end_date_cutoff}
         """
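The new `fasta` input relies on Snakemake's `expand()` to enumerate every chunk file, so any updated chunk re-triggers the manifest rule. Conceptually, `expand()` substitutes each wildcard value into the pattern; a simplified Python equivalent (the folder name and chunk labels are hypothetical, and real `expand()` supports multiple wildcards and format options):

```python
import os

def expand(pattern, **wildcards):
    """Simplified stand-in for Snakemake's expand(): substitute every
    combination of wildcard values into the pattern."""
    results = [pattern]
    for key, values in wildcards.items():
        results = [
            p.replace("{" + key + "}", str(v)) for p in results for v in values
        ]
    return results

# Hypothetical dev-environment values for data_folder and CHUNKS
data_folder = "data_flu_gisaid_small"
CHUNKS = ["chunk_0", "chunk_1"]

files = expand(os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz"), chunk=CHUNKS)
print(files)
```

Listing the files as `input:` rather than only in `params:` is what lets Snakemake's mtime comparison decide when the manifest is stale.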
