PANGO annotations for SARS2 GISAID workflow (#631)
* Add PANGO assignment to GISAID workflow

* Add legacy lineage col to metadata

* add missing input func

* Add missing repo

* Fix fasta_raw folder missing on first run

* Add SARS2 GISAID workflow example to main Snakefile

* Multithread PANGO for SARS2 GenBank workflow

* Fix typos -- missing xz adjustment from previous PRs

* Fix lineage DAG generation for SARS2 GISAID workflow

* Add GISAID lineage as another group col

* Move fake input to combine_lineages to avoid retriggering job for each individual lineage assignment job

* Bump version to v2.7.5-pango (temp)

* Auto-update nextstrain exclusion list
atc3 authored Oct 31, 2023
1 parent 6fe2047 commit f039ad2
Showing 12 changed files with 144 additions and 40 deletions.
9 changes: 8 additions & 1 deletion config/config_sars2_gisaid.yaml
@@ -76,12 +76,19 @@ metadata_cols:
group_cols:
  lineage:
    name: "lineage"
    title: "Lineage"
    title: "PANGO Lineage"
    description: ""
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
    show_collapse_options: true
  gisaid_lineage:
    name: "gisaid_lineage"
    title: "PANGO Lineage (GISAID)"
    description: "PANGO assignments from GISAID"
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
  clade:
    name: "clade"
    title: "Clade"
9 changes: 8 additions & 1 deletion config/config_sars2_gisaid_private.yaml
@@ -76,12 +76,19 @@ metadata_cols:
group_cols:
  lineage:
    name: "lineage"
    title: "Lineage"
    title: "PANGO Lineage"
    description: ""
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
    show_collapse_options: true
  gisaid_lineage:
    name: "gisaid_lineage"
    title: "PANGO Lineage (GISAID)"
    description: "PANGO assignments from GISAID"
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
  clade:
    name: "clade"
    title: "Clade"
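
Both GISAID configs now define two lineage group columns: lineage, assigned by Pangolin during ingestion, and gisaid_lineage, the assignment reported by GISAID itself. A minimal sketch of checking the new entries after editing the config (assumes PyYAML is installed; the path matches the file shown above):

import yaml

# Sketch: confirm both lineage group columns are defined in the GISAID config.
with open("config/config_sars2_gisaid.yaml") as fh:
    config = yaml.safe_load(fh)

for col in ("lineage", "gisaid_lineage"):
    entry = config["group_cols"][col]  # raises KeyError if the column is missing
    print(col, "->", entry["name"], "/", entry["title"])
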
2 changes: 1 addition & 1 deletion package-lock.json


2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "covidcg",
"version": "2.7.4",
"version": "2.7.5-pango",
"description": "",
"engines": {
"node": ">=8",
2 changes: 1 addition & 1 deletion src/utils/version.js
@@ -1 +1 @@
export const version = '2.7.4';
export const version = '2.7.5-pango';
5 changes: 4 additions & 1 deletion workflow_main/Snakefile
@@ -2,6 +2,7 @@

"""Main data processing workflow from ingested data
$ snakemake --configfile ../config/config_sars2_gisaid.yaml -j6
$ snakemake --configfile ../config/config_sars2_genbank_dev.yaml -j6
$ snakemake --configfile ../config/config_rsv_genbank.yaml -j6
$ snakemake --configfile ../config/config_flu_genbank.yaml -j6
@@ -238,7 +239,8 @@ rule preprocess_sequences:
virus = config['virus'],
nextstrain_exclude = os.path.join(
static_data_folder, "nextstrain_exclude_20200520.txt"
)
),
exclude_list = os.path.join(data_folder, "exclude_list.txt")
output:
fasta = os.path.join(data_folder, "fasta_processed", "{chunk}.fa.gz")
shell:
@@ -247,6 +249,7 @@ rule preprocess_sequences:
python3 scripts/preprocess_sars2_sequences.py \
--input {input.fasta} \
--nextstrain-exclusion-file {params.nextstrain_exclude} \
--exclude-list {params.exclude_list} \
--output {output.fasta}
elif [[ "{params.virus}" == "rsv" ]]; then
python3 scripts/preprocess_rsv_sequences.py \
61 changes: 33 additions & 28 deletions workflow_main/scripts/preprocess_sars2_sequences.py
@@ -11,14 +11,14 @@
import math
import re

from pathlib import Path
import pandas as pd


def main():
"""Filter out sequences (adapted from van Dorp et al, 2020)
1. Filter against nextstrain exclusion list
2. Can't be less than 29700NT
3. Can't have more than 5% ambiguous NT
3. Can't have more than 5% ambiguous NT
"""

parser = argparse.ArgumentParser()
@@ -30,32 +30,30 @@ def main():
required=True,
help="Path to nextstrain exclusion file",
)
parser.add_argument("--exclude-list", type=str, required=True, help="Debug")
parser.add_argument(
"--output", type=str, required=True, help="Output FASTA file",
"--output",
type=str,
required=True,
help="Output FASTA file",
)

args = parser.parse_args()

# Load lines, ignoring comments and empty lines
exclude_taxons = []
with Path(args.nextstrain_exclusion_file).open("r") as fp:
for line in fp.readlines():
# Exclude comments
if line[0] == "#":
continue

# Strip whitespace
line = re.sub(r"\s+", "", line).strip()

# Exclude empty lines
if not line:
continue
df = pd.read_csv(
"https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt",
comment="#",
header=None,
skip_blank_lines=True,
)

exclude_taxons.append(line)
exclude_taxons = list(set(df[0].tolist()))

num_excluded = 0
fp_in = gzip.open(args.input, "rt")
fp_out = gzip.open(args.output, "wt")
fp_exclude = open(args.exclude_list, "a")

cur_entry = ""
cur_seq = ""
@@ -64,24 +62,30 @@

# Beginning of a new entry, or EOF = end of current entry
if not line or line[0] == ">":

if cur_entry:
num_ambiguous = 0
for char in cur_seq:
if char == "N":
num_ambiguous += 1

if (
# 1: Check against nextstrain exclusion list
(cur_entry in exclude_taxons)
or
# 2: Can't be less than 29700 NT
len(cur_seq) < 29700
or
# 3: Can't have more than 5% ambiguous (N) NT
num_ambiguous > math.floor(len(cur_seq) * 0.05)
):
exclude_reasons = []
# 1: Check against nextstrain exclusion list
if cur_entry in exclude_taxons:
num_excluded += 1
exclude_reasons.append("in_exclusion_list")

# 2: Can't be less than 29700 NT
if len(cur_seq) < 29700:
num_excluded += 1
exclude_reasons.append(f"too_short:{str(len(cur_seq))}")

# 3: Can't have more than 5% ambiguous (N) NT
if num_ambiguous > math.floor(len(cur_seq) * 0.05):
num_excluded += 1
exclude_reasons.append(f"too_many_ambiguous:{str(num_ambiguous)}")

if len(exclude_reasons) > 0:
fp_exclude.write(f"{cur_entry},{';'.join(exclude_reasons)}\n")
else:
# It passed, write to output
fp_out.write(">" + cur_entry + "\n")
@@ -108,6 +112,7 @@ def main():

fp_in.close()
fp_out.close()
fp_exclude.close()

print("Removed {:,} sequences".format(num_excluded), flush=True)

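
Rejected sequences are now appended to exclude_list.txt as one line per sequence of the form <Accession ID>,<reason1;reason2;...>, with reasons such as in_exclusion_list, too_short:<length>, and too_many_ambiguous:<count>. A small, hypothetical helper for tallying why sequences were dropped after a run (the path is illustrative; the workflow writes the file to data_folder/exclude_list.txt):

from collections import Counter

# Sketch: count exclusion reasons written by preprocess_sars2_sequences.py.
reason_counts = Counter()
with open("data/exclude_list.txt") as fh:
    for line in fh:
        accession, _, reasons = line.strip().partition(",")
        for reason in reasons.split(";"):
            # Drop value suffixes such as "too_short:28000" before counting
            reason_counts[reason.split(":")[0]] += 1

for reason, count in reason_counts.most_common():
    print(f"{reason}\t{count:,}")
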
3 changes: 2 additions & 1 deletion workflow_sars2_genbank_ingest/Snakefile
@@ -199,11 +199,12 @@ rule pangolin_lineages:
fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
conda: "envs/pangolin.yaml"
threads: workflow.cores
shell:
"""
# Pangolin can only use un-gzipped fasta files
gunzip -c {input.fasta} > {output.fasta}
pangolin --outfile {output.lineages} {output.fasta}
pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
"""

rule combine_lineages:
48 changes: 46 additions & 2 deletions workflow_sars2_gisaid_ingest/Snakefile
@@ -42,7 +42,7 @@ rule download:
"scripts/download.sh > {output.feed}"


rule process_feed:
checkpoint process_feed:
"""Split up the data feed's individual JSON objects into metadata and fasta files. Chunk the fasta files so that every day we only reprocess the subset of fasta files that have changed. The smaller the chunk size, the more efficient the updates, but the more files on the filesystem.
On a 48-core workstation with 128 GB RAM, aligning 200 sequences takes about 10 minutes, and this is more acceptable than having to align 1000 sequences, which takes ~1 hour. We end up with hundreds of files, but the filesystem seems to be handling it well.
"""
@@ -56,10 +56,49 @@ rule process_feed:
threads: workflow.cores
shell:
"""
mkdir -p {params.fasta}
python3 scripts/process_feed.py -d {input.feed} -f {params.fasta} -m {output.metadata_dirty} -p {threads}
"""

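The chunking strategy described in the docstring can be illustrated with a short sketch: sequences are written into fixed-size gzipped FASTA chunks so that a daily update only re-processes the chunks whose contents changed. The chunk size of 200 follows the timing note above; the function name and file naming are illustrative, not what process_feed.py actually uses.

import gzip
from pathlib import Path

def write_chunks(entries, out_dir, chunk_size=200):
    """Write (accession_id, sequence) pairs into gzipped FASTA chunks."""
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    chunk, chunk_idx = [], 0
    for accession, seq in entries:
        chunk.append(f">{accession}\n{seq}\n")
        if len(chunk) >= chunk_size:
            with gzip.open(out_dir / f"{chunk_idx}.fa.gz", "wt") as fh:
                fh.writelines(chunk)
            chunk, chunk_idx = [], chunk_idx + 1
    if chunk:
        # Flush the final, partially filled chunk
        with gzip.open(out_dir / f"{chunk_idx}.fa.gz", "wt") as fh:
            fh.writelines(chunk)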

rule pangolin_lineages:
"""Assign a lineage to each sequence using pangolin
"""
input:
fasta = os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz")
output:
fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
conda: "envs/pangolin.yaml"
threads: workflow.cores
shell:
"""
# Pangolin can only use un-gzipped fasta files
gunzip -c {input.fasta} > {output.fasta}
pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
"""

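The new pangolin_lineages rule mirrors the GenBank workflow: each chunk is decompressed (pangolin cannot read gzipped FASTA) and assigned lineages with multiple threads. A rough Python equivalent of the shell block, for illustration only, assuming pangolin is available on PATH from the envs/pangolin.yaml environment declared by the rule:

import gzip
import shutil
import subprocess

def assign_lineages(fasta_gz, fasta_out, lineages_csv, threads=4):
    # Decompress the chunk, since pangolin only accepts plain FASTA
    with gzip.open(fasta_gz, "rt") as fin, open(fasta_out, "wt") as fout:
        shutil.copyfileobj(fin, fout)
    # Run pangolin with the requested thread count
    subprocess.run(
        ["pangolin", "--outfile", lineages_csv, fasta_out, "-t", str(threads)],
        check=True,
    )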

checkpoint combine_lineages:
"""Combine all lineage result chunks
"""
input:
lineages = expand(
os.path.join(data_folder, "lineages", "{chunk}.csv"),
chunk=glob_wildcards(os.path.join(data_folder, "fasta_raw", "{i}.fa.gz")).i
),
status = rules.all.input.copy_status
params:
chunk_glob = os.path.join(data_folder, "lineages", "*.csv")
output:
lineages = os.path.join(data_folder, "lineages.csv")
shell:
"""
echo {input.lineages}
awk '(NR == 1) || (FNR > 1)' {params.chunk_glob} > {output.lineages}
"""

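The awk expression '(NR == 1) || (FNR > 1)' keeps every line of the first per-chunk CSV but skips the header line of each subsequent file, so the combined lineages.csv ends up with a single header row. A pandas sketch of the same operation (paths are illustrative):

from glob import glob
import pandas as pd

# Sketch: concatenate per-chunk lineage CSVs, keeping one header row.
chunk_files = sorted(glob("data/lineages/*.csv"))
combined = pd.concat([pd.read_csv(f) for f in chunk_files], ignore_index=True)
combined.to_csv("data/lineages.csv", index=False)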

rule clean_metadata:
"""Clean up metadata from GISAID
"""
@@ -68,9 +107,14 @@ rule clean_metadata:
location_corrections = os.path.join(
static_data_folder, "location_corrections.csv"
),
lineages = rules.combine_lineages.output.lineages
output:
metadata_clean = os.path.join(data_folder, "metadata.csv")
shell:
"""
python3 scripts/clean_metadata.py -i {input.metadata_dirty} -l {input.location_corrections} -o {output.metadata_clean}
python3 scripts/clean_metadata.py \
--metadata-in {input.metadata_dirty} \
--location-corrections {input.location_corrections} \
--lineages {input.lineages} \
--metadata-out {output.metadata_clean}
"""
22 changes: 22 additions & 0 deletions workflow_sars2_gisaid_ingest/envs/pangolin.yaml
@@ -0,0 +1,22 @@
# MODIFIED FROM: https://github.com/cov-lineages/pangolin, v4.3.1

name: pangolin
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - biopython=1.74
  - minimap2>=2.16
  - pip=19.3.1
  - python=3.7
  - snakemake-minimal<=7.24.0
  - gofasta
  - ucsc-fatovcf>=426
  - usher>=0.5.4
  - git-lfs
  - pip:
      - git+https://github.com/cov-lineages/[email protected]
      - git+https://github.com/cov-lineages/scorpio.git
      - git+https://github.com/cov-lineages/constellations.git
      - git+https://github.com/cov-lineages/pangolin-data.git
15 changes: 15 additions & 0 deletions workflow_sars2_gisaid_ingest/scripts/clean_metadata.py
@@ -1073,6 +1073,10 @@ def main():
help="Path to location corrections CSV file",
)

parser.add_argument(
"--lineages", type=str, required=True, help="Path to lineages CSV file",
)

parser.add_argument(
"-o",
"--metadata-out",
@@ -1142,6 +1146,17 @@
# Segment = 1
df["segment"] = 1

# Load lineages and join to dataframe
lineages_df = pd.read_csv(args.lineages)
lineages_df = lineages_df.rename(columns={"taxon": "Accession ID"}).set_index(
"Accession ID"
)
df = df.rename(columns={"lineage": "gisaid_lineage"}).join(
lineages_df[["lineage"]], how="left"
)
# Fill in missing values with GISAID lineages
df.loc[:, "lineage"] = df["lineage"].combine_first(df['gisaid_lineage'])

df.to_csv(args.metadata_out)


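The join above renames GISAID's own lineage call to gisaid_lineage, attaches the Pangolin assignments by Accession ID, and then uses combine_first so the Pangolin call wins wherever it exists and the GISAID call fills the gaps. A toy example of that precedence (accession IDs and lineages are made up):

import pandas as pd

df = pd.DataFrame(
    {"lineage": ["BA.2", "XBB.1.5"]},
    index=pd.Index(["EPI_ISL_1", "EPI_ISL_2"], name="Accession ID"),
)
lineages_df = pd.DataFrame(
    {"lineage": ["BA.2.12.1", None]},  # pangolin assigned only the first sequence
    index=pd.Index(["EPI_ISL_1", "EPI_ISL_2"], name="Accession ID"),
)

df = df.rename(columns={"lineage": "gisaid_lineage"}).join(
    lineages_df[["lineage"]], how="left"
)
df.loc[:, "lineage"] = df["lineage"].combine_first(df["gisaid_lineage"])
print(df)
# EPI_ISL_1 keeps BA.2.12.1 (Pangolin); EPI_ISL_2 falls back to XBB.1.5 (GISAID)
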
6 changes: 3 additions & 3 deletions workflow_sars2_gisaid_ingest/scripts/process_feed.py
@@ -160,7 +160,7 @@ def error_callback(e):
pool.join()

manifest = pd.DataFrame(entries, columns=["Accession ID", "sequence_hash", "date"])
#manifest["date"] = pd.to_datetime(manifest["date"])
# manifest["date"] = pd.to_datetime(manifest["date"])
# Sort by date, and drop duplicate Accession IDs, by keeping the last copy
# (i.e., the latest copy)
manifest = manifest.sort_values("date", ascending=True).drop_duplicates(
@@ -173,7 +173,7 @@

# Get fields for each isolate
fields = []
with lzma.open(args.data_feed, "xt") as fp_in:
with lzma.open(args.data_feed, "rt") as fp_in:
isolate = json.loads(fp_in.readline().strip())
for i, key in enumerate(isolate.keys()):
# Skip the special sequence column
@@ -190,7 +190,7 @@
# as list of tuples, (Accession ID, sequence)
new_entries = []

with open(args.data_feed, "r") as fp_in:
with lzma.open(args.data_feed, "rt") as fp_in:
line_counter = 0

for line in fp_in:
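
The remaining hunks correct how the data feed is reopened for the second pass: the GISAID feed is xz-compressed JSON lines, so it must be read with lzma.open(..., "rt") rather than a plain open() or the write-only "xt" mode. A minimal sketch of reading such a feed (file name and field names are illustrative):

import json
import lzma

with lzma.open("feed.json.xz", "rt") as fp_in:
    for line in fp_in:
        isolate = json.loads(line.strip())
        # e.g. isolate["covv_accession_id"], isolate["sequence"], ...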
