From f039ad2cb8f85abae9fc99f1fdd77e508fbf5b5a Mon Sep 17 00:00:00 2001
From: Albert Tian Chen
Date: Mon, 30 Oct 2023 21:21:33 -0400
Subject: [PATCH] PANGO annotations for SARS2 GISAID workflow (#631)

* Add PANGO assignment to GISAID workflow
* Add legacy lineage col to metadata
* Add missing input func
* Add missing repo
* Fix fasta_raw folder missing on first run
* Add SARS2 GISAID workflow example to main Snakefile
* Multithread PANGO for SARS2 GenBank workflow
* Fix typos -- missing xz adjustment from previous PRs
* Fix lineage DAG generation for SARS2 GISAID workflow
* Add GISAID lineage as another group col
* Move fake input to combine_lineages to avoid retriggering job for each
  individual lineage assignment job
* Bump version to v2.7.5-pango (temp)
* Auto-update nextstrain exclusion list
---
 config/config_sars2_gisaid.yaml           |  9 ++-
 config/config_sars2_gisaid_private.yaml   |  9 ++-
 package-lock.json                         |  2 +-
 package.json                              |  2 +-
 src/utils/version.js                      |  2 +-
 workflow_main/Snakefile                   |  5 +-
 .../scripts/preprocess_sars2_sequences.py | 61 ++++++++++---------
 workflow_sars2_genbank_ingest/Snakefile   |  3 +-
 workflow_sars2_gisaid_ingest/Snakefile    | 48 ++++++++++++++-
 .../envs/pangolin.yaml                    | 22 +++++++
 .../scripts/clean_metadata.py             | 15 +++++
 .../scripts/process_feed.py               |  6 +-
 12 files changed, 144 insertions(+), 40 deletions(-)
 create mode 100644 workflow_sars2_gisaid_ingest/envs/pangolin.yaml

diff --git a/config/config_sars2_gisaid.yaml b/config/config_sars2_gisaid.yaml
index bc236536..425f4cda 100644
--- a/config/config_sars2_gisaid.yaml
+++ b/config/config_sars2_gisaid.yaml
@@ -76,12 +76,19 @@ metadata_cols:
 group_cols:
   lineage:
     name: "lineage"
-    title: "Lineage"
+    title: "PANGO Lineage"
     description: ""
     link:
       title: "(Lineage Descriptions)"
       href: "https://cov-lineages.org/descriptions.html"
     show_collapse_options: true
+  gisaid_lineage:
+    name: "gisaid_lineage"
+    title: "PANGO Lineage (GISAID)"
+    description: "PANGO assignments from GISAID"
+    link:
+      title: "(Lineage Descriptions)"
+      href: "https://cov-lineages.org/descriptions.html"
   clade:
     name: "clade"
     title: "Clade"
diff --git a/config/config_sars2_gisaid_private.yaml b/config/config_sars2_gisaid_private.yaml
index de64492a..a7dd6695 100644
--- a/config/config_sars2_gisaid_private.yaml
+++ b/config/config_sars2_gisaid_private.yaml
@@ -76,12 +76,19 @@ metadata_cols:
 group_cols:
   lineage:
     name: "lineage"
-    title: "Lineage"
+    title: "PANGO Lineage"
     description: ""
     link:
       title: "(Lineage Descriptions)"
       href: "https://cov-lineages.org/descriptions.html"
     show_collapse_options: true
+  gisaid_lineage:
+    name: "gisaid_lineage"
+    title: "PANGO Lineage (GISAID)"
+    description: "PANGO assignments from GISAID"
+    link:
+      title: "(Lineage Descriptions)"
+      href: "https://cov-lineages.org/descriptions.html"
   clade:
     name: "clade"
     title: "Clade"
diff --git a/package-lock.json b/package-lock.json
index 82b64c07..e1862237 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,6 +1,6 @@
 {
   "name": "covidcg",
-  "version": "2.7.4",
+  "version": "2.7.5-pango",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
diff --git a/package.json b/package.json
index ea50f603..d31b17c5 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "covidcg",
-  "version": "2.7.4",
+  "version": "2.7.5-pango",
   "description": "",
   "engines": {
     "node": ">=8",
diff --git a/src/utils/version.js b/src/utils/version.js
index fd3b8705..70e4870c 100644
--- a/src/utils/version.js
+++ b/src/utils/version.js
@@ -1 +1 @@
-export const version = '2.7.4';
+export const version = '2.7.5-pango';
diff --git a/workflow_main/Snakefile b/workflow_main/Snakefile
index 16fb0252..9dca62c8 100644
--- a/workflow_main/Snakefile
+++ b/workflow_main/Snakefile
@@ -2,6 +2,7 @@
 """Main data processing workflow from ingested data
 
+$ snakemake --configfile ../config/config_sars2_gisaid.yaml -j6
 $ snakemake --configfile ../config/config_sars2_genbank_dev.yaml -j6
 $ snakemake --configfile ../config/config_rsv_genbank.yaml -j6
 $ snakemake --configfile ../config/config_flu_genbank.yaml -j6
@@ -238,7 +239,8 @@ rule preprocess_sequences:
         virus = config['virus'],
         nextstrain_exclude = os.path.join(
             static_data_folder, "nextstrain_exclude_20200520.txt"
-        )
+        ),
+        exclude_list = os.path.join(data_folder, "exclude_list.txt")
     output:
         fasta = os.path.join(data_folder, "fasta_processed", "{chunk}.fa.gz")
     shell:
        """
@@ -247,6 +249,7 @@
            python3 scripts/preprocess_sars2_sequences.py \
                --input {input.fasta} \
                --nextstrain-exclusion-file {params.nextstrain_exclude} \
+               --exclude-list {params.exclude_list} \
                --output {output.fasta}
        elif [[ "{params.virus}" == "rsv" ]]; then
            python3 scripts/preprocess_rsv_sequences.py \
diff --git a/workflow_main/scripts/preprocess_sars2_sequences.py b/workflow_main/scripts/preprocess_sars2_sequences.py
index 4e7038d6..4f0d5828 100755
--- a/workflow_main/scripts/preprocess_sars2_sequences.py
+++ b/workflow_main/scripts/preprocess_sars2_sequences.py
@@ -11,14 +11,14 @@
 import math
 import re
-from pathlib import Path
+import pandas as pd
 
 
 def main():
     """Filter out sequences (adapted from van Dorp et al, 2020)
     1. Filter against nextstrain exclusion list
     2. Can't be less than 29700 NT
     3. Can't have more than 5% ambiguous NT
     """
 
     parser = argparse.ArgumentParser()
@@ -30,32 +30,30 @@ def main():
         required=True,
         help="Path to nextstrain exclusion file",
     )
+    parser.add_argument(
+        "--exclude-list", type=str, required=True,
+        help="Path to output exclusion list CSV",
+    )
     parser.add_argument(
-        "--output", type=str, required=True, help="Output FASTA file",
+        "--output",
+        type=str,
+        required=True,
+        help="Output FASTA file",
     )
     args = parser.parse_args()
 
     # Load lines, ignoring comments and empty lines
-    exclude_taxons = []
-    with Path(args.nextstrain_exclusion_file).open("r") as fp:
-        for line in fp.readlines():
-            # Exclude comments
-            if line[0] == "#":
-                continue
-
-            # Strip whitespace
-            line = re.sub(r"\s+", "", line).strip()
-
-            # Exclude empty lines
-            if not line:
-                continue
+    df = pd.read_csv(
+        "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt",
+        comment="#",
+        header=None,
+        skip_blank_lines=True,
+    )
 
-            exclude_taxons.append(line)
+    exclude_taxons = list(set(df[0].tolist()))
 
     num_excluded = 0
     fp_in = gzip.open(args.input, "rt")
     fp_out = gzip.open(args.output, "wt")
+    fp_exclude = open(args.exclude_list, "a")
 
     cur_entry = ""
     cur_seq = ""
@@ -64,24 +62,30 @@ def main():
 
         # Beginning of a new entry, or EOF = end of current entry
         if not line or line[0] == ">":
-
             if cur_entry:
                 num_ambiguous = 0
                 for char in cur_seq:
                     if char == "N":
                         num_ambiguous += 1
 
-                if (
-                    # 1: Check against nextstrain exclusion list
-                    (cur_entry in exclude_taxons)
-                    or
-                    # 2: Can't be less than 29700 NT
-                    len(cur_seq) < 29700
-                    or
-                    # 3: Can't have more than 5% ambiguous (N) NT
-                    num_ambiguous > math.floor(len(cur_seq) * 0.05)
-                ):
-                    num_excluded += 1
+                exclude_reasons = []
+                # 1: Check against nextstrain exclusion list
+                if cur_entry in exclude_taxons:
+                    exclude_reasons.append("in_exclusion_list")
+
+                # 2: Can't be less than 29700 NT
+                if len(cur_seq) < 29700:
+                    exclude_reasons.append(f"too_short:{len(cur_seq)}")
+
+                # 3: Can't have more than 5% ambiguous (N) NT
+                if num_ambiguous > math.floor(len(cur_seq) * 0.05):
+                    exclude_reasons.append(f"too_many_ambiguous:{num_ambiguous}")
+
+                if len(exclude_reasons) > 0:
+                    # Failed at least one check: count it and log the reasons
+                    num_excluded += 1
+                    fp_exclude.write(f"{cur_entry},{';'.join(exclude_reasons)}\n")
                 else:
                     # It passed, write to output
                     fp_out.write(">" + cur_entry + "\n")
@@ -108,6 +112,7 @@ def main():
 
     fp_in.close()
     fp_out.close()
+    fp_exclude.close()
 
     print("Removed {:,} sequences".format(num_excluded), flush=True)
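A side note on the new --exclude-list output: it is a header-less, append-mode CSV of
accession,reason1;reason2 rows, as written by the fp_exclude.write() call above. A
minimal sketch of reading it back to tally why sequences were dropped -- the path and
column names here are illustrative, not part of the patch:

import pandas as pd

# Header-less two-column CSV: accession ID, semicolon-joined exclusion reasons.
# The file is opened in append mode upstream, so it can accumulate rows across
# runs; drop duplicates before tallying.
excludes = pd.read_csv(
    "data/exclude_list.txt", header=None, names=["accession", "reasons"]
).drop_duplicates()

# Count how often each exclusion reason fired, ignoring the per-sequence
# detail after the ":" (e.g. "too_short:1200" -> "too_short")
reason_counts = (
    excludes["reasons"]
    .str.split(";")
    .explode()
    .str.split(":")
    .str[0]
    .value_counts()
)
print(reason_counts)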
diff --git a/workflow_sars2_genbank_ingest/Snakefile b/workflow_sars2_genbank_ingest/Snakefile
index 2510924a..1696d8f7 100644
--- a/workflow_sars2_genbank_ingest/Snakefile
+++ b/workflow_sars2_genbank_ingest/Snakefile
@@ -199,11 +199,12 @@ rule pangolin_lineages:
         fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
         lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
     conda: "envs/pangolin.yaml"
+    threads: workflow.cores
     shell:
         """
         # Pangolin can only use un-gzipped fasta files
         gunzip -c {input.fasta} > {output.fasta}
-        pangolin --outfile {output.lineages} {output.fasta}
+        pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
         """
 
 rule combine_lineages:
diff --git a/workflow_sars2_gisaid_ingest/Snakefile b/workflow_sars2_gisaid_ingest/Snakefile
index 66390364..c0a6df5b 100644
--- a/workflow_sars2_gisaid_ingest/Snakefile
+++ b/workflow_sars2_gisaid_ingest/Snakefile
@@ -42,7 +42,7 @@ rule download:
         "scripts/download.sh > {output.feed}"
 
 
-rule process_feed:
+checkpoint process_feed:
     """Split up the data feed's individual JSON objects into metadata and fasta files.
     Chunk the fasta files so that every day we only reprocess the subset of fasta
     files that have changed. The smaller the chunk size, the more efficient the
     updates, but the more files on the filesystem. On a 48-core workstation with
     128 GB RAM, aligning 200 sequences takes about 10 minutes, and this is more
     acceptable than having to align 1000 sequences, which takes ~1 hour. We end up
     with hundreds of files, but the filesystem seems to be handling it well.
     """
@@ -56,10 +56,49 @@ checkpoint process_feed:
     threads: workflow.cores
     shell:
         """
+        mkdir -p {params.fasta}
         python3 scripts/process_feed.py -d {input.feed} -f {params.fasta} -m {output.metadata_dirty} -p {threads}
         """
 
 
+rule pangolin_lineages:
+    """Assign a lineage to each sequence using pangolin
+    """
+    input:
+        fasta = os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz")
+    output:
+        fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
+        lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
+    conda: "envs/pangolin.yaml"
+    threads: workflow.cores
+    shell:
+        """
+        # Pangolin can only use un-gzipped fasta files
+        gunzip -c {input.fasta} > {output.fasta}
+        pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
+        """
+
+
+rule combine_lineages:
+    """Combine all lineage result chunks
+    """
+    input:
+        lineages = expand(
+            os.path.join(data_folder, "lineages", "{chunk}.csv"),
+            chunk=glob_wildcards(os.path.join(data_folder, "fasta_raw", "{i}.fa.gz")).i
+        ),
+        status = rules.all.input.copy_status
+    params:
+        chunk_glob = os.path.join(data_folder, "lineages", "*.csv")
+    output:
+        lineages = os.path.join(data_folder, "lineages.csv")
+    shell:
+        """
+        echo {input.lineages}
+        awk '(NR == 1) || (FNR > 1)' {params.chunk_glob} > {output.lineages}
+        """
 
 
 rule clean_metadata:
     """Clean up metadata from GISAID
     """
@@ -68,9 +107,14 @@ rule clean_metadata:
     input:
         metadata_dirty = rules.process_feed.output.metadata_dirty,
         location_corrections = os.path.join(
             static_data_folder, "location_corrections.csv"
         ),
+        lineages = rules.combine_lineages.output.lineages
     output:
         metadata_clean = os.path.join(data_folder, "metadata.csv")
     shell:
         """
-        python3 scripts/clean_metadata.py -i {input.metadata_dirty} -l {input.location_corrections} -o {output.metadata_clean}
+        python3 scripts/clean_metadata.py \
+            --metadata-in {input.metadata_dirty} \
+            --location-corrections {input.location_corrections} \
+            --lineages {input.lineages} \
+            --metadata-out {output.metadata_clean}
         """
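Two notes on the chunked design above. The awk one-liner '(NR == 1) || (FNR > 1)'
concatenates the per-chunk CSVs while keeping only the first file's header row (NR is
the global line counter, FNR resets per file), and the echo {input.lineages} line
appears to be a debugging aid. More importantly, the incremental scheme described in
the process_feed docstring only pays off if a chunk file is rewritten when its contents
actually change, so unchanged chunks keep their mtimes and downstream jobs are not
re-triggered. A minimal sketch of that idea -- process_feed.py itself is not shown in
this diff, so the helper below is illustrative, not the actual implementation:

import gzip
import hashlib
import os

def write_chunk_if_changed(chunk_path, records):
    """Write a gzipped fasta chunk only if its content would change.

    Leaving unchanged chunks untouched preserves their mtimes, so Snakemake
    re-runs pangolin/alignment jobs only for chunks that actually changed.
    """
    payload = "".join(f">{acc}\n{seq}\n" for acc, seq in records).encode()
    new_digest = hashlib.sha256(payload).hexdigest()

    if os.path.exists(chunk_path):
        with gzip.open(chunk_path, "rb") as f:
            if hashlib.sha256(f.read()).hexdigest() == new_digest:
                return False  # identical content -- skip the rewrite

    with gzip.open(chunk_path, "wb") as f:
        f.write(payload)
    return True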
""" @@ -56,10 +56,49 @@ rule process_feed: threads: workflow.cores shell: """ + mkdir -p {params.fasta} python3 scripts/process_feed.py -d {input.feed} -f {params.fasta} -m {output.metadata_dirty} -p {threads} """ +rule pangolin_lineages: + """Assign a lineage to each sequence using pangolin + """ + input: + fasta = os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz") + output: + fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")), + lineages = os.path.join(data_folder, "lineages", "{chunk}.csv") + conda: "envs/pangolin.yaml" + threads: workflow.cores + shell: + """ + # Pangolin can only use un-gzipped fasta files + gunzip -c {input.fasta} > {output.fasta} + pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast + """ + + +checkpoint combine_lineages: + """Combine all lineage result chunks + """ + input: + lineages = expand( + os.path.join(data_folder, "lineages", "{chunk}.csv"), + chunk=glob_wildcards(os.path.join(data_folder, "fasta_raw", "{i}.fa.gz")).i + ), + status = rules.all.input.copy_status + params: + chunk_glob = os.path.join(data_folder, "lineages", "*.csv") + output: + lineages = os.path.join(data_folder, "lineages.csv") + shell: + """ + echo {input.lineages} + awk '(NR == 1) || (FNR > 1)' {params.chunk_glob} > {output.lineages} + """ + + rule clean_metadata: """Clean up metadata from GISAID """ @@ -68,9 +107,14 @@ rule clean_metadata: location_corrections = os.path.join( static_data_folder, "location_corrections.csv" ), + lineages = rules.combine_lineages.output.lineages output: metadata_clean = os.path.join(data_folder, "metadata.csv") shell: """ - python3 scripts/clean_metadata.py -i {input.metadata_dirty} -l {input.location_corrections} -o {output.metadata_clean} + python3 scripts/clean_metadata.py \ + --metadata-in {input.metadata_dirty} \ + --location-corrections {input.location_corrections} \ + --lineages {input.lineages} \ + --metadata-out {output.metadata_clean} """ diff --git a/workflow_sars2_gisaid_ingest/envs/pangolin.yaml b/workflow_sars2_gisaid_ingest/envs/pangolin.yaml new file mode 100644 index 00000000..a4e76c65 --- /dev/null +++ b/workflow_sars2_gisaid_ingest/envs/pangolin.yaml @@ -0,0 +1,22 @@ +# MODIFIED FROM: https://github.com/cov-lineages/pangolin, v4.3.1 + +name: pangolin +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - biopython=1.74 + - minimap2>=2.16 + - pip=19.3.1 + - python=3.7 + - snakemake-minimal<=7.24.0 + - gofasta + - ucsc-fatovcf>=426 + - usher>=0.5.4 + - git-lfs + - pip: + - git+https://github.com/cov-lineages/pangolin.git@v4.3.1 + - git+https://github.com/cov-lineages/scorpio.git + - git+https://github.com/cov-lineages/constellations.git + - git+https://github.com/cov-lineages/pangolin-data.git diff --git a/workflow_sars2_gisaid_ingest/scripts/clean_metadata.py b/workflow_sars2_gisaid_ingest/scripts/clean_metadata.py index 7e7a4bab..08b0290a 100755 --- a/workflow_sars2_gisaid_ingest/scripts/clean_metadata.py +++ b/workflow_sars2_gisaid_ingest/scripts/clean_metadata.py @@ -1073,6 +1073,10 @@ def main(): help="Path to location corrections CSV file", ) + parser.add_argument( + "--lineages", type=str, required=True, help="Path to lineages CSV file", + ) + parser.add_argument( "-o", "--metadata-out", @@ -1142,6 +1146,17 @@ def main(): # Segment = 1 df["segment"] = 1 + # Load lineages and join to dataframe + lineages_df = pd.read_csv(args.lineages) + lineages_df = lineages_df.rename(columns={"taxon": "Accession ID"}).set_index( + "Accession ID" + ) + df = 
diff --git a/workflow_sars2_gisaid_ingest/scripts/process_feed.py b/workflow_sars2_gisaid_ingest/scripts/process_feed.py
index d74ca8f0..53f2dec8 100644
--- a/workflow_sars2_gisaid_ingest/scripts/process_feed.py
+++ b/workflow_sars2_gisaid_ingest/scripts/process_feed.py
@@ -160,7 +160,7 @@ def error_callback(e):
     pool.join()
 
     manifest = pd.DataFrame(entries, columns=["Accession ID", "sequence_hash", "date"])
-    #manifest["date"] = pd.to_datetime(manifest["date"])
+    # manifest["date"] = pd.to_datetime(manifest["date"])
     # Sort by date, and drop duplicate Accession IDs, by keeping the last copy
     # (i.e., the latest copy)
     manifest = manifest.sort_values("date", ascending=True).drop_duplicates(
@@ -173,7 +173,7 @@ def error_callback(e):
 
     # Get fields for each isolate
     fields = []
-    with lzma.open(args.data_feed, "xt") as fp_in:
+    with lzma.open(args.data_feed, "rt") as fp_in:
         isolate = json.loads(fp_in.readline().strip())
         for i, key in enumerate(isolate.keys()):
             # Skip the special sequence column
@@ -190,7 +190,7 @@ def error_callback(e):
     # as list of tuples, (Accession ID, sequence)
     new_entries = []
 
-    with open(args.data_feed, "r") as fp_in:
+    with lzma.open(args.data_feed, "rt") as fp_in:
         line_counter = 0
 
         for line in fp_in:
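The two mode fixes above are easy to miss but matter: "xt" is lzma's exclusive-create
write mode (it raises if the file already exists and cannot be read from), and a plain
open() on the xz feed would yield compressed bytes rather than JSON text. A minimal
sketch of streaming such a feed, under the assumption that it is xz-compressed
JSON-lines as the code above implies (path and field name are illustrative):

import json
import lzma

# Stream the xz-compressed JSON-lines feed without decompressing to disk;
# "rt" opens the archive for reading as text
with lzma.open("feed.json.xz", "rt") as fp:
    for line in fp:
        isolate = json.loads(line.strip())
        print(isolate.get("covv_accession_id"))  # field name is illustrative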