PANGO annotations for SARS2 GISAID workflow (#631)
* Add PANGO assignment to GISAID workflow

* Add legacy lineage col to metadata

* add missing input func

* Add missing repo

* Fix fasta_raw folder missing on first run

* Add SARS2 GISAID workflow example to main Snakefile

* Multithread PANGO for SARS2 GenBank workflow

* Fix typos -- missing xz adjustment from previous PRs

* Fix lineage DAG generation for SARS2 GISAID workflow

* Add GISAID lineage as another group col

* Move fake input to combine_lineages to avoid retriggering job for each individual lineage assignment job

* Bump version to v2.7.5-pango (temp)

* Auto-update nextstrain exclusion list
atc3 authored Oct 31, 2023
1 parent 6fe2047 commit f039ad2
Showing 12 changed files with 144 additions and 40 deletions.
9 changes: 8 additions & 1 deletion config/config_sars2_gisaid.yaml
@@ -76,12 +76,19 @@ metadata_cols:
group_cols:
  lineage:
    name: "lineage"
    title: "Lineage"
    title: "PANGO Lineage"
    description: ""
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
    show_collapse_options: true
  gisaid_lineage:
    name: "gisaid_lineage"
    title: "PANGO Lineage (GISAID)"
    description: "PANGO assignments from GISAID"
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
  clade:
    name: "clade"
    title: "Clade"
9 changes: 8 additions & 1 deletion config/config_sars2_gisaid_private.yaml
@@ -76,12 +76,19 @@ metadata_cols:
group_cols:
  lineage:
    name: "lineage"
    title: "Lineage"
    title: "PANGO Lineage"
    description: ""
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
    show_collapse_options: true
  gisaid_lineage:
    name: "gisaid_lineage"
    title: "PANGO Lineage (GISAID)"
    description: "PANGO assignments from GISAID"
    link:
      title: "(Lineage Descriptions)"
      href: "https://cov-lineages.org/descriptions.html"
  clade:
    name: "clade"
    title: "Clade"
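
Both GISAID configs now define two lineage group columns: lineage, assigned by Pangolin during ingestion, and gisaid_lineage, the assignment reported by GISAID itself. A minimal sketch of checking the new entries after editing the config (assumes PyYAML is installed; the path matches the file shown above):

import yaml

# Sketch: confirm both lineage group columns are defined in the GISAID config.
with open("config/config_sars2_gisaid.yaml") as fh:
    config = yaml.safe_load(fh)

for col in ("lineage", "gisaid_lineage"):
    entry = config["group_cols"][col]  # raises KeyError if the column is missing
    print(col, "->", entry["name"], "/", entry["title"])
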
2 changes: 1 addition & 1 deletion package-lock.json


2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "covidcg",
"version": "2.7.4",
"version": "2.7.5-pango",
"description": "",
"engines": {
"node": ">=8",
2 changes: 1 addition & 1 deletion src/utils/version.js
@@ -1 +1 @@
export const version = '2.7.4';
export const version = '2.7.5-pango';
5 changes: 4 additions & 1 deletion workflow_main/Snakefile
@@ -2,6 +2,7 @@

"""Main data processing workflow from ingested data
$ snakemake --configfile ../config/config_sars2_gisaid.yaml -j6
$ snakemake --configfile ../config/config_sars2_genbank_dev.yaml -j6
$ snakemake --configfile ../config/config_rsv_genbank.yaml -j6
$ snakemake --configfile ../config/config_flu_genbank.yaml -j6
@@ -238,7 +239,8 @@ rule preprocess_sequences:
virus = config['virus'],
nextstrain_exclude = os.path.join(
static_data_folder, "nextstrain_exclude_20200520.txt"
)
),
exclude_list = os.path.join(data_folder, "exclude_list.txt")
output:
fasta = os.path.join(data_folder, "fasta_processed", "{chunk}.fa.gz")
shell:
@@ -247,6 +249,7 @@ rule preprocess_sequences:
python3 scripts/preprocess_sars2_sequences.py \
--input {input.fasta} \
--nextstrain-exclusion-file {params.nextstrain_exclude} \
--exclude-list {params.exclude_list} \
--output {output.fasta}
elif [[ "{params.virus}" == "rsv" ]]; then
python3 scripts/preprocess_rsv_sequences.py \
61 changes: 33 additions & 28 deletions workflow_main/scripts/preprocess_sars2_sequences.py
@@ -11,14 +11,14 @@
import math
import re

from pathlib import Path
import pandas as pd


def main():
"""Filter out sequences (adapted from van Dorp et al, 2020)
1. Filter against nextstrain exclusion list
2. Can't be less than 29700NT
3. Can't have more than 5% ambiguous NT
3. Can't have more than 5% ambiguous NT
"""

parser = argparse.ArgumentParser()
@@ -30,32 +30,30 @@ def main():
required=True,
help="Path to nextstrain exclusion file",
)
parser.add_argument("--exclude-list", type=str, required=True, help="Debug")
parser.add_argument(
"--output", type=str, required=True, help="Output FASTA file",
"--output",
type=str,
required=True,
help="Output FASTA file",
)

args = parser.parse_args()

# Load lines, ignoring comments and empty lines
exclude_taxons = []
with Path(args.nextstrain_exclusion_file).open("r") as fp:
for line in fp.readlines():
# Exclude comments
if line[0] == "#":
continue

# Strip whitespace
line = re.sub(r"\s+", "", line).strip()

# Exclude empty lines
if not line:
continue
df = pd.read_csv(
"https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt",
comment="#",
header=None,
skip_blank_lines=True,
)

exclude_taxons.append(line)
exclude_taxons = list(set(df[0].tolist()))

num_excluded = 0
fp_in = gzip.open(args.input, "rt")
fp_out = gzip.open(args.output, "wt")
fp_exclude = open(args.exclude_list, "a")

cur_entry = ""
cur_seq = ""
@@ -64,24 +62,30 @@

# Beginning of a new entry, or EOF = end of current entry
if not line or line[0] == ">":

if cur_entry:
num_ambiguous = 0
for char in cur_seq:
if char == "N":
num_ambiguous += 1

if (
# 1: Check against nextstrain exclusion list
(cur_entry in exclude_taxons)
or
# 2: Can't be less than 29700 NT
len(cur_seq) < 29700
or
# 3: Can't have more than 5% ambiguous (N) NT
num_ambiguous > math.floor(len(cur_seq) * 0.05)
):
exclude_reasons = []
# 1: Check against nextstrain exclusion list
if cur_entry in exclude_taxons:
num_excluded += 1
exclude_reasons.append("in_exclusion_list")

# 2: Can't be less than 29700 NT
if len(cur_seq) < 29700:
num_excluded += 1
exclude_reasons.append(f"too_short:{str(len(cur_seq))}")

# 3: Can't have more than 5% ambiguous (N) NT
if num_ambiguous > math.floor(len(cur_seq) * 0.05):
num_excluded += 1
exclude_reasons.append(f"too_many_ambiguous:{str(num_ambiguous)}")

if len(exclude_reasons) > 0:
fp_exclude.write(f"{cur_entry},{';'.join(exclude_reasons)}\n")
else:
# It passed, write to output
fp_out.write(">" + cur_entry + "\n")
@@ -108,6 +112,7 @@ def main():

fp_in.close()
fp_out.close()
fp_exclude.close()

print("Removed {:,} sequences".format(num_excluded), flush=True)

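
Rejected sequences are now appended to exclude_list.txt as one line per sequence of the form <Accession ID>,<reason1;reason2;...>, with reasons such as in_exclusion_list, too_short:<length>, and too_many_ambiguous:<count>. A small, hypothetical helper for tallying why sequences were dropped after a run (the path is illustrative; the workflow writes the file to data_folder/exclude_list.txt):

from collections import Counter

# Sketch: count exclusion reasons written by preprocess_sars2_sequences.py.
reason_counts = Counter()
with open("data/exclude_list.txt") as fh:
    for line in fh:
        accession, _, reasons = line.strip().partition(",")
        for reason in reasons.split(";"):
            # Drop value suffixes such as "too_short:28000" before counting
            reason_counts[reason.split(":")[0]] += 1

for reason, count in reason_counts.most_common():
    print(f"{reason}\t{count:,}")
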
3 changes: 2 additions & 1 deletion workflow_sars2_genbank_ingest/Snakefile
@@ -199,11 +199,12 @@ rule pangolin_lineages:
fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
conda: "envs/pangolin.yaml"
threads: workflow.cores
shell:
"""
# Pangolin can only use un-gzipped fasta files
gunzip -c {input.fasta} > {output.fasta}
pangolin --outfile {output.lineages} {output.fasta}
pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
"""

rule combine_lineages:
48 changes: 46 additions & 2 deletions workflow_sars2_gisaid_ingest/Snakefile
@@ -42,7 +42,7 @@ rule download:
"scripts/download.sh > {output.feed}"


rule process_feed:
checkpoint process_feed:
"""Split up the data feed's individual JSON objects into metadata and fasta files. Chunk the fasta files so that every day we only reprocess the subset of fasta files that have changed. The smaller the chunk size, the more efficient the updates, but the more files on the filesystem.
On a 48-core workstation with 128 GB RAM, aligning 200 sequences takes about 10 minutes, and this is more acceptable than having to align 1000 sequences, which takes ~1 hour. We end up with hundreds of files, but the filesystem seems to be handling it well.
"""
@@ -56,10 +56,49 @@ rule process_feed:
threads: workflow.cores
shell:
"""
mkdir -p {params.fasta}
python3 scripts/process_feed.py -d {input.feed} -f {params.fasta} -m {output.metadata_dirty} -p {threads}
"""

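The chunking strategy described in the docstring can be illustrated with a short sketch: sequences are written into fixed-size gzipped FASTA chunks so that a daily update only re-processes the chunks whose contents changed. The chunk size of 200 follows the timing note above; the function name and file naming are illustrative, not what process_feed.py actually uses.

import gzip
from pathlib import Path

def write_chunks(entries, out_dir, chunk_size=200):
    """Write (accession_id, sequence) pairs into gzipped FASTA chunks."""
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    chunk, chunk_idx = [], 0
    for accession, seq in entries:
        chunk.append(f">{accession}\n{seq}\n")
        if len(chunk) >= chunk_size:
            with gzip.open(out_dir / f"{chunk_idx}.fa.gz", "wt") as fh:
                fh.writelines(chunk)
            chunk, chunk_idx = [], chunk_idx + 1
    if chunk:
        # Flush the final, partially filled chunk
        with gzip.open(out_dir / f"{chunk_idx}.fa.gz", "wt") as fh:
            fh.writelines(chunk)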

rule pangolin_lineages:
"""Assign a lineage to each sequence using pangolin
"""
input:
fasta = os.path.join(data_folder, "fasta_raw", "{chunk}.fa.gz")
output:
fasta = temp(os.path.join(data_folder, "lineages", "{chunk}.fa")),
lineages = os.path.join(data_folder, "lineages", "{chunk}.csv")
conda: "envs/pangolin.yaml"
threads: workflow.cores
shell:
"""
# Pangolin can only use un-gzipped fasta files
gunzip -c {input.fasta} > {output.fasta}
pangolin --outfile {output.lineages} {output.fasta} -t {workflow.cores} # --analysis-mode fast
"""

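The new pangolin_lineages rule mirrors the GenBank workflow: each chunk is decompressed (pangolin cannot read gzipped FASTA) and assigned lineages with multiple threads. A rough Python equivalent of the shell block, for illustration only, assuming pangolin is available on PATH from the envs/pangolin.yaml environment declared by the rule:

import gzip
import shutil
import subprocess

def assign_lineages(fasta_gz, fasta_out, lineages_csv, threads=4):
    # Decompress the chunk, since pangolin only accepts plain FASTA
    with gzip.open(fasta_gz, "rt") as fin, open(fasta_out, "wt") as fout:
        shutil.copyfileobj(fin, fout)
    # Run pangolin with the requested thread count
    subprocess.run(
        ["pangolin", "--outfile", lineages_csv, fasta_out, "-t", str(threads)],
        check=True,
    )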

checkpoint combine_lineages:
"""Combine all lineage result chunks
"""
input:
lineages = expand(
os.path.join(data_folder, "lineages", "{chunk}.csv"),
chunk=glob_wildcards(os.path.join(data_folder, "fasta_raw", "{i}.fa.gz")).i
),
status = rules.all.input.copy_status
params:
chunk_glob = os.path.join(data_folder, "lineages", "*.csv")
output:
lineages = os.path.join(data_folder, "lineages.csv")
shell:
"""
echo {input.lineages}
awk '(NR == 1) || (FNR > 1)' {params.chunk_glob} > {output.lineages}
"""

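The awk expression '(NR == 1) || (FNR > 1)' keeps every line of the first per-chunk CSV but skips the header line of each subsequent file, so the combined lineages.csv ends up with a single header row. A pandas sketch of the same operation (paths are illustrative):

from glob import glob
import pandas as pd

# Sketch: concatenate per-chunk lineage CSVs, keeping one header row.
chunk_files = sorted(glob("data/lineages/*.csv"))
combined = pd.concat([pd.read_csv(f) for f in chunk_files], ignore_index=True)
combined.to_csv("data/lineages.csv", index=False)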

rule clean_metadata:
"""Clean up metadata from GISAID
"""
@@ -68,9 +107,14 @@ rule clean_metadata:
location_corrections = os.path.join(
static_data_folder, "location_corrections.csv"
),
lineages = rules.combine_lineages.output.lineages
output:
metadata_clean = os.path.join(data_folder, "metadata.csv")
shell:
"""
python3 scripts/clean_metadata.py -i {input.metadata_dirty} -l {input.location_corrections} -o {output.metadata_clean}
python3 scripts/clean_metadata.py \
--metadata-in {input.metadata_dirty} \
--location-corrections {input.location_corrections} \
--lineages {input.lineages} \
--metadata-out {output.metadata_clean}
"""
22 changes: 22 additions & 0 deletions workflow_sars2_gisaid_ingest/envs/pangolin.yaml
@@ -0,0 +1,22 @@
# MODIFIED FROM: https://github.com/cov-lineages/pangolin, v4.3.1

name: pangolin
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - biopython=1.74
  - minimap2>=2.16
  - pip=19.3.1
  - python=3.7
  - snakemake-minimal<=7.24.0
  - gofasta
  - ucsc-fatovcf>=426
  - usher>=0.5.4
  - git-lfs
  - pip:
      - git+https://github.com/cov-lineages/[email protected]
      - git+https://github.com/cov-lineages/scorpio.git
      - git+https://github.com/cov-lineages/constellations.git
      - git+https://github.com/cov-lineages/pangolin-data.git
15 changes: 15 additions & 0 deletions workflow_sars2_gisaid_ingest/scripts/clean_metadata.py
@@ -1073,6 +1073,10 @@ def main():
help="Path to location corrections CSV file",
)

parser.add_argument(
"--lineages", type=str, required=True, help="Path to lineages CSV file",
)

parser.add_argument(
"-o",
"--metadata-out",
@@ -1142,6 +1146,17 @@
# Segment = 1
df["segment"] = 1

# Load lineages and join to dataframe
lineages_df = pd.read_csv(args.lineages)
lineages_df = lineages_df.rename(columns={"taxon": "Accession ID"}).set_index(
"Accession ID"
)
df = df.rename(columns={"lineage": "gisaid_lineage"}).join(
lineages_df[["lineage"]], how="left"
)
# Fill in missing values with GISAID lineages
df.loc[:, "lineage"] = df["lineage"].combine_first(df['gisaid_lineage'])

df.to_csv(args.metadata_out)


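The join above renames GISAID's own lineage call to gisaid_lineage, attaches the Pangolin assignments by Accession ID, and then uses combine_first so the Pangolin call wins wherever it exists and the GISAID call fills the gaps. A toy example of that precedence (accession IDs and lineages are made up):

import pandas as pd

df = pd.DataFrame(
    {"lineage": ["BA.2", "XBB.1.5"]},
    index=pd.Index(["EPI_ISL_1", "EPI_ISL_2"], name="Accession ID"),
)
lineages_df = pd.DataFrame(
    {"lineage": ["BA.2.12.1", None]},  # pangolin assigned only the first sequence
    index=pd.Index(["EPI_ISL_1", "EPI_ISL_2"], name="Accession ID"),
)

df = df.rename(columns={"lineage": "gisaid_lineage"}).join(
    lineages_df[["lineage"]], how="left"
)
df.loc[:, "lineage"] = df["lineage"].combine_first(df["gisaid_lineage"])
print(df)
# EPI_ISL_1 keeps BA.2.12.1 (Pangolin); EPI_ISL_2 falls back to XBB.1.5 (GISAID)
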
6 changes: 3 additions & 3 deletions workflow_sars2_gisaid_ingest/scripts/process_feed.py
@@ -160,7 +160,7 @@ def error_callback(e):
pool.join()

manifest = pd.DataFrame(entries, columns=["Accession ID", "sequence_hash", "date"])
#manifest["date"] = pd.to_datetime(manifest["date"])
# manifest["date"] = pd.to_datetime(manifest["date"])
# Sort by date, and drop duplicate Accession IDs, by keeping the last copy
# (i.e., the latest copy)
manifest = manifest.sort_values("date", ascending=True).drop_duplicates(
@@ -173,7 +173,7 @@

# Get fields for each isolate
fields = []
with lzma.open(args.data_feed, "xt") as fp_in:
with lzma.open(args.data_feed, "rt") as fp_in:
isolate = json.loads(fp_in.readline().strip())
for i, key in enumerate(isolate.keys()):
# Skip the special sequence column
@@ -190,7 +190,7 @@
# as list of tuples, (Accession ID, sequence)
new_entries = []

with open(args.data_feed, "r") as fp_in:
with lzma.open(args.data_feed, "rt") as fp_in:
line_counter = 0

for line in fp_in:
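
The remaining hunks correct how the data feed is reopened for the second pass: the GISAID feed is xz-compressed JSON lines, so it must be read with lzma.open(..., "rt") rather than a plain open() or the write-only "xt" mode. A minimal sketch of reading such a feed (file name and field names are illustrative):

import json
import lzma

with lzma.open("feed.json.xz", "rt") as fp_in:
    for line in fp_in:
        isolate = json.loads(line.strip())
        # e.g. isolate["covv_accession_id"], isolate["sequence"], ...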
