
Merge pull request #431 from broadinstitute/dp-cdc
CDC SC2 viral pipeline updates
dpark01 authored Jul 11, 2022
2 parents 920df81 + cde9223 commit 7e90cb0
Showing 6 changed files with 123 additions and 23 deletions.
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -407,9 +407,9 @@ task refine_assembly_with_aligned_reads {
         File     reads_aligned_bam
         String   sample_name
-        Boolean? mark_duplicates = false
-        Float?   major_cutoff = 0.5
-        Int?     min_coverage = 3
+        Boolean  mark_duplicates = false
+        Float    major_cutoff = 0.5
+        Int      min_coverage = 3
         Int?     machine_mem_gb
         String   docker = "quay.io/broadinstitute/viral-assemble:2.1.16.1"
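A note on the hunk above: in WDL, "Float? major_cutoff = 0.5" declares an input whose declared type stays optional even though it has a default, so the value must be unwrapped (e.g. with select_first) anywhere a plain Float is required; dropping the "?" makes the input a non-optional Float that is always defined inside the task. A minimal sketch of the distinction, with a hypothetical task name:

    version 1.0

    task defaults_demo {
        input {
            Float? a = 0.5   # optional type with a default: still Float? downstream
            Float  b = 0.5   # non-optional with a default: always defined
        }
        command {
            # 'a' must be unwrapped before use where a Float is required
            echo "~{select_first([a, 0.5])} ~{b}"
        }
        output {
            String s = read_string(stdout())
        }
    }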
30 changes: 25 additions & 5 deletions pipes/WDL/tasks/tasks_reports.wdl
@@ -5,6 +5,8 @@ task alignment_metrics {
         File    aligned_bam
         File    ref_fasta
         File?   primers_bed
+        String? amplicon_set
+        Int?    min_coverage

         Int?    machine_mem_gb
         String  docker = "quay.io/broadinstitute/viral-core:2.1.33"
@@ -54,19 +56,37 @@ task alignment_metrics {
         echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
         paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt

-        # actually don't know how to do CollectTargetedPcrMetrics yet
+        touch "~{out_basename}".ampliconstats.txt "~{out_basename}".ampliconstats_parsed.txt
+        echo -e "sample_sanitized\tbam\tamplicon_set\tamplicon_idx\tamplicon_left\tamplicon_right\tFREADS\tFDEPTH\tFPCOV\tFAMP" > "~{out_basename}.ampliconstats_parsed.txt"
         if [ -n "~{primers_bed}" ]; then
-            picard $XMX BedToIntervalList \
-                -I "~{primers_bed}" \
-                -O primers.interval.list \
-                -SD reference.dict
+            # samtools ampliconstats
+            samtools ampliconstats -s -@ $(nproc) \
+                ~{'-d ' + min_coverage} \
+                -o "~{out_basename}".ampliconstats.txt "~{primers_bed}" "~{aligned_bam}"
+
+            # parse into our own tsv to facilitate tsv joining later
+            if [ -n "~{default='' amplicon_set}" ]; then
+                AMPLICON_SET="~{default='' amplicon_set}"
+            else
+                AMPLICON_SET=$(basename "~{primers_bed}" .bed)
+            fi
+            grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON
+            grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS
+            grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH
+            grep ^FPCOV "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV; echo "" >> FPCOV
+            grep ^FAMP "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP
+            for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt
+            wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP
+            paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep '\S' >> "~{out_basename}.ampliconstats_parsed.txt"
         fi
     >>>

     output {
         File wgs_metrics = "~{out_basename}.raw_wgs_metrics.txt"
         File alignment_metrics = "~{out_basename}.alignment_metrics.txt"
         File insert_size_metrics = "~{out_basename}.insert_size_metrics.txt"
+        File amplicon_stats = "~{out_basename}.ampliconstats.txt"
+        File amplicon_stats_parsed = "~{out_basename}.ampliconstats_parsed.txt"
     }

     runtime {
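The parsing block added above transposes samtools ampliconstats output, which emits one row per metric (FREADS, FDEPTH, FPCOV) with one column per amplicon, into one row per amplicon so it can later be joined and stacked with other per-sample TSVs. The core idiom is cut | tr '\t' '\n' to turn each metric row into a column file, then paste to stitch the columns side by side (note "tail +2" is the obsolescent spelling of "tail -n +2"). A toy shell sketch of the same idiom, with invented values:

    # one row per metric, one column per amplicon (values invented)
    printf 'FREADS\tsample.bam\t100\t200\t300\n'   >  stats.txt
    printf 'FDEPTH\tsample.bam\t9.5\t22.1\t31.0\n' >> stats.txt

    grep ^FREADS stats.txt | cut -f 3- | tr '\t' '\n' > FREADS   # 100 / 200 / 300
    grep ^FDEPTH stats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH   # 9.5 / 22.1 / 31.0
    paste FREADS FDEPTH   # one row per amplicon: "100 9.5", "200 22.1", "300 31.0"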
31 changes: 22 additions & 9 deletions pipes/WDL/tasks/tasks_sarscov2.wdl
@@ -184,8 +184,8 @@ task sequencing_report {
         File    assembly_stats_tsv
         File?   collab_ids_tsv
-        String? sequencing_lab = "Broad Institute"
-        String? intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020."
+        String  sequencing_lab = "Broad Institute"
+        String  intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020."
         String? max_date
         String? min_date
         Int?    min_unambig
@@ -240,9 +240,12 @@ task sc2_meta_final {
         String? max_date
         String? min_date
-        Int?    min_unambig=24000
+        Int     min_unambig=24000
+        Boolean drop_file_cols=false
+        String  address_map = '{}'
+        String  authors_map = '{}'
         File?   filter_to_ids
         String  docker = "quay.io/broadinstitute/py3-bio:0.1.2"
@@ -274,6 +277,8 @@ task sc2_meta_final {
             genome_status = json.load(inf)
         else:
             genome_status = {}
+        address_map = json.loads('~{address_map}')
+        authors_map = json.loads('~{authors_map}')

         # read input files
         df_assemblies = pd.read_csv(assemblies_tsv, sep='\t').dropna(how='all')
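A quoting subtlety in the two json.loads lines above (an observation about the approach, not something this PR changes): the map is interpolated into a single-quoted Python string literal, so a map value containing a single quote would break the generated script before json.loads ever runs. For example:

    import json

    # fine: the interpolated JSON contains no single quotes
    address_map = json.loads('{"Example Hospital": "123 Main St, Boston, MA"}')

    # would be a SyntaxError in the generated code: the apostrophe
    # terminates the Python string literal early
    # address_map = json.loads('{"Example Lab": "O'Hare Facility"}')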
@@ -352,6 +357,10 @@ task sc2_meta_final {
         # join column: collaborator_id
         df_assemblies = df_assemblies.merge(collab_ids, on='sample', how='left', validate='one_to_one')
+
+        # derived columns: authors, orig_lab_addr
+        df_assemblies.loc[:,'authors'] = list(authors_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by'])
+        df_assemblies.loc[:,'orig_lab_addr'] = list(address_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by'])

         # write final output
         df_assemblies.to_csv("~{out_basename}.final.tsv", sep='\t', index=False)
         CODE
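Both maps are JSON objects keyed by the collected_by value, and dict.get(cby) yields None (an empty cell in the final TSV) for any collecting lab absent from the map. A hypothetical example of the two workflow inputs, with invented lab name and values:

    authors_map = '{"Example Hospital": "Doe J, Smith A, Roe R"}'
    address_map = '{"Example Hospital": "123 Main St, Boston, MA 02115, USA"}'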
@@ -547,23 +556,27 @@ task gisaid_uploader {
         File   gisaid_sequences_fasta
         File   gisaid_meta_csv
         File   cli_auth_token
+        String database="EpiCoV"
+        String frameshift="catch_novel"
     }
     command {
         set -e
-        cp "~{cli_auth_token}" gisaid_uploader.authtoken
-        gisaid_uploader CoV upload \
+        cli3 upload \
+            --database "~{database}" \
+            --token "~{cli_auth_token}" \
             --fasta "~{gisaid_sequences_fasta}" \
-            --csv "~{gisaid_meta_csv}" \
-            --failedout failed_metadata.csv \
-            | tee logs.txt
+            --metadata "~{gisaid_meta_csv}" \
+            --failed failed_metadata.csv \
+            --frameshift "~{frameshift}" \
+            --log logs.txt
         # the following grep statement will exit 1 if anything failed
         grep "submissions failed: 0" logs.txt > /dev/null
     }
     output {
         File failed_metadata = "failed_metadata.csv"
     }
     runtime {
-        docker: "quay.io/broadinstitute/gisaid-cli:1.0"
+        docker: "quay.io/broadinstitute/gisaid-cli:3.0"
         memory: "2 GB"
         cpu: 2
         disks: "local-disk 100 HDD"
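The task's success gate is that final grep: under set -e, a grep exit status of 1 (no match found) fails the whole task unless the CLI log reports zero failed submissions. A toy demonstration of the mechanism (the log wording here is invented; the real phrase comes from the GISAID CLI output):

    set -e
    echo "total sequences: 95; submissions failed: 2" > logs.txt
    grep "submissions failed: 0" logs.txt > /dev/null   # exits 1, so the script aborts here
    echo "never reached"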
51 changes: 51 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -116,6 +116,32 @@ task zcat {
     }
 }
+task sed {
+    meta {
+        description: "Replace all occurrences of 'search' with 'replace' using sed."
+    }
+    input {
+        File   infile
+        String search
+        String replace
+        String outfilename = "~{basename(infile)}-rename.txt"
+    }
+    command {
+        sed 's/~{search}/~{replace}/g' "~{infile}" > "~{outfilename}"
+    }
+    runtime {
+        docker: "ubuntu"
+        memory: "1 GB"
+        cpu: 1
+        disks: "local-disk 375 LOCAL"
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        maxRetries: 2
+    }
+    output {
+        File outfile = "~{outfilename}"
+    }
+}
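The new sed task is generic, but its role in this PR (see sarscov2_illumina_full.wdl below) is to rewrite the reference name in amplicon BED files: MN908947.3 (GenBank) and NC_045512.2 (RefSeq) are two accessions for the same SARS-CoV-2 Wuhan-Hu-1 sequence, and the BED's chromosome column must match the naming used by the reference FASTA. Note that 'search' is passed to sed as a regex, so the unescaped dots in MN908947.3 match any character; harmless here, but worth knowing for other uses. A sketch of a call, with a hypothetical BED file name:

    call utils.sed as bed_rename {
        input:
            infile      = "ARTICv4.bed",   # hypothetical amplicon BED
            search      = "MN908947.3",
            replace     = "NC_045512.2",
            outfilename = "ARTICv4.renamed.bed"
    }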
 task fasta_to_ids {
     meta {
         description: "Return the headers only from a fasta file"
@@ -461,6 +487,31 @@ task tsv_stack {
     }
 }
+task cat_except_headers {
+    input {
+        Array[File]+ infiles
+        String       out_filename
+    }
+    command <<<
+        awk 'FNR>1 || NR==1' \
+            ~{sep=' ' infiles} \
+            > ~{out_filename}
+    >>>
+    output {
+        File out_tsv = out_filename
+    }
+    runtime {
+        memory: "1 GB"
+        cpu: 1
+        docker: "ubuntu"
+        disks: "local-disk 50 HDD"
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        maxRetries: 2
+    }
+}
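The awk one-liner keeps a line if it is past the first line of its own file (FNR>1) or is the very first line overall (NR==1); in other words, it concatenates the inputs while keeping only the first file's header. A toy run with invented data:

    printf 'name\tcount\na\t1\n' > x.tsv
    printf 'name\tcount\nb\t2\n' > y.tsv
    awk 'FNR>1 || NR==1' x.tsv y.tsv
    # name  count
    # a     1
    # b     2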
 task make_empty_file {
     input {
         String out_filename
6 changes: 5 additions & 1 deletion pipes/WDL/workflows/assemble_refbased.wdl
@@ -122,7 +122,9 @@ workflow assemble_refbased {
     call reports.alignment_metrics {
         input:
             aligned_bam = aligned_trimmed_bam,
-            ref_fasta = reference_fasta
+            ref_fasta = reference_fasta,
+            primers_bed = trim_coords_bed,
+            min_coverage = min_coverage
     }

     call assembly.run_discordance {
@@ -221,6 +223,8 @@
         File picard_metrics_wgs = alignment_metrics.wgs_metrics
         File picard_metrics_alignment = alignment_metrics.alignment_metrics
         File picard_metrics_insert_size = alignment_metrics.insert_size_metrics
+        File samtools_ampliconstats = alignment_metrics.amplicon_stats
+        File samtools_ampliconstats_parsed = alignment_metrics.amplicon_stats_parsed

         Array[File] align_to_self_merged_aligned_and_unaligned_bam = align_to_self.aligned_bam

22 changes: 17 additions & 5 deletions pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -50,6 +50,7 @@ workflow sarscov2_illumina_full {

         Int   min_genome_bases = 24000
         Int   max_vadr_alerts = 0
+        Int   ntc_max_unambig = 3000

         File? sample_rename_map

@@ -107,7 +108,13 @@
     # assemble genome
     if (ampseq) {
-        String trim_coords_bed = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed"
+        call utils.sed as bed_rename {
+            input:
+                infile      = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
+                outfilename = demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
+                search      = "MN908947.3",
+                replace     = "NC_045512.2"
+        }
     }

     call assemble_refbased.assemble_refbased {
         input:
@@ -116,7 +123,7 @@
             sample_name = name_reads.left,
             aligner = "minimap2",
             skip_mark_dupes = ampseq,
-            trim_coords_bed = trim_coords_bed,
+            trim_coords_bed = bed_rename.outfile,
             major_cutoff = 0.75,
             min_coverage = if ampseq then 50 else 3
     }
@@ -242,7 +249,7 @@ workflow sarscov2_illumina_full {
             seqid_list = write_lines(select_all(passing_assembly_ids)),
             demux_meta_by_sample_json = demux_deplete.meta_by_sample_json,
             assembly_meta_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
-            ntc_min_unambig = 15000
+            ntc_min_unambig = ntc_max_unambig
     }

     ### QC metrics
@@ -274,6 +281,11 @@
             id_col = 'sample_sanitized',
             out_basename = "picard_metrics_insertsize-~{flowcell_id}"
     }
+    call utils.cat_except_headers as samtools_ampliconstats_merge {
+        input:
+            infiles      = assemble_refbased.samtools_ampliconstats_parsed,
+            out_filename = "samtools_ampliconstats-~{flowcell_id}.txt"
+    }

     ### filter and concatenate final sets for delivery ("passing" and "submittable")
     call sarscov2.sc2_meta_final {
Expand Down Expand Up @@ -395,14 +407,14 @@ workflow sarscov2_illumina_full {
     if(defined(gcs_out_metrics)) {
         call terra.gcs_copy as gcs_metrics_dump {
             input:
-                infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
+                infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
                 gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/"
         }
     }
     if(defined(gcs_out_cdc)) {
         call terra.gcs_copy as gcs_cdc_dump {
             input:
-                infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta],
+                infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])],
                 gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/"
         }
         call terra.gcs_copy as gcs_cdc_dump_reads {

