From e43ba2d04ba737d88bea7a193ee52320e5acff50 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 10 Jun 2022 23:48:50 -0400 Subject: [PATCH 01/15] add samtools ampliconstats reporting --- pipes/WDL/tasks/tasks_reports.wdl | 16 ++++++++---- pipes/WDL/tasks/tasks_utils.wdl | 26 +++++++++++++++++++ pipes/WDL/workflows/assemble_refbased.wdl | 1 + .../WDL/workflows/sarscov2_illumina_full.wdl | 11 +++++--- 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index f8871b9aa..d12bfd62c 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -6,6 +6,7 @@ task alignment_metrics { File ref_fasta File? primers_bed + Int? machine_mem_gb String docker = "quay.io/broadinstitute/viral-core:2.1.33" } @@ -54,12 +55,16 @@ task alignment_metrics { echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt - # actually don't know how to do CollectTargetedPcrMetrics yet + touch "~{out_basename}".ampliconstats.txt if [ -n "~{primers_bed}" ]; then - picard $XMX BedToIntervalList \ - -I "~{primers_bed}" \ - -O primers.interval.list \ - -SD reference.dict + ## actually don't know how to do CollectTargetedPcrMetrics yet + #picard $XMX BedToIntervalList \ + # -I "~{primers_bed}" \ + # -O primers.interval.list \ + # -SD reference.dict + + # samtools ampliconstats + samtools ampliconstats -s -@ $(nproc) -o "~{out_basename}".ampliconstats.txt "~{primers_bed}" "~{aligned_bam}" fi >>> @@ -67,6 +72,7 @@ task alignment_metrics { File wgs_metrics = "~{out_basename}.raw_wgs_metrics.txt" File alignment_metrics = "~{out_basename}.alignment_metrics.txt" File insert_size_metrics = "~{out_basename}.insert_size_metrics.txt" + File amplicon_stats = "~{out_basename}.ampliconstats.txt" } runtime { diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 881157855..77af8500c 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -116,6 +116,32 @@ task zcat { } } +task sed { + meta { + description: "Replace all occurrences of 'search' with 'replace' using sed." + } + input { + File infile + String search + String replace + String outfilename = "~{infile}-rename.txt" + } + command { + sed 's/~{search}/~{replace}/g' "~{infile}" > "~{outfilename}" + } + runtime { + docker: "ubuntu" + memory: "1 GB" + cpu: 1 + disks: "local-disk 375 LOCAL" + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 2 + } + output { + File outfile = "~{outfilename}" + } +} + task fasta_to_ids { meta { description: "Return the headers only from a fasta file" diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 40ef09fc5..567086ad5 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -221,6 +221,7 @@ workflow assemble_refbased { File picard_metrics_wgs = alignment_metrics.wgs_metrics File picard_metrics_alignment = alignment_metrics.alignment_metrics File picard_metrics_insert_size = alignment_metrics.insert_size_metrics + File samtools_ampliconstats = alignment_metrics.amplicon_stats Array[File] align_to_self_merged_aligned_and_unaligned_bam = align_to_self.aligned_bam diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index 39fd19cd7..97f07fce2 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -107,7 +107,12 @@ workflow sarscov2_illumina_full { # assemble genome if (ampseq) { - String trim_coords_bed = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed" + call utils.sed as bed_rename { + input: + infile = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed", + search = "MN908947.3", + replace = "NC_045512.​2" + } } call assemble_refbased.assemble_refbased { input: @@ -116,7 +121,7 @@ workflow sarscov2_illumina_full { sample_name = name_reads.left, aligner = "minimap2", skip_mark_dupes = ampseq, - trim_coords_bed = trim_coords_bed, + trim_coords_bed = bed_rename.outfile, major_cutoff = 0.75, min_coverage = if ampseq then 50 else 3 } @@ -395,7 +400,7 @@ workflow sarscov2_illumina_full { if(defined(gcs_out_metrics)) { call terra.gcs_copy as gcs_metrics_dump { input: - infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]), + infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics, assemble_refbased.samtools_ampliconstats]), gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/" } } From ee8d4d173d65ea907d03332f0463b42e677d0444 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 21 Jun 2022 08:42:13 -0400 Subject: [PATCH 02/15] parameterize ntc rejection cutoff, reduce default from 15000 to 3000 --- pipes/WDL/workflows/sarscov2_illumina_full.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index 636ee9d2f..3ffed5221 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -50,6 +50,7 @@ workflow sarscov2_illumina_full { Int min_genome_bases = 24000 Int max_vadr_alerts = 0 + Int ntc_max_unambig = 3000 File? sample_rename_map @@ -242,7 +243,7 @@ workflow sarscov2_illumina_full { seqid_list = write_lines(select_all(passing_assembly_ids)), demux_meta_by_sample_json = demux_deplete.meta_by_sample_json, assembly_meta_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv, - ntc_min_unambig = 15000 + ntc_min_unambig = ntc_max_unambig } ### QC metrics From 133dd7a0dcd132b81a5fa8c8b63b463a25ed238b Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 28 Jun 2022 10:12:16 -0400 Subject: [PATCH 03/15] WDL fixes for non-optional inputs --- pipes/WDL/tasks/tasks_assembly.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 696df4bf1..6b2f75e74 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -407,9 +407,9 @@ task refine_assembly_with_aligned_reads { File reads_aligned_bam String sample_name - Boolean? mark_duplicates = false - Float? major_cutoff = 0.5 - Int? min_coverage = 3 + Boolean mark_duplicates = false + Float major_cutoff = 0.5 + Int min_coverage = 3 Int? machine_mem_gb String docker = "quay.io/broadinstitute/viral-assemble:2.1.16.1" From a65ac59b661dc7932b25e4d96b59e3e3d80af2dd Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 28 Jun 2022 11:06:37 -0400 Subject: [PATCH 04/15] start to prep ampliconstats output for GP --- pipes/WDL/tasks/tasks_reports.wdl | 31 ++++++++++++++++------- pipes/WDL/workflows/assemble_refbased.wdl | 4 ++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index d12bfd62c..90753177f 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -5,7 +5,8 @@ task alignment_metrics { File aligned_bam File ref_fasta File? primers_bed - + String? amplicon_set + Int? min_coverage Int? machine_mem_gb String docker = "quay.io/broadinstitute/viral-core:2.1.33" @@ -55,16 +56,27 @@ task alignment_metrics { echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt - touch "~{out_basename}".ampliconstats.txt + touch "~{out_basename}".ampliconstats.txt "~{out_basename}".ampliconstats_parsed.txt + echo -e "sample_sanitized\tbam\tamplicon_set\tamplicon_idx\tamplicon_left\tamplicon_right\tFREADS\tFDEPTH\tFPCOV\tFAMP" > "~{out_basename}.ampliconstats_parsed.txt" if [ -n "~{primers_bed}" ]; then - ## actually don't know how to do CollectTargetedPcrMetrics yet - #picard $XMX BedToIntervalList \ - # -I "~{primers_bed}" \ - # -O primers.interval.list \ - # -SD reference.dict - # samtools ampliconstats - samtools ampliconstats -s -@ $(nproc) -o "~{out_basename}".ampliconstats.txt "~{primers_bed}" "~{aligned_bam}" + samtools ampliconstats -s -@ $(nproc) \ + ~{'-d ' + min_coverage} \ + -o "~{out_basename}".ampliconstats.txt "~{primers_bed}" "~{aligned_bam}" + + # parse into our own tsv to facilitate tsv joining later + if [ -n "~{default='' amplicon_set}" ]; then + AMPLICON_SET="~{default='' amplicon_set}" + else + AMPLICON_SET=$(basename "~{primers_bed}" .bed) + fi + echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET" > prepend.txt + grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON + grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS + grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH + grep ^FPCOV "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV; echo "" >> FPCOV + grep ^FAMP "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP + paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP >> "~{out_basename}.ampliconstats_parsed.txt" fi >>> @@ -73,6 +85,7 @@ task alignment_metrics { File alignment_metrics = "~{out_basename}.alignment_metrics.txt" File insert_size_metrics = "~{out_basename}.insert_size_metrics.txt" File amplicon_stats = "~{out_basename}.ampliconstats.txt" + File amplicon_stats_parsed = "~{out_basename}.ampliconstats_parsed.txt" } runtime { diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 567086ad5..b1f1587a7 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -122,7 +122,9 @@ workflow assemble_refbased { call reports.alignment_metrics { input: aligned_bam = aligned_trimmed_bam, - ref_fasta = reference_fasta + ref_fasta = reference_fasta, + primers_bed = trim_coords_bed, + min_coverage = min_coverage } call assembly.run_discordance { From 5e8550d12bcde98ef95acdec41aeb82730237419 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 28 Jun 2022 11:11:15 -0400 Subject: [PATCH 05/15] add merged parsed samtools ampliconstats to gp bucket --- pipes/WDL/workflows/assemble_refbased.wdl | 1 + pipes/WDL/workflows/sarscov2_illumina_full.wdl | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index b1f1587a7..00ee8a445 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -224,6 +224,7 @@ workflow assemble_refbased { File picard_metrics_alignment = alignment_metrics.alignment_metrics File picard_metrics_insert_size = alignment_metrics.insert_size_metrics File samtools_ampliconstats = alignment_metrics.amplicon_stats + File samtools_ampliconstats_parsed = alignment_metrics.amplicon_stats_parsed Array[File] align_to_self_merged_aligned_and_unaligned_bam = align_to_self.aligned_bam diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index 391789979..98b8b158f 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -280,6 +280,12 @@ workflow sarscov2_illumina_full { id_col = 'sample_sanitized', out_basename = "picard_metrics_insertsize-~{flowcell_id}" } + call utils.tsv_join as samtools_ampliconstats_merge { + input: + input_tsvs = assemble_refbased.samtools_ampliconstats_parsed, + id_col = 'sample_sanitized', + out_basename = "samtools_ampliconstats-~{flowcell_id}" + } ### filter and concatenate final sets for delivery ("passing" and "submittable") call sarscov2.sc2_meta_final { @@ -401,7 +407,7 @@ workflow sarscov2_illumina_full { if(defined(gcs_out_metrics)) { call terra.gcs_copy as gcs_metrics_dump { input: - infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics, assemble_refbased.samtools_ampliconstats]), + infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]), gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/" } } From 50fe9d8d0db567a71b3e5093dc0dcd07afe57cd2 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 28 Jun 2022 13:01:50 -0400 Subject: [PATCH 06/15] add more files to cdc delivery --- pipes/WDL/workflows/sarscov2_illumina_full.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index 98b8b158f..dbec7ecab 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -414,7 +414,7 @@ workflow sarscov2_illumina_full { if(defined(gcs_out_cdc)) { call terra.gcs_copy as gcs_cdc_dump { input: - infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta], + infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])], gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/" } call terra.gcs_copy as gcs_cdc_dump_reads { From c96349a4f871c6f0d5edb0a9826987819841147e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 29 Jun 2022 08:45:12 -0400 Subject: [PATCH 07/15] update gisaid cli uploader from v1 to v3. fix some wdl draft2 -> 1.0 inputs --- pipes/WDL/tasks/tasks_sarscov2.wdl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_sarscov2.wdl b/pipes/WDL/tasks/tasks_sarscov2.wdl index a0c916c60..e76e61aef 100644 --- a/pipes/WDL/tasks/tasks_sarscov2.wdl +++ b/pipes/WDL/tasks/tasks_sarscov2.wdl @@ -184,8 +184,8 @@ task sequencing_report { File assembly_stats_tsv File? collab_ids_tsv - String? sequencing_lab = "Broad Institute" - String? intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020." + String sequencing_lab = "Broad Institute" + String intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020." String? max_date String? min_date Int? min_unambig @@ -240,7 +240,7 @@ task sc2_meta_final { String? max_date String? min_date - Int? min_unambig=24000 + Int min_unambig=24000 Boolean drop_file_cols=false File? filter_to_ids @@ -547,15 +547,19 @@ task gisaid_uploader { File gisaid_sequences_fasta File gisaid_meta_csv File cli_auth_token + String database="EpiCoV" + String frameshift="catch_novel" } command { set -e - cp "~{cli_auth_token}" gisaid_uploader.authtoken - gisaid_uploader CoV upload \ + cli3 upload \ + --database "~{database}" \ + --token "~{cli_auth_token}" \ --fasta "~{gisaid_sequences_fasta}" \ - --csv "~{gisaid_meta_csv}" \ - --failedout failed_metadata.csv \ - | tee logs.txt + --metadata "~{gisaid_meta_csv}" \ + --failed failed_metadata.csv \ + --frameshift "~{frameshift}" \ + --log logs.txt # the following grep statement will exit 1 if anything failed grep "submissions failed: 0" logs.txt > /dev/null } @@ -563,7 +567,7 @@ task gisaid_uploader { File failed_metadata = "failed_metadata.csv" } runtime { - docker: "quay.io/broadinstitute/gisaid-cli:1.0" + docker: "quay.io/broadinstitute/gisaid-cli:3.0" memory: "2 GB" cpu: 2 disks: "local-disk 100 HDD" From 11fbf76f6749cb47222fe503a6176c1e4ec90e06 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 29 Jun 2022 10:34:31 -0400 Subject: [PATCH 08/15] typo fix --- pipes/WDL/workflows/sarscov2_illumina_full.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index dbec7ecab..414057276 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -112,7 +112,7 @@ workflow sarscov2_illumina_full { input: infile = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed", search = "MN908947.3", - replace = "NC_045512.​2" + replace = "NC_045512.2" } } call assemble_refbased.assemble_refbased { From 5a30e0ed9d38d3f29aea8c2b9ecf072f0d1116e1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 29 Jun 2022 12:09:34 -0400 Subject: [PATCH 09/15] oops! --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 77af8500c..1299ccfec 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -124,7 +124,7 @@ task sed { File infile String search String replace - String outfilename = "~{infile}-rename.txt" + String outfilename = "~{basename(infile)}-rename.txt" } command { sed 's/~{search}/~{replace}/g' "~{infile}" > "~{outfilename}" From 499a13a989dbdf845812326fd16b4afa47ba27ef Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 30 Jun 2022 00:31:17 -0400 Subject: [PATCH 10/15] tsv_join -> tsv_stack, adjust renamed bed filename --- pipes/WDL/workflows/sarscov2_illumina_full.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index 414057276..af29f36fe 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -111,6 +111,7 @@ workflow sarscov2_illumina_full { call utils.sed as bed_rename { input: infile = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed", + outfilename = demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed", search = "MN908947.3", replace = "NC_045512.2" } @@ -280,10 +281,9 @@ workflow sarscov2_illumina_full { id_col = 'sample_sanitized', out_basename = "picard_metrics_insertsize-~{flowcell_id}" } - call utils.tsv_join as samtools_ampliconstats_merge { + call utils.tsv_stack as samtools_ampliconstats_merge { input: input_tsvs = assemble_refbased.samtools_ampliconstats_parsed, - id_col = 'sample_sanitized', out_basename = "samtools_ampliconstats-~{flowcell_id}" } From f4116d3e688e6bf5bab565411199db8c8f6d211e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 30 Jun 2022 08:11:23 -0400 Subject: [PATCH 11/15] implement a new merger --- pipes/WDL/tasks/tasks_utils.wdl | 25 +++++++++++++++++++ .../WDL/workflows/sarscov2_illumina_full.wdl | 6 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 1299ccfec..9c1414ff5 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -487,6 +487,31 @@ task tsv_stack { } } +task cat_except_headers { + input { + Array[File]+ infiles + String out_filename + } + + command { + awk 'FNR>1 || NR==1' \ + ${sep=' ' infiles} \ + > ${out_filename} + } + + output { + File out_tsv = "${out_basename}" + } + + runtime { + memory: "1 GB" + cpu: 1 + docker: "ubuntu" + disks: "local-disk 50 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 2 + } +} task make_empty_file { input { String out_filename diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index af29f36fe..b687d38c4 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -281,10 +281,10 @@ workflow sarscov2_illumina_full { id_col = 'sample_sanitized', out_basename = "picard_metrics_insertsize-~{flowcell_id}" } - call utils.tsv_stack as samtools_ampliconstats_merge { + call utils.cat_except_headers as samtools_ampliconstats_merge { input: - input_tsvs = assemble_refbased.samtools_ampliconstats_parsed, - out_basename = "samtools_ampliconstats-~{flowcell_id}" + infiles = assemble_refbased.samtools_ampliconstats_parsed, + out_filename = "samtools_ampliconstats-~{flowcell_id}.txt" } ### filter and concatenate final sets for delivery ("passing" and "submittable") From a5c201ab72dfd73fcd779075411d6028855b9335 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 30 Jun 2022 08:24:04 -0400 Subject: [PATCH 12/15] fixes --- pipes/WDL/tasks/tasks_utils.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 9c1414ff5..38889debc 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -493,14 +493,14 @@ task cat_except_headers { String out_filename } - command { + command <<< awk 'FNR>1 || NR==1' \ - ${sep=' ' infiles} \ - > ${out_filename} - } + ~{sep=' ' infiles} \ + > ~{out_filename} + >>> output { - File out_tsv = "${out_basename}" + File out_tsv = out_filename } runtime { From 27fed58274c0db005622e04d057f374cfd4f98dd Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 30 Jun 2022 11:49:14 -0400 Subject: [PATCH 13/15] cleanups to output report --- pipes/WDL/tasks/tasks_reports.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 90753177f..cfe29a92d 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -70,13 +70,14 @@ task alignment_metrics { else AMPLICON_SET=$(basename "~{primers_bed}" .bed) fi - echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET" > prepend.txt grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH grep ^FPCOV "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV; echo "" >> FPCOV grep ^FAMP "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP - paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP >> "~{out_basename}.ampliconstats_parsed.txt" + for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt + wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP + paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep . >> "~{out_basename}.ampliconstats_parsed.txt" fi >>> From 75ff8ce2acf6d1c060198f9d2fa7d8a31b89fa24 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 5 Jul 2022 16:22:19 -0400 Subject: [PATCH 14/15] drop empty lines --- pipes/WDL/tasks/tasks_reports.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index cfe29a92d..43ee24a90 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -77,7 +77,7 @@ task alignment_metrics { grep ^FAMP "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP - paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep . >> "~{out_basename}.ampliconstats_parsed.txt" + paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep '\S' >> "~{out_basename}.ampliconstats_parsed.txt" fi >>> From cde92231bbcadf19541b78c99d09a6d80cb355c9 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 11 Jul 2022 10:15:16 -0400 Subject: [PATCH 15/15] add authors and addresses to sc2_meta_final --- pipes/WDL/tasks/tasks_sarscov2.wdl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pipes/WDL/tasks/tasks_sarscov2.wdl b/pipes/WDL/tasks/tasks_sarscov2.wdl index 8ff2ad908..55d93a531 100644 --- a/pipes/WDL/tasks/tasks_sarscov2.wdl +++ b/pipes/WDL/tasks/tasks_sarscov2.wdl @@ -243,6 +243,9 @@ task sc2_meta_final { Int min_unambig=24000 Boolean drop_file_cols=false + String address_map = '{}' + String authors_map = '{}' + File? filter_to_ids String docker = "quay.io/broadinstitute/py3-bio:0.1.2" @@ -274,6 +277,8 @@ task sc2_meta_final { genome_status = json.load(inf) else: genome_status = {} + address_map = json.loads('~{address_map}') + authors_map = json.loads('~{authors_map}') # read input files df_assemblies = pd.read_csv(assemblies_tsv, sep='\t').dropna(how='all') @@ -352,6 +357,10 @@ task sc2_meta_final { # join column: collaborator_id df_assemblies = df_assemblies.merge(collab_ids, on='sample', how='left', validate='one_to_one') + # derived columns: authors, orig_lab_addr + df_assemblies.loc[:,'authors'] = list(authors_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by']) + df_assemblies.loc[:,'orig_lab_addr'] = list(address_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by']) + # write final output df_assemblies.to_csv("~{out_basename}.final.tsv", sep='\t', index=False) CODE