Clean up various new lints #173

Merged
merged 10 commits on Jul 16, 2024
2 changes: 1 addition & 1 deletion data_structures/flag_filter.wdl
@@ -115,7 +115,7 @@ task validate_string_is_12bit_oct_dec_or_hex {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: "ghcr.io/stjudecloud/util:1.3.0"
maxRetries: 1
}
73 changes: 47 additions & 26 deletions data_structures/read_group.wdl
@@ -1,29 +1,45 @@
## Read groups are defined in the SAM spec
- ## ID: "Read group identifier. Each Read Group must have a unique ID. The value of ID is used in the RG tags of alignment records.",
- ## BC: "Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by the sequencing machine in the absence of errors. If there are several barcodes for the sample/library (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes separating them with hyphens (`-`).",
- ## CN: "Name of sequencing center producing the read.",
- ## DS: "Description.",
- ## DT: "Date the run was produced (ISO8601 date or date/time).",
- ## FO: "Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. Format: /\\*|[ACMGRSVTWYHKDBN]+/",
- ## KS: "The array of nucleotide bases that correspond to the key sequence of each read.",
- ## LB: "Library.",
- ## PG: "Programs used for processing the read group.",
- ## PI: "Predicted median insert size, rounded to the nearest integer.",
- ## PL: "Platform/technology used to produce the reads. Valid values: CAPILLARY, DNBSEQ (MGI/BGI), ELEMENT, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT (Oxford Nanopore), PACBIO (Pacific Biosciences), SINGULAR, SOLID, and ULTIMA. This field should be omitted when the technology is not in this list (though the PM field may still be present in this case) or is unknown.",
- ## PM: "Platform model. Free-form text providing further details of the platform/technology used.",
- ## PU: "Platform unit (e.g., flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.",
- ## SM: "Sample. Use pool name where a pool is being sequenced."
+ ## - ID: Read group identifier. Each Read Group must have a unique ID.
+ ##     The value of ID is used in the RG tags of alignment records.
+ ## - BC: Barcode sequence identifying the sample or library. This value is the
+ ##     expected barcode bases as read by the sequencing machine in the absence
+ ##     of errors. If there are several barcodes for the sample/library
+ ##     (e.g., one on each end of the template), the recommended implementation
+ ##     concatenates all the barcodes separating them with hyphens (`-`).
+ ## - CN: Name of sequencing center producing the read.
+ ## - DS: Description.
+ ## - DT: Date the run was produced (ISO8601 date or date/time).
+ ## - FO: Flow order. The array of nucleotide bases that correspond to the nucleotides
+ ##     used for each flow of each read. Multi-base flows are encoded in IUPAC format,
+ ##     and non-nucleotide flows by various other characters.
+ ##     Format: /\\*|[ACMGRSVTWYHKDBN]+/
+ ## - KS: The array of nucleotide bases that correspond to the key sequence of each read.
+ ## - LB: Library.
+ ## - PG: Programs used for processing the read group.
+ ## - PI: Predicted median insert size, rounded to the nearest integer.
+ ## - PL: Platform/technology used to produce the reads.
+ ##     Valid values: CAPILLARY, DNBSEQ (MGI/BGI), ELEMENT, HELICOS, ILLUMINA, IONTORRENT,
+ ##     LS454, ONT (Oxford Nanopore), PACBIO (Pacific Biosciences), SINGULAR, SOLID,
+ ##     and ULTIMA. This field should be omitted when the technology is not in this list
+ ##     (though the PM field may still be present in this case) or is unknown.
+ ## - PM: Platform model. Free-form text providing further details of the
+ ##     platform/technology used.
+ ## - PU: Platform unit (e.g., flowcell-barcode.lane for Illumina or slide
+ ##     for SOLiD). Unique identifier.
+ ## - SM: Sample. Use pool name where a pool is being sequenced.
##
## An example input JSON entry for `read_group` might look like this:
## ```
## {
## "read_group": {
## "ID": "rg1",
## "PI": 150,
## "PL": "ILLUMINA",
## "SM": "Sample",
## "LB": "Sample"
## }
## }
## ```
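The serialization that the `read_group_to_string` task in this file performs can be sketched in Python. This is an illustrative analogue, not the repository's code: the function name is ours, and we assume optional fields are emitted in the order the field list above uses.

```python
# Hypothetical sketch: serialize a read-group dict (keyed as in the JSON
# example above) into a space-delimited "TAG:value" string, mirroring what
# the read_group_to_string WDL task echoes field by field.
def read_group_to_string(rg: dict) -> str:
    if "ID" not in rg:
        # ID is the only required field; all others are optional.
        raise ValueError("ID is the only required read-group field")
    # ID first, then the optional fields in the order listed above.
    order = ["ID", "BC", "CN", "DS", "DT", "FO", "KS",
             "LB", "PG", "PI", "PL", "PM", "PU", "SM"]
    return " ".join(f"{tag}:{rg[tag]}" for tag in order if tag in rg)

print(read_group_to_string(
    {"ID": "rg1", "PI": 150, "PL": "ILLUMINA", "SM": "Sample", "LB": "Sample"}
))  # ID:rg1 LB:Sample PI:150 PL:ILLUMINA SM:Sample
```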

version 1.1

@@ -64,7 +80,6 @@ task read_group_to_string {

command <<<
{
- # TODO: I think this can be simplified by dropping the `if defined` checks?
echo -n "~{"ID:~{read_group.ID}"}" # required field. All others optional
echo -n "~{if defined(read_group.BC) then " BC:~{read_group.BC}" else ""}"
echo -n "~{if defined(read_group.CN) then " CN:~{read_group.CN}" else ""}"
@@ -88,7 +103,7 @@ task read_group_to_string {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: "ghcr.io/stjudecloud/util:1.4.0"
maxRetries: 1
}
@@ -115,6 +130,7 @@ task get_read_groups {
Float bam_size = size(bam, "GiB")
Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb

+ #@ except: LineWidth
command <<<
set -euo pipefail
BAM="~{bam}" OUTFILE="read_groups.json" python - <<END
@@ -139,7 +155,7 @@ task get_read_groups {

runtime {
memory: "4 GB"
- disk: "~{disk_size_gb} GB"
+ disks: "~{disk_size_gb} GB"
container: "quay.io/biocontainers/pysam:0.22.0--py38h15b938a_1"
maxRetries: 1
}
@@ -171,7 +187,10 @@ task validate_read_group {
String id_pattern = "id"
String sample_pattern = "sample.?"
String restrictive_pattern = "\\ " # Disallow spaces
- Array[String] platforms = ["CAPILLARY", "DNBSEQ", "ELEMENT", "HELICOS", "ILLUMINA", "IONTORRENT", "LS454", "ONT", "PACBIO", "SINGULAR", "SOLID", "ULTIMA"]
+ Array[String] platforms = [
+     "CAPILLARY", "DNBSEQ", "ELEMENT", "HELICOS", "ILLUMINA", "IONTORRENT", "LS454",
+     "ONT", "PACBIO", "SINGULAR", "SOLID", "ULTIMA"
+ ]

command <<<
error=0
@@ -180,7 +199,8 @@ task validate_read_group {
if [[ ~{read_group.ID} =~ ^~{id_pattern}$ ]] \
|| [[ ~{read_group.ID} =~ ~{restrictive_pattern} ]]
then
- >&2 echo "ID (~{read_group.ID}) must not match pattern ~{id_pattern} nor ~{restrictive_pattern}"
+ >&2 echo "ID (~{read_group.ID}) must not match patterns:"
+ >&2 echo "'~{id_pattern}' or '~{restrictive_pattern}'"
error=1
fi
fi
@@ -204,7 +224,8 @@ task validate_read_group {
if [[ "~{read_group.SM}" =~ ^~{sample_pattern}$ ]] \
|| [[ "~{read_group.SM}" =~ ~{restrictive_pattern} ]]
then
- >&2 echo "SM must not match pattern ~{sample_pattern} nor ~{restrictive_pattern}"
+ >&2 echo "SM must not match patterns:"
+ >&2 echo "'~{sample_pattern}' or '~{restrictive_pattern}'"
error=1
fi
fi
@@ -419,7 +440,7 @@ task validate_read_group {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: "ghcr.io/stjudecloud/util:1.4.0"
maxRetries: 0
}
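The bash checks in `validate_read_group` above can be restated in Python for clarity. This is an assumed re-implementation for illustration only; the function name is ours, and the default patterns mirror the task's `id_pattern` and `restrictive_pattern` inputs.

```python
# Illustrative analogue of validate_read_group's ID check: an ID is rejected
# if it fully matches the forbidden pattern ("id" by default) or contains the
# restrictive pattern (a literal space).
import re

def id_is_valid(rg_id: str, forbidden: str = "id", restrictive: str = r"\ ") -> bool:
    if re.fullmatch(forbidden, rg_id):  # bash: [[ $ID =~ ^pattern$ ]]
        return False
    if re.search(restrictive, rg_id):   # bash: [[ $ID =~ pattern ]]
        return False
    return True

print(id_is_valid("id"))    # False: exactly matches the forbidden pattern
print(id_is_valid("rg 1"))  # False: contains a space
print(id_is_valid("rg1"))   # True
```

The same shape applies to the SM check, with `sample.?` as the forbidden pattern.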
14 changes: 7 additions & 7 deletions template/task-templates.wdl
@@ -25,7 +25,7 @@ task static_disk_and_ram_task {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
@@ -61,7 +61,7 @@ task dynamic_disk_and_ram_task {

runtime {
memory: "~{memory_gb} GB"
- disk: "~{disk_size_gb} GB"
+ disks: "~{disk_size_gb} GB"
container: ""
maxRetries: 1
}
@@ -98,7 +98,7 @@ task use_all_cores_task {
runtime {
cpu: ncpu
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
@@ -140,7 +140,7 @@ task localize_files_task {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
@@ -169,7 +169,7 @@ task outfile_name_task {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
@@ -198,7 +198,7 @@ task prefix_task {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
@@ -243,7 +243,7 @@ task string_choices_task {

runtime {
memory: "4 GB"
- disk: "10 GB"
+ disks: "10 GB"
container: ""
maxRetries: 1
}
2 changes: 1 addition & 1 deletion tests/tools/test_util.yaml
@@ -50,7 +50,7 @@
contains:
- "AL954722.1"

- # TODO: This fails, but the underlying task will be updated.
+ # This fails, but the underlying task will be updated.
- name: qc_summary
tags:
- miniwdl
33 changes: 26 additions & 7 deletions tools/arriba.wdl
@@ -137,10 +137,15 @@ task arriba {
File? annotate_fusions
File? protein_domains
File? wgs_svs
- Array[String] interesting_contigs = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "AC_*", "NC_*"]
+ Array[String] interesting_contigs = [
+     "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
+     "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "AC_*", "NC_*"
+ ]
Array[String] viral_contigs = ["AC_*", "NC_*"]
Array[String] disable_filters = []
- String feature_name = "gene_name=gene_name|gene_id,gene_id=gene_id,transcript_id=transcript_id,feature_exon=exon,feature_CDS=CDS"
+ #@ except: LineWidth
+ String feature_name
+     = "gene_name=gene_name|gene_id,gene_id=gene_id,transcript_id=transcript_id,feature_exon=exon,feature_CDS=CDS"
String prefix = basename(bam, ".bam") + ".fusions"
String strandedness = "auto"
Boolean mark_duplicates = true
@@ -171,7 +176,10 @@
}

Int bam_size_gb = ceil(size(bam, "GiB"))
- Int disk_size_gb = bam_size_gb + ceil(size(gtf, "GiB")) + ceil(size(reference_fasta_gz, "GiB")) + modify_disk_size_gb
+ Int disk_size_gb = bam_size_gb
+     + ceil(size(gtf, "GiB"))
+     + ceil(size(reference_fasta_gz, "GiB"))
+     + modify_disk_size_gb
Int memory_gb = bam_size_gb + modify_memory_gb

command <<<
@@ -190,9 +198,17 @@
~{if defined(wgs_svs) then "-d " + wgs_svs else ""} \
-D ~{max_genomic_breakpoint_distance} \
-s ~{strandedness} \
- ~{if length(interesting_contigs) > 0 then "-i " + sep(",", interesting_contigs) else ""} \
+ ~{(
+     if length(interesting_contigs) > 0
+     then "-i " + sep(",", interesting_contigs)
+     else ""
+ )} \
~{if length(viral_contigs) > 0 then "-v " + sep(",", viral_contigs) else ""} \
- ~{if length(disable_filters) > 0 then "-f " + sep(",", disable_filters) else ""} \
+ ~{(
+     if length(disable_filters) > 0
+     then "-f " + sep(",", disable_filters)
+     else ""
+ )} \
-E ~{max_e_value} \
-S ~{min_supporting_reads} \
-m ~{max_mismappers} \
@@ -254,13 +270,16 @@ task arriba_tsv_to_vcf {
}

Int input_size_gb = ceil(size(fusions, "GiB"))
- Int disk_size_gb = ceil(input_size_gb) + (ceil(size(reference_fasta, "GiB")) * 3) + modify_disk_size_gb
+ Int disk_size_gb = ceil(input_size_gb)
+     + (ceil(size(reference_fasta, "GiB")) * 3)
+     + modify_disk_size_gb

command <<<
set -euo pipefail

fasta_name=~{basename(reference_fasta, ".gz")}
- gunzip -c ~{reference_fasta} > "$fasta_name" || ln -sf ~{reference_fasta} "$fasta_name"
+ gunzip -c ~{reference_fasta} > "$fasta_name" \
+     || ln -sf ~{reference_fasta} "$fasta_name"

convert_fusions_to_vcf.sh \
$fasta_name \
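The `gunzip -c … || ln -sf …` line in `arriba_tsv_to_vcf` above is a decompress-or-fall-back idiom: try to decompress the reference, and if it was not actually gzipped, use the original file as-is. A Python analogue (assumed, for illustration only; the function name is ours) looks like this:

```python
# Sketch of the decompress-or-fall-back idiom: decompress src into dst if it
# is gzip-compressed, otherwise return src unchanged. Note that dst may be
# created (and left partially written) before the fallback triggers, just as
# the shell version may leave a partial "$fasta_name" behind.
import gzip
import shutil

def ensure_uncompressed(src: str, dst: str) -> str:
    try:
        with gzip.open(src, "rb") as fin, open(dst, "wb") as fout:
            shutil.copyfileobj(fin, fout)
        return dst
    except gzip.BadGzipFile:
        # src was not gzip-compressed: use the original path.
        return src
```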
11 changes: 5 additions & 6 deletions tools/bwa.wdl
@@ -2,8 +2,6 @@

version 1.1

- # TODO there are probably BWA params we can expose. Have not checked

task bwa_aln {
meta {
description: "Maps Single-End FASTQ files to BAM format using bwa aln"
@@ -13,7 +11,7 @@ task bwa_aln {
}

parameter_meta {
- fastq: "Input FASTQ file to align with bwa" # TODO verify can be gzipped or compressed
+ fastq: "Input FASTQ file to align with bwa"
bwa_db_tar_gz: "Gzipped tar archive of the bwa reference files. Files should be at the root of the archive."
prefix: "Prefix for the BAM file. The extension `.bam` will be added."
read_group: {
@@ -105,7 +103,7 @@ task bwa_aln_pe {
read_one_fastq_gz: {
description: "Input gzipped FASTQ read one file to align with bwa",
stream: false
- } # TODO verify can be gzipped or compressed
+ }
read_two_fastq_gz: {
description: "Input gzipped FASTQ read two file to align with bwa",
stream: false
@@ -203,7 +201,7 @@ task bwa_mem {
}

parameter_meta {
- read_one_fastq_gz: "Input gzipped FASTQ read one file to align with bwa" # TODO verify can be gzipped or compressed
+ read_one_fastq_gz: "Input gzipped FASTQ read one file to align with bwa"
bwa_db_tar_gz: "Gzipped tar archive of the bwa reference files. Files should be at the root of the archive."
read_two_fastq_gz: "Input gzipped FASTQ read two file to align with bwa"
prefix: "Prefix for the BAM file. The extension `.bam` will be added."
@@ -239,7 +237,7 @@ task bwa_mem {

String output_bam = prefix + ".bam"

- Float input_fastq_size = size(read_one_fastq_gz, "GiB") + size(read_two_fastq_gz, "GiB")
+ Float input_fastq_size = size(read_one_fastq_gz, "GiB")
+     + size(read_two_fastq_gz, "GiB")
Float reference_size = size(bwa_db_tar_gz, "GiB")
Int disk_size_gb = (
ceil((input_fastq_size + reference_size) * 2) + 10 + modify_disk_size_gb
1 change: 1 addition & 0 deletions tools/cellranger.wdl
@@ -2,6 +2,7 @@
##
## This WDL file wraps the [10x Genomics Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) tool.
## Cell Ranger is a tool for handling scRNA-Seq data.
+ #@ except: LineWidth

version 1.1

1 change: 0 additions & 1 deletion tools/deeptools.wdl
@@ -3,7 +3,6 @@
version 1.1

task bam_coverage {
- # TODO expose other params/formats
meta {
description: "Generates a BigWig coverage track using bamCoverage from DeepTools"
outputs: {
4 changes: 2 additions & 2 deletions tools/estimate.wdl
@@ -5,8 +5,8 @@ version 1.1
task run_estimate {
meta {
description: "**[DEPRECATED]** Given a gene expression file, run the ESTIMATE software package"
- outputs: {
-     estimate_file: "The results file of the ESTIMATE software package" # TODO actually run and see what format it is.
+ outputs: {
+     estimate_file: "The results file of the ESTIMATE software package"
}
deprecated: true
}
2 changes: 1 addition & 1 deletion tools/fastqc.wdl
@@ -56,7 +56,7 @@ task fastqc {
>>>

output {
- File raw_data = "~{prefix}/~{basename(bam, ".bam")}_fastqc.zip" # TODO verify this works if prefix differs
+ File raw_data = "~{prefix}/~{basename(bam, ".bam")}_fastqc.zip"
File results = out_tar_gz
}
