stjudecloud · a-frantz · Feb 19, 2025 · Feb 18, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/README.md b/README.md
@@ -31,6 +31,35 @@ The repository is laid out as follows:
 * `bin/` - **no longer in use** Scripts used by Cromwell configuration settings. Add this to `$PATH` prior to using configurations in `conf` with Cromwell.
 * `conf/` - **no longer in use** Cromwell configuration files created for various environments that we use across our team. Feel free to use/fork/suggest improvements.
 
+## Expected FASTQ file name conventions
+
+Tasks and workflows in this repository that take one or more FASTQ files as input also have a `prefix` input, which determines the filenames of all output files. The `prefix` input can be specified manually, or it can be left at its default value. The default is computed by stripping common file suffixes from one of the input FASTQs to arrive at a basename shared by all output files. That evaluation is most commonly performed using the POSIX Extended Regular Expression (ERE) `(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\.(fastq|fq)(\.gz)?$`. In plain English, this regex will, at a minimum, find and remove the file extension `.fastq` or `.fq` (matched case-sensitively), with or without a trailing `.gz` GZIP extension. Additionally, if the FASTQ filename contains a "read number" signifier (`R1`/`R2`/`r1`/`r2`/`Read1`/`Read2`/`read1`/`read2`, immediately preceded by `_` or `.`) somewhere before the FASTQ extension, the signifier and _everything_ after it will be truncated off the basename. If there is important information encoded in your filenames _between the read number and the final extension_, we recommend you manually specify an appropriate `prefix` value.
+
+### Examples
+
+Every filename in the following list will have the evaluated `prefix` "`sample`" if no override value is provided.
+
+```
+sample_R1_100000.fastq.gz
+sample_R2.fq
+sample.fq.gz
+sample.r1_100000.trimmed.fastq.gz
+sample.R2_100000.trimmed-kebab.fastq.gz
+sample_r1.100000.trimmed-kebab.terriblename.fastq.gz
+sample.Read2_100000.trimmed-kebab.fastq
+sample_read1.100000.trimmed-kebab.fq
+```
+
+A FASTQ with the filename `sample.100000-kebab.foobar.fastq.gz` would have a default `prefix` value of "`sample.100000-kebab.foobar`".
+
+The following filenames will not result in _any_ trimming of the filename, and should likely be either renamed or have a manually specified `prefix`:
+
+```
+sample_R_one.FASTQ.gz
+sampleR1.Fq
+sample_read_two.fq.zip
+```
+
 ## Bootstrap guide
 
 This repository implements workflows using the Workflow Description Language (WDL). If unfamiliar with WDL, a short overview is available in the [WDL spec](https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#introduction).

diff --git a/tools/CHANGELOG.md b/tools/CHANGELOG.md
@@ -6,9 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 ## 2025 February
 
+### Added
+
+- Added a default `prefix` calculation for `star.alignment` [#220](https://github.com/stjudecloud/workflows/pull/220).
+
 ### Changed
 
 - `ngsderive.encoding` removed the `String inferred_encoding` output [#216](https://github.com/stjudecloud/workflows/pull/216).
+- Improved the REGEX used to calculate a prefix for FASTQ input files in various tools [#220](https://github.com/stjudecloud/workflows/pull/220).
 
 ## 2025 January
 

diff --git a/tools/bwa.wdl b/tools/bwa.wdl
@@ -13,7 +13,11 @@ task bwa_aln {
     parameter_meta {
         fastq: "Input FASTQ file to align with bwa"
         bwa_db_tar_gz: "Gzipped tar archive of the bwa reference files. Files should be at the root of the archive."
-        prefix: "Prefix for the BAM file. The extension `.bam` will be added."
+        prefix: {
+            description: "Prefix for the BAM file. The extension `.bam` will be added.",
+            help: "See `../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         read_group: {
             description: "Read group information for BWA to insert into the header. BWA format: '@RG\tID:foo\tSM:bar'",
             group: "common",
@@ -34,8 +38,8 @@ task bwa_aln {
         File bwa_db_tar_gz
         String prefix = sub(
             basename(fastq),
-            "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$",
-            ""
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
         )
         String read_group = ""
         Boolean use_all_cores = false

diff --git a/tools/fq.wdl b/tools/fq.wdl
@@ -110,7 +110,11 @@ task subsample {
     parameter_meta {
         read_one_fastq: "Input FASTQ with read one. Can be gzipped or uncompressed."
         read_two_fastq: "Input FASTQ with read two. Can be gzipped or uncompressed."
-        prefix: "Prefix for the output FASTQ file(s). The extension `.R1.subsampled.fastq.gz` and `.R2.subsampled.fastq.gz` will be added."
+        prefix: {
+            description: "Prefix for the output FASTQ file(s). The extension `.R1.subsampled.fastq.gz` and `.R2.subsampled.fastq.gz` will be added.",
+            help: "See `../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         probability: {
             description: "The probability a record is kept, as a decimal (0.0, 1.0). Cannot be used with `record-count`. Any `probability<=0.0` or `probability>=1.0` to disable.",
             group: "common",
@@ -127,8 +131,8 @@ task subsample {
         File? read_two_fastq
         String prefix = sub(
             basename(read_one_fastq),
-            "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$",
-            ""
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
         )
         Float probability = 1.0
         Int record_count = -1

diff --git a/tools/kraken2.wdl b/tools/kraken2.wdl
@@ -308,7 +308,11 @@ task kraken {
         read_one_fastq_gz: "Gzipped FASTQ file with 1st reads in pair"
         read_two_fastq_gz: "Gzipped FASTQ file with 2nd reads in pair"
         db: "Kraken2 database. Can be generated with `qc-reference.wdl`. Must be a tarball without a root directory."
-        prefix: "Prefix for the Kraken2 output files. The extensions `.kraken2.txt` and `.kraken2.sequences.txt.gz` will be added."
+        prefix: {
+            description: "Prefix for the Kraken2 output files. The extensions `.kraken2.txt` and `.kraken2.sequences.txt.gz` will be added.",
+            help: "See `../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         store_sequences: {
             description: "Store and output main Kraken2 output in addition to the summary report?",
             group: "common",
@@ -334,8 +338,8 @@ task kraken {
         File db
         String prefix = sub(
             basename(read_one_fastq_gz),
-            "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$",
-            ""
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
         )
         Boolean store_sequences = false
         Boolean use_names = true

diff --git a/tools/librarian.wdl b/tools/librarian.wdl
@@ -14,16 +14,20 @@ task librarian {
 
     parameter_meta {
         read_one_fastq: "Read one FASTQ of a Paired-End sample to analyze. May be uncompressed or gzipped."
-        prefix: "Name of the output tar archive. The extension `.tar.gz` will be added."
+        prefix: {
+            description: "Name of the output tar archive. The extension `.tar.gz` will be added.",
+            help: "See `../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
     }
 
     input {
         File read_one_fastq
         String prefix = sub(
             basename(read_one_fastq),
-            "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$",
-            ""
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
         ) + ".librarian"
         Int modify_disk_size_gb = 0
     }

diff --git a/tools/star.wdl b/tools/star.wdl
@@ -147,7 +147,11 @@ task alignment {
     parameter_meta {
         read_one_fastqs_gz: "An array of gzipped FASTQ files containing read one information"
         star_db_tar_gz: "A gzipped TAR file containing the STAR reference files. The name of the root directory which was archived must match the archive's filename without the `.tar.gz` extension."
-        prefix: "Prefix for the BAM and other STAR files. The extensions `.Aligned.out.bam`, `.Log.final.out`, `.SJ.out.tab`, and `.Chimeric.out.junction` will be added."
+        prefix: {
+            description: "Prefix for the BAM and other STAR files. The extensions `.Aligned.out.bam`, `.Log.final.out`, `.SJ.out.tab`, and `.Chimeric.out.junction` will be added.",
+            help: "See `../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         read_groups: {
             description: "An array of `String`s where each `String` corresponds to one read group.",
             help: "Each read group string should start with the `ID` field followed by any other read group fields, where fields are delimited by a space. See `../data_structures/read_group.wdl` for information about possible fields and utility tasks for constructing, validating, and \"stringifying\" read groups.",
@@ -480,7 +484,6 @@ task alignment {
     input {
         File star_db_tar_gz
         Array[File] read_one_fastqs_gz
-        String prefix
         Array[File] read_two_fastqs_gz = []
         Array[String] read_groups = []
         Array[Int] out_sj_filter_intron_max_vs_read_n = [50000, 100000, 200000]
@@ -520,6 +523,11 @@ task alignment {
         Pair[Int, Int] clip_3p_n_bases = (0, 0)
         Pair[Int, Int] clip_3p_after_adapter_n_bases = (0, 0)
         Pair[Int, Int] clip_5p_n_bases = (0, 0)
+        String prefix = sub(
+            basename(read_one_fastqs_gz[0]),
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
+        )
         String read_name_separator = "/"
         String clip_adapter_type = "Hamming"
         String out_sam_strand_field = "intronMotif"

diff --git a/tools/util.wdl b/tools/util.wdl
@@ -516,7 +516,10 @@ task split_fastq {
             stream: true,
         }
         reads_per_file: "Number of reads to include in each output FASTQ file"
-        prefix: "Prefix for the FASTQ files. The extension `.fastq.gz` (preceded by a split index) will be added."
+        prefix: {
+            description: "Prefix for the FASTQ files. The extension `.fastq.gz` (preceded by a split index) will be added.",
+            group: "common",
+        }
         modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
         ncpu: "Number of cores to allocate for task"
     }

diff --git a/workflows/dnaseq/CHANGELOG.md b/workflows/dnaseq/CHANGELOG.md
@@ -5,3 +5,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 ## Unreleased
+
+## 2025 February
+
+### Added
+
+- Added a default `prefix` calculation for `dnaseq-standard-fastq` [#220](https://github.com/stjudecloud/workflows/pull/220).
diff --git a/workflows/dnaseq/dnaseq-standard-fastq.wdl b/workflows/dnaseq/dnaseq-standard-fastq.wdl
@@ -27,7 +27,11 @@ workflow dnaseq_standard_fastq_experimental {
             description: "An Array of structs defining read groups to include in the harmonized BAM. Must correspond to input FASTQs. Each read group ID must be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. This requirement means the length of `read_groups` must equal the length of `read_one_fastqs_gz` and the length of `read_two_fastqs_gz` if non-zero. Only the `ID` field is required, and it must be unique for each read group defined. See data_structures/read_group.wdl for help formatting your input JSON.",
             external_help: "https://samtools.github.io/hts-specs/SAMv1.pdf",
         }
-        prefix: "Prefix for the BAM file. The extension `.bam` will be added."
+        prefix: {
+            description: "Prefix for the BAM file. The extension `.bam` will be added.",
+            help: "See `../../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         aligner: {
             description: "BWA aligner to use",
             choices: [
@@ -45,7 +49,11 @@ workflow dnaseq_standard_fastq_experimental {
         Array[File] read_one_fastqs_gz
         Array[File] read_two_fastqs_gz
         Array[ReadGroup] read_groups
-        String prefix
+        String prefix = sub(
+            basename(read_one_fastqs_gz[0]),
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
+        )
         String aligner = "mem"
         Boolean validate_input = true
         Boolean use_all_cores = false

diff --git a/workflows/rnaseq/CHANGELOG.md b/workflows/rnaseq/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## 2025 February
+
+### Added
+
+- Added a default `prefix` calculation for `rnaseq-standard-fastq` and `rnaseq-core` [#220](https://github.com/stjudecloud/workflows/pull/220).
+
 ## 2025 January
 
 ### Changed

diff --git a/workflows/rnaseq/rnaseq-core.wdl b/workflows/rnaseq/rnaseq-core.wdl
@@ -33,7 +33,11 @@ workflow rnaseq_core {
             help: "Each read group string should start with the `ID` field followed by any other read group fields, where fields are delimited by a space. See `../data_structures/read_group.wdl` for information about possible fields and utility tasks for constructing, validating, and \"stringifying\" read groups.",
             warning: "The `ID` field for each read group _must_ be contained in the basename of a FASTQ file or pair of FASTQ files if Paired-End. Example: `[\"ID:rg1 PU:flowcell1.lane1 SM:sample1 PL:illumina LB:sample1_lib1\", \"ID:rg2 PU:flowcell1.lane2 SM:sample1 PL:illumina LB:sample1_lib1\"]`. These two read groups could be associated with the following four FASTQs: `[\"sample1.rg1.R1.fastq\", \"sample1.rg2.R1.fastq\"]` and `[\"sample1.rg1.R2.fastq\", \"sample1.rg2.R2.fastq\"]`",
         }
-        prefix: "Prefix for output files"
+        prefix: {
+            description: "Prefix for output files",
+            help: "See `../../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         contaminant_db: "A compressed reference database corresponding to the aligner chosen with `xenocp_aligner` for the contaminant genome"
         align_sj_stitch_mismatch_n_max: {
             description: "This overrides the STAR alignment default. Maximum number of mismatches for stitching of the splice junctions (-1: no limit) for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif",
@@ -123,14 +127,18 @@ workflow rnaseq_core {
         Array[File] read_one_fastqs_gz
         Array[File] read_two_fastqs_gz
         Array[String] read_groups
-        String prefix
         File? contaminant_db
         SpliceJunctionMotifs align_sj_stitch_mismatch_n_max = SpliceJunctionMotifs {
             noncanonical_motifs: 5,
             GT_AG_and_CT_AC_motif: -1,
             GC_AG_and_CT_GC_motif: 5,
             AT_AC_and_GT_AT_motif: 5,
         }
+        String prefix = sub(
+            basename(read_one_fastqs_gz[0]),
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
+        )
         String xenocp_aligner = "star"
         String strandedness = ""
         Boolean mark_duplicates = false

diff --git a/workflows/rnaseq/rnaseq-standard-fastq.wdl b/workflows/rnaseq/rnaseq-standard-fastq.wdl
@@ -65,7 +65,11 @@ workflow rnaseq_standard_fastq {
                 SM: "Sample. Use pool name where a pool is being sequenced.",
             },
         }
-        prefix: "Prefix for output files"
+        prefix: {
+            description: "Prefix for output files",
+            help: "See `../../README.md` for more information on the default prefix evaluation.",
+            group: "common",
+        }
         contaminant_db: "A compressed reference database corresponding to the aligner chosen with `xenocp_aligner` for the contaminant genome"
         xenocp_aligner: {
             description: "Aligner to use to map reads to the host genome for detecting contamination",
@@ -97,8 +101,12 @@ workflow rnaseq_standard_fastq {
         Array[File] read_one_fastqs_gz
         Array[File] read_two_fastqs_gz
         Array[ReadGroup] read_groups
-        String prefix
         File? contaminant_db
+        String prefix = sub(
+            basename(read_one_fastqs_gz[0]),
+            "(([_.][rR](?:ead)?[12])((?:[_.-][^_.-]*?)*?))?\\.(fastq|fq)(\\.gz)?$",
+            ""  # Once replacing with capturing groups is supported, replace with group 3
+        )
         String xenocp_aligner = "star"
         String strandedness = ""
         Boolean mark_duplicates = false