Merge pull request #48 from Ferlab-Ste-Justine/feat/CLIN-3411-only-publish-main-outputs

feat: CLIN-3411 only publish main outputs
Ferlab-Ste-Justine · Dec 11, 2024 · a04b33a · a04b33a
2 parents 8fea002 + 9fc82cc
commit a04b33a
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#45](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/45) Allow to add dbsnp ids to output vcf files
 - [#46](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/46) Allow to skip the exclude mnp step
 - [#47](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/47) Improve pipeline output documentation
+- [#48](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/48) Publish only main outputs by default
 
 ### `Known issues`
 - The nf-core modules that we are using have a potential performance flaw. Typically, the regex used to describe the output files also matches the input files (ex: "*.vcf"), which can cause unnecessary file transfers. This has already proven to cause issues on Fusion. One fix could be to copy the affected modules locally and apply the small change necessary to fix this.

diff --git a/conf/modules.config b/conf/modules.config
@@ -10,14 +10,22 @@
 ----------------------------------------------------------------------------------------
 */
 
-process {
 
-    publishDir = [
+def new_publish_dir(new_attributes=[:]) { 
+    def default_publish_dir = [
+        enabled: params.publish_all,
         path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
         mode: params.publish_dir_mode,
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
+    return default_publish_dir + new_attributes
+}
+
+process {
+
+    publishDir = new_publish_dir()
+
     withName: BCFTOOLS_FILTER {
         container = 'staphb/bcftools:1.20'
         ext.args = {'-e \'strlen(REF)>1 & strlen(REF)==strlen(ALT) & TYPE="snp"\' -Oz --write-index=tbi'}
@@ -50,15 +58,15 @@ process {
 
     withName: EXOMISER { 
         container = 'ferlabcrsj/exomiser:2.3.0'
-        publishDir = [
-            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
-            mode: params.publish_dir_mode,
-            pattern: "results/*{vcf.gz,vcf.gz.tbi,tsv,json,html}",
-        ]
+        publishDir = new_publish_dir([
+            enabled: true,
+            pattern: 'results/*{vcf.gz,vcf.gz.tbi,tsv,json,html}'
+        ])
     }
 
     withName: ENSEMBLVEP_VEP {
         container = 'ensemblorg/ensembl-vep:release_111.0' //sticking to v111 for now, but we should update this
+        publishDir = new_publish_dir([enabled: true])
         def args_list =  [
             "--offline",
             "--format vcf",
@@ -80,10 +88,13 @@ process {
 
     // To publish the vep index file in the same output folder as the vep output
     withName: 'vep_tabix' {
-         publishDir = [
-            path: { "${params.outdir}/ensemblvep" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
+        publishDir = new_publish_dir([
+            enabled: true,
+            path: { "${params.outdir}/ensemblvep" }
+        ])
+    }
+
+    withName: 'splitMultiAllelics' {
+        publishDir = new_publish_dir([enabled: true])
     }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -29,6 +29,7 @@ params {
     // Input and output
     input  = "data-test/testSampleSheet.csv"
     outdir = "results"
+    publish_all = true
 
     // Reference data
     referenceGenome = "data-test/reference/Homo_sapiens_assembly38/chr22"

diff --git a/docs/output.md b/docs/output.md
@@ -132,4 +132,7 @@ For more details about the content of each of these files, you can have a look a
 
 ## Other Steps
 
-You might see other folders named after different pipeline processes. These are considered intermediate pipeline outputs.
+If needed, you can set the parameter `publish_all` to `true`, and the output from all pipeline steps will be published. 
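+Each subdirectory is named after the Nextflow process that produced it: the lowercase first part of the process name, up to the first underscore (e.g. `BCFTOOLS_FILTER` is published under `bcftools`).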
+
+We don't recommend using this in production. This is primarily useful for testing, debugging or troubleshooting.
diff --git a/nextflow.config b/nextflow.config
@@ -62,6 +62,7 @@ params {
 
     // Boilerplate options
     outdir                       = null
+    publish_all                  = false
     publish_dir_mode             = 'copy'
     monochrome_logs              = false
     help                         = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -196,6 +196,11 @@
           "fa_icon": "fas fa-question-circle",
           "hidden": true
         },
+        "publish_all": {
+          "type": "boolean",
+          "description": "Publish results from all processes to the output directory.",
+          "help_text": "If true (default false), publish results from all processes to the output directory. This is useful for debugging and testing, but can create a lot of output files."
+        },
         "publish_dir_mode": {
           "type": "string",
           "default": "copy",
@@ -222,7 +227,7 @@
           "type": "boolean",
           "description": "Boolean whether to validate parameters against the schema at runtime",
           "default": true,
-          "fa_icon": "fas fa-check-square",
+
           "hidden": true
         },
         "validationShowHiddenParams": {