Merge pull request #64 from uclahs-cds/nwiltsie-process-afterscript

Add method to always capture process log files
uclahs-cds · May 21, 2024 · bc3f67e · bc3f67e
2 parents e703579 + 1671182
commit bc3f67e
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Functional testing framework with concrete tests for `methods.set_env()`
 - Functional test for `bam_parser.parse_bam_header()`.
 - Dump parameters with `json_extractor.store_params_json()`
+- Function to save process logs, even after failure
 
 ### Fixed
 - Fixed retry for potentially undefined variable `proc_name_keys`. #57

diff --git a/config/methods/README.md b/config/methods/README.md
@@ -151,6 +151,75 @@ methods {
 }
 ```
 
+### Capture process outputs, even after failures
+`methods.setup_process_afterscript` is a drop-in replacement for this method of capturing process log files:
+
+```Nextflow
+// Old technique
+process xxx {
+    publishDir path: "${params.log_output_dir}/process-log",
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "${task.process.replace(':', '/')}-${sample_id}/log${file(it).getName()}" }
+
+    output:
+    path(".command.*")
+```
+
+The above method does not produce any output files if the process fails. In order to always capture the log files, use the following:
+
+```Nextflow
+includeConfig "/path/to/common_methods.config"
+...
+methods {
+    ...
+    methods.setup_process_afterscript()
+}
+```
+
+> [!NOTE]
+> `methods.setup_process_afterscript()` duplicates the effect of defining `publishDir` and `output: path(".command.*")` for every process. Although there is no harm in using both techniques simultaneously, for clarity it is recommended to only use one.
+
+The output path is controlled by the following two [custom process directives](https://www.nextflow.io/docs/latest/process.html#ext):
+
+| Name | Default Value |
+| --- | --- |
+| `process.ext.log_dir` | `task.process.replace(':', '/')` |
+| `process.ext.log_dir_suffix` | `''` |
+
+Each of the above can be overridden by individual processes.
+
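+The final directory path is `${params.log_output_dir}/process-log/${task.ext.log_dir}${task.ext.log_dir_suffix}`. If that path is relative, it is resolved against the directory from which the pipeline was launched (`launchDir`).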
+
+#### Disabling for individual processes
+
+```Nextflow
+process xxx {
+
+    ext capture_logs: false
+```
+
+#### Combining with `afterScript`
+
+> [!IMPORTANT]
+> This method sets `process.afterScript`. If you define another `afterScript` (either globally or for a specific process), you must use this technique to combine both scripts.
+
+The `afterScript` closure established by `methods.setup_process_afterscript()` is also stored as `process.ext.commonAfterScript`. If a process requires another `afterScript`, it should be combined with the one defined here like so:
+
+```Nextflow
+process xxx {
+    afterScript {
+        [
+            "echo 'Before the common script'",
+            task.ext.commonAfterScript ?: "",
+            "echo 'After the common script'"
+        ].join("\n")
+    }
+```
+
+Due to the [Elvis operator](https://groovy-lang.org/operators.html#_elvis_operator), the above snippet is safe to use even if `methods.setup_process_afterscript()` is not used.
+
+
 ## References
 1. `nf-core` - https://nf-co.re/
 2. `nf-core modules` - https://github.com/nf-core/sarek/blob/ad2b34f39fead34d7a09051e67506229e827e892/conf/modules.config
diff --git a/config/methods/common_methods.config b/config/methods/common_methods.config
@@ -391,6 +391,71 @@ methods {
         process.containerOptions  = {-> (task.containsKey('cpus')) ? "--cpu-shares ${default_cpu_shares} --cpus ${task.cpus}" : "--cpu-shares ${default_cpu_shares}"}
     }
 
+    /**
+     * Configure all processes to save their command files in the output
+     * directory.
+     *
+     * This adds a custom process directive (`process.ext.commonAfterScript`)
+     * that, when used as the afterScript, will copy all of the process's
+     * .command.* files into the output directory. It is also installed as
+     * the default `process.afterScript`.
+     *
+     * Processes can customize the output directory by setting
+     * `process.ext.log_dir` and `process.ext.log_dir_suffix`. Both may be
+     * closures. Setting `process.ext.capture_logs` to a false value disables
+     * the copy for that process.
+     *
+     * Inspired by https://github.com/nextflow-io/nextflow/issues/1166#issuecomment-502467562
+     */
+    setup_process_afterscript = {
+        // Default log sub-directory: the process name with workflow
+        // separators (':') turned into path separators.
+        process.ext.log_dir = {
+            "${task.process.replace(':', '/')}"
+        }
+
+        // Log capture is enabled unless a process overrides this.
+        process.ext.capture_logs = true
+
+        process.ext.commonAfterScript = {
+            // An empty script is a no-op, so opting out stays safe even when
+            // this closure is joined with other afterScript fragments.
+            if (!task.ext.capture_logs) {
+                return ""
+            }
+
+            // Declared with `def` so that the path is local to this closure
+            // invocation; an undeclared variable is not scoped to the closure
+            // and could be shared between tasks.
+            def process_log_dir = [
+                "${params.log_output_dir}",
+                "process-log",
+                "${task.ext.log_dir}${task.ext.log_dir_suffix ?: ''}"
+            ].join("/")
+
+            // Handle relative paths: anchor them at the launch directory
+            // rather than leaving them relative in the generated script.
+            if (!process_log_dir.startsWith("/")) {
+                process_log_dir = "${launchDir}/${process_log_dir}"
+            }
+
+            // The `[ -e ... ] || continue` guard skips the literal,
+            // unexpanded pattern when no .command.* file exists.
+            return """\
+                readonly LOG_DIR="${process_log_dir}"
+                mkdir -p "\${LOG_DIR}"
+                for filename in .command.*; do
+                    [ -e "\${filename}" ] || continue
+                    cp "\${filename}" "\${LOG_DIR}/log\${filename}"
+                done
+                """.stripIndent()
+        }
+
+        /*
+        Set the default afterScript. If individual processes override
+        afterScript, they can restore this functionality like so (this is safe
+        to include even if setup_process_afterscript is not called):
+
+        afterScript {
+            [
+                "echo 'Before the common'",
+                task.ext.commonAfterScript ?: "",
+                "echo 'After the common'"
+            ].join("\n")
+        }
+        */
+
+        process.afterScript = process.ext.commonAfterScript
+    }
+
+
     /**
     *   Resolve the absolute path of a file relative to the current config file.
     */