From c7eebfbe8ff1d35a93ca2f2bc7c9a9441bb1aa82 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Thu, 7 Mar 2024 15:28:22 -0800 Subject: [PATCH 01/13] add schema.yaml --- config/schema.yaml | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 config/schema.yaml diff --git a/config/schema.yaml b/config/schema.yaml new file mode 100644 index 0000000..0b22ed6 --- /dev/null +++ b/config/schema.yaml @@ -0,0 +1,133 @@ +--- +patient_id: + type: 'String' + required: true + help: 'Patient ID' +dataset_id: + type: 'String' + required: true + help: 'Dataset ID' +aligner: + type: 'AlignerTool' + required: true + help: 'Aligner used to align input BAMs. Provided as -' +output_dir: + type: 'Path' + mode: 'w' + required: true + help: 'Absolute path to output directory' +save_intermediate_files: + type: 'Bool' + required: true + default: false + help: 'Whether to save intermediate files' +is_emit_original_quals: + type: 'Bool' + required: true + default: true + help: 'Whether to emit original quality scores after recalibration' +is_DOC_run: + type: 'Bool' + required: true + default: false + help: 'Whether to run the DepthOfCoverage process, which is very time-consuming for large BAMs' +intervals: + type: 'String' + allow_empty: true + required: true + help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS' +scatter_count: + type: 'Integer' + required: true + default: 50 + help: 'How many intervals to divide the genome into for parallelization' +split_intervals_extra_args: + type: 'String' + allow_empty: true + required: false + help: 'Extra arguments for interval splitting' +gatk_ir_compression: + type: 'Integer' + choices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + default: 1 + required: false +reference_fasta: + type: 'Path' + mode: 'r' + required: true + help: 'Absolute path to reference genome fasta' +bundle_mills_and_1000g_gold_standard_indels_vcf_gz: + type: 'Path' + mode: 'r' + required: 
true + help: 'Absolute path to Mills and 1000g gold standard INDELs VCF' +bundle_known_indels_vcf_gz: + type: 'Path' + mode: 'r' + required: true + help: 'Absolute path to known INDELs VCF' +bundle_v0_dbsnp138_vcf_gz: + type: 'Path' + mode: 'r' + required: true + help: 'Absolute path to v0 dbSNP 138 VCF' +bundle_contest_hapmap_3p3_vcf_gz: + type: 'Path' + mode: 'r' + required: true + help: 'Absolute path to ConEst HapMap 3p3 VCF' +metapipeline_delete_input_bams: + type: 'Bool' + required: true + default: false + help: 'Whether to delete the input BAMs' +metapipeline_final_output_dir: + type: 'String' + required: false + help: 'Directory containing final outputs to check before input deletion' +metapipeline_states_to_delete: + type: 'List' + required: true + help: 'List of states for which to delete input BAMs' + default: + - 'normal' + - 'tumor' + choice: + - 'normal' + - 'tumor' +base_resource_update: + type: 'ResourceUpdateNamespace' + required: false + help: 'User-defined modifications for adjusting base resource allocations for processes' + elements: + memory: + type: 'ResourceUpdateList' + required: false + help: 'List of memory updates' + cpus: + type: 'ResourceUpdateList' + required: false + help: 'List of CPU updates' +input: + type: 'InputNamespace' + required: true + help: 'Input samples' + elements: + BAM: + type: 'InputBAMNamespace' + required: true + help: 'Input BAMs for calling' + elements: + normal: + type: 'BAMEntryList' + required: false + help: 'Input normal BAMs' + tumor: + type: 'BAMEntryList' + required: false + help: 'Input tumor BAMs' + recalibration_table: + type: 'RecalibrationTableList' + required: false + allow_empty: false + help: 'List of any available recalibration tables' \ No newline at end of file From d3293e11609168ca5de2ac11b17e72b2235e86f7 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 16:51:54 -0800 Subject: [PATCH 02/13] update schema with correct params --- config/schema.yaml | 114 
+++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/config/schema.yaml b/config/schema.yaml index 0b22ed6..d0c9d90 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -1,16 +1,12 @@ --- -patient_id: +sample_id: type: 'String' required: true - help: 'Patient ID' + help: 'Sample ID' dataset_id: type: 'String' required: true help: 'Dataset ID' -aligner: - type: 'AlignerTool' - required: true - help: 'Aligner used to align input BAMs. Provided as -' output_dir: type: 'Path' mode: 'w' @@ -21,17 +17,47 @@ save_intermediate_files: required: true default: false help: 'Whether to save intermediate files' -is_emit_original_quals: +save_interval_list: + type: 'Bool' + required: true + default: false + help: 'Whether to save a copy of the interval list generated for picard CollectHsMetrics' +collect_metrics: + type: 'Bool' + required: true + default: true + help: 'Whether to run picard CollectHsMetrics' +off_target_depth: + type: 'Bool' + required: true + default: true + help: 'Whether to calculate depth at off-target dbSNP loci' +output_enriched_target_file: type: 'Bool' required: true default: true - help: 'Whether to emit original quality scores after recalibration' -is_DOC_run: + help: 'Whether or not to output a new target file containing high-coverage off-target dbSNP loci' +target_depth: type: 'Bool' required: true default: false - help: 'Whether to run the DepthOfCoverage process, which is very time-consuming for large BAMs' -intervals: + help: 'Whether or not to output a new target file containing high-coverage off-target dbSNP loci' +target_bed: + type: 'String' + allow_empty: true + required: true + help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS' +bait_bed: + type: 'String' + allow_empty: true + required: true + help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS' +target_interval_list: + type: 'String' + 
allow_empty: true + required: true + help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS' +bait_interval_list: type: 'String' allow_empty: true required: true @@ -41,73 +67,31 @@ scatter_count: required: true default: 50 help: 'How many intervals to divide the genome into for parallelization' -split_intervals_extra_args: +picard_CollectHsMetrics_extra_args: type: 'String' allow_empty: true - required: false - help: 'Extra arguments for interval splitting' -gatk_ir_compression: - type: 'Integer' - choices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - default: 1 - required: false -reference_fasta: - type: 'Path' - mode: 'r' required: true - help: 'Absolute path to reference genome fasta' -bundle_mills_and_1000g_gold_standard_indels_vcf_gz: - type: 'Path' - mode: 'r' + help: 'Extra arguments for CollectHsMetrics' +samtools_depth_extra_args: + type: 'String' + allow_empty: true required: true - help: 'Absolute path to Mills and 1000g gold standard INDELs VCF' -bundle_known_indels_vcf_gz: + help: 'Extra arguments for samtools depth' +reference_dict: type: 'Path' mode: 'r' required: true - help: 'Absolute path to known INDELs VCF' -bundle_v0_dbsnp138_vcf_gz: + help: 'Absolute path to reference genome dictionary' +reference_dbSNP:: type: 'Path' mode: 'r' required: true - help: 'Absolute path to v0 dbSNP 138 VCF' -bundle_contest_hapmap_3p3_vcf_gz: + help: 'Absolute path to thinned dbSNP VCF' +genome_sizes: type: 'Path' mode: 'r' required: true - help: 'Absolute path to ConEst HapMap 3p3 VCF' -metapipeline_delete_input_bams: - type: 'Bool' - required: true - default: false - help: 'Whether to delete the input BAMs' -metapipeline_final_output_dir: - type: 'String' - required: false - help: 'Directory containing final outputs to check before input deletion' -metapipeline_states_to_delete: - type: 'List' - required: true - help: 'List of states for which to delete input BAMs' - default: - - 'normal' - - 'tumor' - choice: - - 'normal' - - 
'tumor' -base_resource_update: - type: 'ResourceUpdateNamespace' - required: false - help: 'User-defined modifications for adjusting base resource allocations for processes' - elements: - memory: - type: 'ResourceUpdateList' - required: false - help: 'List of memory updates' - cpus: - type: 'ResourceUpdateList' - required: false - help: 'List of CPU updates' + help: 'Absolute path to table of chromosome lengths, can be fasta index' input: type: 'InputNamespace' required: true From a268299026334147c3392bdb9907732090092a97 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 16:53:31 -0800 Subject: [PATCH 03/13] update schema inputs --- config/schema.yaml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/config/schema.yaml b/config/schema.yaml index d0c9d90..329a425 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -97,21 +97,7 @@ input: required: true help: 'Input samples' elements: - BAM: - type: 'InputBAMNamespace' + bam: + type: 'Path' required: true - help: 'Input BAMs for calling' - elements: - normal: - type: 'BAMEntryList' - required: false - help: 'Input normal BAMs' - tumor: - type: 'BAMEntryList' - required: false - help: 'Input tumor BAMs' - recalibration_table: - type: 'RecalibrationTableList' - required: false - allow_empty: false - help: 'List of any available recalibration tables' \ No newline at end of file + help: 'Input BAM for coverage analysis' From 2d0df79a73b01a2b5d99ceb89cc238a385f4c4fa Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 16:56:00 -0800 Subject: [PATCH 04/13] add custom schema config --- config/custom_schema_types.config | 172 ++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 config/custom_schema_types.config diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config new file mode 100644 index 0000000..24c4f88 --- /dev/null +++ b/config/custom_schema_types.config @@ -0,0 +1,172 @@ +/** +* This 
custom schema namespace implements a custom type for checking input BAMs for calculate-targeted-coverage +*/ +custom_schema_types { + allowed_input_types = [ + 'BAM', + 'recalibration_table' + ] + allowed_bam_types = [ + 'normal', + 'tumor' + ] + allowed_resource_types = [ + 'memory', + 'cpus' + ] + + /** + * Check that input types are in allowed list + */ + check_input_type_keys = { List given, String name, List choices=custom_schema_types.allowed_input_types -> + for (elem in given) { + if (!(elem in choices)) { + throw new Exception("Invalid paramter ${name}. Valid types: ${choices}.") + } + } + } + + /** + * Check if input is a String or GString + */ + is_string = { val -> + return (val in String || val in GString) + } + + /** + * Check if given input is a Namespace + */ + check_if_namespace = { val, String name -> + if (!(val in Map)) { + throw new Exception("${name} should be a Namespace, not ${val.getClass()}.") + } + } + + /** + * Check if given input is a list + */ + check_if_list = { val, String name -> + if (!(val in List || val in Set)) { + throw new Exception("${name} should be a List, not ${val.getClass()}.") + } + } + + /** + * Check if given input is a number + */ + check_if_number = { val, String name -> + if (!(val in Integer || val in Float)) { + throw new Exception("${name} should be an Integer or Float, not ${val.getClass()}") + } + } + + /** + * Check if given input is valid process list + */ + check_if_process_list = { val, String name -> + if (custom_schema_types.is_string(val)) { + if (val.isEmpty()) { + throw new Exception("Empty string specified for ${name}. Please provide valid input.") + } + } else { + try { + custom_schema_types.check_if_list(val, name) + } catch(Exception e) { + throw new Exception("${name} should be either a string or a list. 
Please provide valid input.") + } + } + } + + /** + * Check that input is namespace of expected types + */ + check_input_namespace = { Map options, String name, Map properties -> + // Check parameters keys + custom_schema_types.check_if_namespace(options[name], name) + def given_keys = options[name].keySet() as ArrayList + custom_schema_types.check_input_type_keys(given_keys, name) + + options[name].each { entry -> + def entry_as_map = [:] + entry_as_map[entry.key] = entry.value + schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) + } + } + + /** + * Check namespace BAM + */ + check_bam_namespace = { Map options, String name, Map properties -> + custom_schema_types.check_if_namespace(options[name], name) + def given_keys = options[name].keySet() as ArrayList + if (given_keys.size() <= 0) { + throw new Exception("No inputs provided! Please provide inputs in the CSV or YAML.") + } + custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_bam_types) + + options[name].each { entry -> + def entry_as_map = [:] + entry_as_map[entry.key] = entry.value + schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) + } + } + + /** + * Check namespace for resource updates + */ + check_resource_update_namespace = { Map options, String name, Map properties -> + custom_schema_types.check_if_namespace(options[name], name) + def given_keys = options[name].keySet() as ArrayList + if (given_keys.size() <= 0) { + return + } + custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_resource_types) + + options[name].each { entry -> + def entry_as_map = [:] + entry_as_map[entry.key] = entry.value + schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) + } + } + + /** + * Check if proper BAM entry list + */ + check_readable_file_list = { Map options, String name, Map properties -> + custom_schema_types.check_if_list(options[name], name) + for 
(item in options[name]) { + schema.check_path(item, 'r') + } + } + + /** + * Check list of resource updates + */ + check_resource_update_list = { Map options, String name, Map properties -> + custom_schema_types.check_if_list(options[name], name) + for (item in options[name]) { + custom_schema_types.check_if_process_list(item[0], name) + custom_schema_types.check_if_number(item[1], name) + } + } + + /** + * Check aligner and version + */ + check_aligner = { Map options, String name, Map properties -> + schema.primitive_check_type(options, name, 'String') + if (!(options[name] ==~ /(BWA-MEM2|HISAT2)-[\d\.]+/)) { + throw new Exception("Invalid value for parameter ${name}: ${options[name]}. Please check and make sure the proper aligner is provided.") + } + } + + types = [ + 'InputNamespace': custom_schema_types.check_input_namespace, + 'InputBAMNamespace': custom_schema_types.check_bam_namespace, + 'BAMEntryList': custom_schema_types.check_readable_file_list, + 'RecalibrationTableList': custom_schema_types.check_readable_file_list, + 'AlignerTool': custom_schema_types.check_aligner, + 'ResourceUpdateNamespace': custom_schema_types.check_resource_update_namespace, + 'ResourceUpdateList': custom_schema_types.check_resource_update_list + ] +} From 2c18eeb527ed520cf98e81d80f0b5d2b88b56e6c Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 16:59:37 -0800 Subject: [PATCH 05/13] update custom schema config to targeted-coverage --- config/custom_schema_types.config | 54 +------------------------------ 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config index 24c4f88..2e0df75 100644 --- a/config/custom_schema_types.config +++ b/config/custom_schema_types.config @@ -3,16 +3,7 @@ */ custom_schema_types { allowed_input_types = [ - 'BAM', - 'recalibration_table' - ] - allowed_bam_types = [ - 'normal', - 'tumor' - ] - allowed_resource_types = [ - 'memory', - 'cpus' + 'bam' ] /** @@ 
-111,24 +102,6 @@ custom_schema_types { } } - /** - * Check namespace for resource updates - */ - check_resource_update_namespace = { Map options, String name, Map properties -> - custom_schema_types.check_if_namespace(options[name], name) - def given_keys = options[name].keySet() as ArrayList - if (given_keys.size() <= 0) { - return - } - custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_resource_types) - - options[name].each { entry -> - def entry_as_map = [:] - entry_as_map[entry.key] = entry.value - schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) - } - } - /** * Check if proper BAM entry list */ @@ -139,34 +112,9 @@ custom_schema_types { } } - /** - * Check list of resource updates - */ - check_resource_update_list = { Map options, String name, Map properties -> - custom_schema_types.check_if_list(options[name], name) - for (item in options[name]) { - custom_schema_types.check_if_process_list(item[0], name) - custom_schema_types.check_if_number(item[1], name) - } - } - - /** - * Check aligner and version - */ - check_aligner = { Map options, String name, Map properties -> - schema.primitive_check_type(options, name, 'String') - if (!(options[name] ==~ /(BWA-MEM2|HISAT2)-[\d\.]+/)) { - throw new Exception("Invalid value for parameter ${name}: ${options[name]}. 
Please check and make sure the proper aligner is provided.") - } - } - types = [ 'InputNamespace': custom_schema_types.check_input_namespace, 'InputBAMNamespace': custom_schema_types.check_bam_namespace, 'BAMEntryList': custom_schema_types.check_readable_file_list, - 'RecalibrationTableList': custom_schema_types.check_readable_file_list, - 'AlignerTool': custom_schema_types.check_aligner, - 'ResourceUpdateNamespace': custom_schema_types.check_resource_update_namespace, - 'ResourceUpdateList': custom_schema_types.check_resource_update_list ] } From 7fa60fb2fcabaca4c5a38149a411777313658754 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 17:02:24 -0800 Subject: [PATCH 06/13] call schema validation --- config/methods.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/methods.config b/config/methods.config index f01cce4..5e96282 100644 --- a/config/methods.config +++ b/config/methods.config @@ -48,6 +48,9 @@ methods { methods.set_pipeline_logs() methods.set_env() + schema.load_custom_types("${projectDir}/config/custom_schema_types.config") + schema.validate() + methods.setup_docker_cpus() } } From 2b592f6b9bc18189af98787abeec7ca3c41b6813 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 17:04:27 -0800 Subject: [PATCH 07/13] update bam mode --- config/schema.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/schema.yaml b/config/schema.yaml index 329a425..62f3452 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -99,5 +99,6 @@ input: elements: bam: type: 'Path' + mode: 'r' required: true help: 'Input BAM for coverage analysis' From 68e6e6718287e98f8cc709eeff3893c188d441c4 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 17:08:02 -0800 Subject: [PATCH 08/13] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ad38af..52df869 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ This project adheres to 
[Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Add workflow to build and deploy documentation to GitHub Pages - Add workflow to run Nextflow configuration regression tests - Add NFTest infrastructure and test cases +- Add parameter validation schema ### Changed - Update CI/CD workflow to use current image From deda274ec8bfc30c96531e7d574aedca6a009c3a Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 18:14:14 -0800 Subject: [PATCH 09/13] schema fixes --- config/custom_schema_types.config | 111 ------------------------------ config/methods.config | 1 + 2 files changed, 1 insertion(+), 111 deletions(-) diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config index 2e0df75..da83753 100644 --- a/config/custom_schema_types.config +++ b/config/custom_schema_types.config @@ -6,115 +6,4 @@ custom_schema_types { 'bam' ] - /** - * Check that input types are in allowed list - */ - check_input_type_keys = { List given, String name, List choices=custom_schema_types.allowed_input_types -> - for (elem in given) { - if (!(elem in choices)) { - throw new Exception("Invalid paramter ${name}. 
Valid types: ${choices}.") - } - } - } - - /** - * Check if input is a String or GString - */ - is_string = { val -> - return (val in String || val in GString) - } - - /** - * Check if given input is a Namespace - */ - check_if_namespace = { val, String name -> - if (!(val in Map)) { - throw new Exception("${name} should be a Namespace, not ${val.getClass()}.") - } - } - - /** - * Check if given input is a list - */ - check_if_list = { val, String name -> - if (!(val in List || val in Set)) { - throw new Exception("${name} should be a List, not ${val.getClass()}.") - } - } - - /** - * Check if given input is a number - */ - check_if_number = { val, String name -> - if (!(val in Integer || val in Float)) { - throw new Exception("${name} should be an Integer or Float, not ${val.getClass()}") - } - } - - /** - * Check if given input is valid process list - */ - check_if_process_list = { val, String name -> - if (custom_schema_types.is_string(val)) { - if (val.isEmpty()) { - throw new Exception("Empty string specified for ${name}. Please provide valid input.") - } - } else { - try { - custom_schema_types.check_if_list(val, name) - } catch(Exception e) { - throw new Exception("${name} should be either a string or a list. 
Please provide valid input.") - } - } - } - - /** - * Check that input is namespace of expected types - */ - check_input_namespace = { Map options, String name, Map properties -> - // Check parameters keys - custom_schema_types.check_if_namespace(options[name], name) - def given_keys = options[name].keySet() as ArrayList - custom_schema_types.check_input_type_keys(given_keys, name) - - options[name].each { entry -> - def entry_as_map = [:] - entry_as_map[entry.key] = entry.value - schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) - } - } - - /** - * Check namespace BAM - */ - check_bam_namespace = { Map options, String name, Map properties -> - custom_schema_types.check_if_namespace(options[name], name) - def given_keys = options[name].keySet() as ArrayList - if (given_keys.size() <= 0) { - throw new Exception("No inputs provided! Please provide inputs in the CSV or YAML.") - } - custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_bam_types) - - options[name].each { entry -> - def entry_as_map = [:] - entry_as_map[entry.key] = entry.value - schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) - } - } - - /** - * Check if proper BAM entry list - */ - check_readable_file_list = { Map options, String name, Map properties -> - custom_schema_types.check_if_list(options[name], name) - for (item in options[name]) { - schema.check_path(item, 'r') - } - } - - types = [ - 'InputNamespace': custom_schema_types.check_input_namespace, - 'InputBAMNamespace': custom_schema_types.check_bam_namespace, - 'BAMEntryList': custom_schema_types.check_readable_file_list, - ] } diff --git a/config/methods.config b/config/methods.config index 5e96282..ee51e38 100644 --- a/config/methods.config +++ b/config/methods.config @@ -48,6 +48,7 @@ methods { methods.set_pipeline_logs() methods.set_env() + 
schema.load_schema_types("${projectDir}/external/pipeline-Nextflow-config/config/schema/custom_schema_types.config") schema.load_custom_types("${projectDir}/config/custom_schema_types.config") schema.validate() From 2190066361694cac359a71d4a4f661de479fcdb6 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 18:27:59 -0800 Subject: [PATCH 10/13] fix schema function --- config/methods.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/methods.config b/config/methods.config index ee51e38..a494176 100644 --- a/config/methods.config +++ b/config/methods.config @@ -48,7 +48,7 @@ methods { methods.set_pipeline_logs() methods.set_env() - schema.load_schema_types("${projectDir}/external/pipeline-Nextflow-config/config/schema/custom_schema_types.config") + schema.load_custom_types("${projectDir}/external/pipeline-Nextflow-config/config/schema/custom_schema_types.config") schema.load_custom_types("${projectDir}/config/custom_schema_types.config") schema.validate() From 6b86b7093919abb8a46e4c520f36e93680017186 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Fri, 8 Mar 2024 18:39:28 -0800 Subject: [PATCH 11/13] fix schema typo --- config/schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/schema.yaml b/config/schema.yaml index 62f3452..6676204 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -82,7 +82,7 @@ reference_dict: mode: 'r' required: true help: 'Absolute path to reference genome dictionary' -reference_dbSNP:: +reference_dbSNP: type: 'Path' mode: 'r' required: true From 4a79901d645cb7024083f5d096f250d777dfea6a Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Mon, 11 Mar 2024 14:00:27 -0700 Subject: [PATCH 12/13] remove extra scatter param --- config/schema.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/config/schema.yaml b/config/schema.yaml index 6676204..4982244 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -62,11 +62,6 @@ bait_interval_list: 
allow_empty: true required: true help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS' -scatter_count: - type: 'Integer' - required: true - default: 50 - help: 'How many intervals to divide the genome into for parallelization' picard_CollectHsMetrics_extra_args: type: 'String' allow_empty: true From 41599ed99f19272195e35367031d15ea04c7cbe7 Mon Sep 17 00:00:00 2001 From: Nicole Zeltser Date: Mon, 11 Mar 2024 14:00:59 -0700 Subject: [PATCH 13/13] update regression test with new schema params --- test/configtest-F16.json | 637 +++++++++++++++++++++++++++------------ 1 file changed, 440 insertions(+), 197 deletions(-) diff --git a/test/configtest-F16.json b/test/configtest-F16.json index 5bd74ee..6078e46 100644 --- a/test/configtest-F16.json +++ b/test/configtest-F16.json @@ -1,215 +1,458 @@ { - "nextflow_version": "23.10.0", - "config": [ - "test/nftest.config" - ], - "params_file": "test/single.yaml", - "cpus": 16, - "memory_gb": 31, - "nf_params": { - "output_dir": "/tmp/nf-config-test-outputs" - }, - "envvars": { - "SLURM_JOB_ID": "4674821" - }, - "mocks": { - "check_path": "" - }, - "dated_fields": [ - "params.log_output_dir", - "report.file", - "timeline.file", - "trace.file", - "params.date" - ], - "expected_result": { - "docker": { - "all_group_ids": "$(for i in `id --real --groups`; do echo -n \"--group-add=$i \"; done)", - "enabled": true, - "runOptions": "-u $(id -u):$(id -g) $(for i in `id --real --groups`; do echo -n \"--group-add=$i \"; done)", - "uid_and_gid": "-u $(id -u):$(id -g)" + "nextflow_version": "23.10.0", + "config": [ + "test/nftest.config" + ], + "params_file": "test/single.yaml", + "cpus": 16, + "memory_gb": 31, + "nf_params": { + "output_dir": "/tmp/nf-config-test-outputs" }, - "manifest": { - "author": "Nicole Zeltser", - "description": "Nextflow pipeline for calculating read-depth related statistics for targeted sequencing experiments", - "name": "calculate-targeted-coverage", - "version": 
"1.0.0-rc.2" + "envvars": { + "SLURM_JOB_ID": "4674821" }, - "params": { - "bait_bed": "", - "bait_interval_list": "", - "bedr_version": "1.1.0", - "bedtools_version": "2.29.2", - "blcds_registered_dataset": false, - "collect_metrics": true, - "coverage_cap": "3000", - "dataset_id": "TWGSAMIN000001-T002-S02-F", - "dataset_registry_prefix": "/hot/data", - "date": "19970704T165655Z", - "dbSNP_slop": "150", - "docker_container_registry": "ghcr.io/uclahs-cds", - "docker_image_bedops": "ghcr.io/uclahs-cds/bedr:1.1.0", - "docker_image_bedtools": "ghcr.io/uclahs-cds/bedtools:2.29.2", - "docker_image_picard": "ghcr.io/uclahs-cds/picard:3.0.0", - "docker_image_samtools": "ghcr.io/uclahs-cds/samtools:1.16.1", - "docker_image_validate": "ghcr.io/uclahs-cds/pipeval:4.0.0-rc.2", - "gatk_command_mem_diff": "2 GB", - "genome_sizes": "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta.fai", - "input": { - "bam": "/hot/resource/SMC-HET/tumours/A-mini/bams/n1/output/S2.T-n1.bam" - }, - "log_output_dir": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z", - "max_cpus": "16", - "max_memory": "31 GB", - "merge_operation": "collapse", - "min_base_quality": "20", - "min_cpus": "1", - "min_mapping_quality": "20", - "min_memory": "1 MB", - "min_read_depth": "30", - "near_distance": "250", - "off_target_depth": true, - "off_target_slop": "500", - "output_dir": "/tmp/nf-config-test-outputs", - "output_dir_base": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F", - "output_enriched_target_file": true, - "patient": "null", - "picard_CollectHsMetrics_extra_args": "", - "picard_version": "3.0.0", - "pipeval_version": "4.0.0-rc.2", - "reference_dbSNP": "/hot/ref/database/dbSNP-155/thinned/GRCh38/dbSNP-155_thinned_hg38.vcf.gz", - "reference_dict": "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.dict", - "sample": 
"null", - "sample_id": "TWGSAMIN000001-T002-S02-F", - "samtools_depth_extra_args": "", - "samtools_version": "1.16.1", - "save_all_dbSNP": false, - "save_intermediate_files": false, - "save_interval_list": false, - "save_raw_target_bed": false, - "target_bed": "/hot/software/pipeline/pipeline-calculate-targeted-coverage/Nextflow/development/input/GRch38-small.bed", - "target_depth": true, - "target_interval_list": "", - "ucla_cds": true, - "work_dir": "/scratch/4674821", - "workflow_output_dir": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/SAMtools-1.16.1" + "mocks": { + "check_path": "" }, - "process": { - "cache": true, - "containerOptions": { - "1": "--cpu-shares 1024 --cpus $task.cpus", - "2": "--cpu-shares 1024 --cpus $task.cpus", - "3": "--cpu-shares 1024 --cpus $task.cpus", - "closure": "--cpu-shares 1024 --cpus $task.cpus" - }, - "cpus": { - "1": "1", - "2": "2", - "3": "3", - "closure": "closure()" - }, - "echo": true, - "errorStrategy": { - "1": "finish", - "2": "finish", - "3": "finish", - "closure": "finish" - }, - "executor": "local", - "maxRetries": "1", - "memory": "31 GB", - "withLabel:process_high": { - "cpus": { - "1": "12", - "2": "16", - "3": "16", - "closure": "closure()" + "dated_fields": [ + "params.log_output_dir", + "report.file", + "timeline.file", + "trace.file", + "params.date" + ], + "expected_result": { + "docker": { + "all_group_ids": "$(for i in `id --real --groups`; do echo -n \"--group-add=$i \"; done)", + "enabled": true, + "runOptions": "-u $(id -u):$(id -g) $(for i in `id --real --groups`; do echo -n \"--group-add=$i \"; done)", + "uid_and_gid": "-u $(id -u):$(id -g)" }, - "memory": { - "1": "31 GB", - "2": "31 GB", - "3": "31 GB", - "closure": "closure()" - } - }, - "withLabel:process_low": { - "cpus": { - "1": "2", - "2": "4", - "3": "6", - "closure": "closure()" + "manifest": { + "author": "Nicole Zeltser", + "description": "Nextflow pipeline for calculating read-depth related 
statistics for targeted sequencing experiments", + "name": "calculate-targeted-coverage", + "version": "1.0.0-rc.2" }, - "memory": { - "1": "3 GB", - "2": "6 GB", - "3": "9 GB", - "closure": "closure()" + "params": { + "bait_bed": "", + "bait_interval_list": "", + "bedr_version": "1.1.0", + "bedtools_version": "2.29.2", + "blcds_registered_dataset": false, + "collect_metrics": true, + "coverage_cap": "3000", + "dataset_id": "TWGSAMIN000001-T002-S02-F", + "dataset_registry_prefix": "/hot/data", + "date": "19970704T165655Z", + "dbSNP_slop": "150", + "docker_container_registry": "ghcr.io/uclahs-cds", + "docker_image_bedops": "ghcr.io/uclahs-cds/bedr:1.1.0", + "docker_image_bedtools": "ghcr.io/uclahs-cds/bedtools:2.29.2", + "docker_image_picard": "ghcr.io/uclahs-cds/picard:3.0.0", + "docker_image_samtools": "ghcr.io/uclahs-cds/samtools:1.16.1", + "docker_image_validate": "ghcr.io/uclahs-cds/pipeval:4.0.0-rc.2", + "gatk_command_mem_diff": "2 GB", + "genome_sizes": "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta.fai", + "input": { + "bam": "/hot/resource/SMC-HET/tumours/A-mini/bams/n1/output/S2.T-n1.bam" + }, + "log_output_dir": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z", + "max_cpus": "16", + "max_memory": "31 GB", + "merge_operation": "collapse", + "min_base_quality": "20", + "min_cpus": "1", + "min_mapping_quality": "20", + "min_memory": "1 MB", + "min_read_depth": "30", + "near_distance": "250", + "off_target_depth": true, + "off_target_slop": "500", + "output_dir": "/tmp/nf-config-test-outputs", + "output_dir_base": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F", + "output_enriched_target_file": true, + "patient": "null", + "picard_CollectHsMetrics_extra_args": "", + "picard_version": "3.0.0", + "pipeval_version": "4.0.0-rc.2", + "reference_dbSNP": 
"/hot/ref/database/dbSNP-155/thinned/GRCh38/dbSNP-155_thinned_hg38.vcf.gz", + "reference_dict": "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.dict", + "sample": "null", + "sample_id": "TWGSAMIN000001-T002-S02-F", + "samtools_depth_extra_args": "", + "samtools_version": "1.16.1", + "save_all_dbSNP": false, + "save_intermediate_files": false, + "save_interval_list": false, + "save_raw_target_bed": false, + "target_bed": "/hot/software/pipeline/pipeline-calculate-targeted-coverage/Nextflow/development/input/GRch38-small.bed", + "target_depth": true, + "target_interval_list": "", + "ucla_cds": true, + "work_dir": "/scratch/4674821", + "workflow_output_dir": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/SAMtools-1.16.1" + }, + "params_schema": { + "bait_bed": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "bait_interval_list": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "collect_metrics": { + "default": true, + "help": "Whether to run picard CollectHsMetrics", + "required": true, + "type": "Bool" + }, + "dataset_id": { + "help": "Dataset ID", + "required": true, + "type": "String" + }, + "genome_sizes": { + "help": "Absolute path to table of chromosome lengths, can be fasta index", + "mode": "r", + "required": true, + "type": "Path" + }, + "input": { + "elements": { + "bam": { + "help": "Input BAM for coverage analysis", + "mode": "r", + "required": true, + "type": "Path" + } + }, + "help": "Input samples", + "required": true, + "type": "InputNamespace" + }, + "off_target_depth": { + "default": true, + "help": "Whether to calculate depth at off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "output_dir": { + "help": "Absolute path to 
output directory", + "mode": "w", + "required": true, + "type": "Path" + }, + "output_enriched_target_file": { + "default": true, + "help": "Whether or not to output a new target file containing high-coverage off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "picard_CollectHsMetrics_extra_args": { + "allow_empty": true, + "help": "Extra arguments for CollectHsMetrics", + "required": true, + "type": "String" + }, + "reference_dbSNP": { + "help": "Absolute path to thinned dbSNP VCF", + "mode": "r", + "required": true, + "type": "Path" + }, + "reference_dict": { + "help": "Absolute path to reference genome dictionary", + "mode": "r", + "required": true, + "type": "Path" + }, + "sample_id": { + "help": "Sample ID", + "required": true, + "type": "String" + }, + "samtools_depth_extra_args": { + "allow_empty": true, + "help": "Extra arguments for samtools depth", + "required": true, + "type": "String" + }, + "save_intermediate_files": { + "default": false, + "help": "Whether to save intermediate files", + "required": true, + "type": "Bool" + }, + "save_interval_list": { + "default": false, + "help": "Whether to save a copy of the interval list generated for picard CollectHsMetrics", + "required": true, + "type": "Bool" + }, + "target_bed": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "target_depth": { + "default": false, + "help": "Whether or not to output a new target file containing high-coverage off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "target_interval_list": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" } }, - "withLabel:process_medium": { + "process": { + "cache": true, + "containerOptions": { + "1": "--cpu-shares 1024 --cpus $task.cpus", + "2": "--cpu-shares 1024 --cpus 
$task.cpus", + "3": "--cpu-shares 1024 --cpus $task.cpus", + "closure": "--cpu-shares 1024 --cpus $task.cpus" + }, "cpus": { - "1": "6", - "2": "12", - "3": "16", + "1": "1", + "2": "2", + "3": "3", "closure": "closure()" }, - "memory": { - "1": "31 GB", - "2": "31 GB", - "3": "31 GB", - "closure": "closure()" + "echo": true, + "errorStrategy": { + "1": "finish", + "2": "finish", + "3": "finish", + "closure": "finish" + }, + "executor": "local", + "maxRetries": "1", + "memory": "31 GB", + "withLabel:process_high": { + "cpus": { + "1": "12", + "2": "16", + "3": "16", + "closure": "closure()" + }, + "memory": { + "1": "31 GB", + "2": "31 GB", + "3": "31 GB", + "closure": "closure()" + } + }, + "withLabel:process_low": { + "cpus": { + "1": "2", + "2": "4", + "3": "6", + "closure": "closure()" + }, + "memory": { + "1": "3 GB", + "2": "6 GB", + "3": "9 GB", + "closure": "closure()" + } + }, + "withLabel:process_medium": { + "cpus": { + "1": "6", + "2": "12", + "3": "16", + "closure": "closure()" + }, + "memory": { + "1": "31 GB", + "2": "31 GB", + "3": "31 GB", + "closure": "closure()" + } + }, + "withName:convert_depth_to_bed": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:merge_bedfiles_BEDtools": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:run_BedToIntervalList_picard": { + "cpus": "1", + "memory": "3 GB" + }, + "withName:run_CollectHsMetrics_picard": { + "cpus": "1", + "memory": "25 GB" + }, + "withName:run_depth_SAMtools": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:run_depth_filter": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:run_intersect_BEDtools": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:run_merge_BEDtools": { + "cpus": "1", + "memory": "1 GB" + }, + "withName:run_slop_BEDtools": { + "cpus": "1", + "memory": "1 GB" } }, - "withName:convert_depth_to_bed": { - "cpus": "1", - "memory": "1 GB" - }, - "withName:merge_bedfiles_BEDtools": { - "cpus": "1", - "memory": "1 GB" + "report": { + "enabled": true, + "file": 
"/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/report.html" }, - "withName:run_BedToIntervalList_picard": { - "cpus": "1", - "memory": "3 GB" + "timeline": { + "enabled": true, + "file": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/timeline.html" }, - "withName:run_CollectHsMetrics_picard": { - "cpus": "1", - "memory": "25 GB" + "trace": { + "enabled": true, + "file": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/trace.txt" }, - "withName:run_depth_SAMtools": { - "cpus": "1", - "memory": "1 GB" - }, - "withName:run_depth_filter": { - "cpus": "1", - "memory": "1 GB" - }, - "withName:run_intersect_BEDtools": { - "cpus": "1", - "memory": "1 GB" - }, - "withName:run_merge_BEDtools": { - "cpus": "1", - "memory": "1 GB" - }, - "withName:run_slop_BEDtools": { - "cpus": "1", - "memory": "1 GB" + "tz": "sun.util.calendar.ZoneInfo[id=\"UTC\",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]", + "workDir": "/scratch/4674821", + "yaml": { + "bait_bed": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "bait_interval_list": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "collect_metrics": { + "default": true, + "help": "Whether to run picard CollectHsMetrics", + "required": true, + "type": "Bool" + }, + "dataset_id": { + "help": "Dataset ID", + "required": true, + "type": "String" + }, + "genome_sizes": { + "help": "Absolute path to table of chromosome 
lengths, can be fasta index", + "mode": "r", + "required": true, + "type": "Path" + }, + "input": { + "elements": { + "bam": { + "help": "Input BAM for coverage analysis", + "mode": "r", + "required": true, + "type": "Path" + } + }, + "help": "Input samples", + "required": true, + "type": "InputNamespace" + }, + "off_target_depth": { + "default": true, + "help": "Whether to calculate depth at off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "output_dir": { + "help": "Absolute path to output directory", + "mode": "w", + "required": true, + "type": "Path" + }, + "output_enriched_target_file": { + "default": true, + "help": "Whether or not to output a new target file containing high-coverage off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "picard_CollectHsMetrics_extra_args": { + "allow_empty": true, + "help": "Extra arguments for CollectHsMetrics", + "required": true, + "type": "String" + }, + "reference_dbSNP": { + "help": "Absolute path to thinned dbSNP VCF", + "mode": "r", + "required": true, + "type": "Path" + }, + "reference_dict": { + "help": "Absolute path to reference genome dictionary", + "mode": "r", + "required": true, + "type": "Path" + }, + "sample_id": { + "help": "Sample ID", + "required": true, + "type": "String" + }, + "samtools_depth_extra_args": { + "allow_empty": true, + "help": "Extra arguments for samtools depth", + "required": true, + "type": "String" + }, + "save_intermediate_files": { + "default": false, + "help": "Whether to save intermediate files", + "required": true, + "type": "Bool" + }, + "save_interval_list": { + "default": false, + "help": "Whether to save a copy of the interval list generated for picard CollectHsMetrics", + "required": true, + "type": "Bool" + }, + "target_bed": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + }, + "target_depth": { + "default": false, + 
"help": "Whether or not to output a new target file containing high-coverage off-target dbSNP loci", + "required": true, + "type": "Bool" + }, + "target_interval_list": { + "allow_empty": true, + "help": "Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS", + "required": true, + "type": "String" + } } - }, - "report": { - "enabled": true, - "file": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/report.html" - }, - "timeline": { - "enabled": true, - "file": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/timeline.html" - }, - "trace": { - "enabled": true, - "file": "/tmp/nf-config-test-outputs/calculate-targeted-coverage-1.0.0-rc.2/TWGSAMIN000001-T002-S02-F/log-calculate-targeted-coverage-1.0.0-rc.2-19970704T165655Z/nextflow-log/trace.txt" - }, - "tz": "sun.util.calendar.ZoneInfo[id=\"UTC\",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]", - "workDir": "/scratch/4674821" + } } -} + \ No newline at end of file