From 2935ab115c8996c9b3f52b8bce7a7126f3a32617 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 5 Aug 2024 10:16:30 +0100 Subject: [PATCH 01/52] Last weeks addition --- main.nf | 31 +- modules.json | 30 + modules/local/generate_samplesheet.nf | 43 ++ modules/local/nextflow/run/main.nf | 38 ++ modules/nf-core/busco/busco/environment.yml | 7 + modules/nf-core/busco/busco/main.nf | 107 +++ modules/nf-core/busco/busco/meta.yml | 98 +++ .../nf-core/busco/busco/tests/main.nf.test | 419 ++++++++++++ .../busco/busco/tests/main.nf.test.snap | 230 +++++++ .../busco/tests/nextflow.augustus.config | 5 + .../nf-core/busco/busco/tests/nextflow.config | 5 + .../busco/busco/tests/nextflow.metaeuk.config | 5 + .../nf-core/busco/busco/tests/old_test.yml | 624 ++++++++++++++++++ modules/nf-core/busco/busco/tests/tags.yml | 2 + modules/nf-core/gfastats/environment.yml | 7 + modules/nf-core/gfastats/main.nf | 66 ++ modules/nf-core/gfastats/meta.yml | 72 ++ .../merquryfk/merquryfk/environment.yml | 5 + modules/nf-core/merquryfk/merquryfk/main.nf | 58 ++ modules/nf-core/merquryfk/merquryfk/meta.yml | 112 ++++ .../nf-core/minimap2/align/environment.yml | 11 + modules/nf-core/minimap2/align/main.nf | 78 +++ modules/nf-core/minimap2/align/meta.yml | 84 +++ .../nf-core/minimap2/align/tests/main.nf.test | 441 +++++++++++++ .../minimap2/align/tests/main.nf.test.snap | 476 +++++++++++++ modules/nf-core/minimap2/align/tests/tags.yml | 2 + .../nf-core/samtools/merge/environment.yml | 8 + modules/nf-core/samtools/merge/main.nf | 61 ++ modules/nf-core/samtools/merge/meta.yml | 83 +++ .../nf-core/samtools/merge/tests/index.config | 3 + .../nf-core/samtools/merge/tests/main.nf.test | 137 ++++ .../samtools/merge/tests/main.nf.test.snap | 228 +++++++ modules/nf-core/samtools/merge/tests/tags.yml | 2 + modules/nf-core/samtools/sort/environment.yml | 8 + modules/nf-core/samtools/sort/main.nf | 73 ++ modules/nf-core/samtools/sort/meta.yml | 71 ++ .../nf-core/samtools/sort/tests/main.nf.test | 128 ++++ 
.../samtools/sort/tests/main.nf.test.snap | 192 ++++++ .../samtools/sort/tests/nextflow.config | 8 + .../samtools/sort/tests/nextflow_cram.config | 8 + modules/nf-core/samtools/sort/tests/tags.yml | 3 + nextflow.config | 31 +- nextflow_schema.json | 8 +- workflows/ear.nf | 229 +++++-- 44 files changed, 4247 insertions(+), 90 deletions(-) create mode 100644 modules/local/generate_samplesheet.nf create mode 100644 modules/local/nextflow/run/main.nf create mode 100644 modules/nf-core/busco/busco/environment.yml create mode 100644 modules/nf-core/busco/busco/main.nf create mode 100644 modules/nf-core/busco/busco/meta.yml create mode 100644 modules/nf-core/busco/busco/tests/main.nf.test create mode 100644 modules/nf-core/busco/busco/tests/main.nf.test.snap create mode 100644 modules/nf-core/busco/busco/tests/nextflow.augustus.config create mode 100644 modules/nf-core/busco/busco/tests/nextflow.config create mode 100644 modules/nf-core/busco/busco/tests/nextflow.metaeuk.config create mode 100644 modules/nf-core/busco/busco/tests/old_test.yml create mode 100644 modules/nf-core/busco/busco/tests/tags.yml create mode 100644 modules/nf-core/gfastats/environment.yml create mode 100644 modules/nf-core/gfastats/main.nf create mode 100644 modules/nf-core/gfastats/meta.yml create mode 100644 modules/nf-core/merquryfk/merquryfk/environment.yml create mode 100644 modules/nf-core/merquryfk/merquryfk/main.nf create mode 100644 modules/nf-core/merquryfk/merquryfk/meta.yml create mode 100644 modules/nf-core/minimap2/align/environment.yml create mode 100644 modules/nf-core/minimap2/align/main.nf create mode 100644 modules/nf-core/minimap2/align/meta.yml create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test.snap create mode 100644 modules/nf-core/minimap2/align/tests/tags.yml create mode 100644 modules/nf-core/samtools/merge/environment.yml create mode 100644 modules/nf-core/samtools/merge/main.nf create 
mode 100644 modules/nf-core/samtools/merge/meta.yml create mode 100644 modules/nf-core/samtools/merge/tests/index.config create mode 100644 modules/nf-core/samtools/merge/tests/main.nf.test create mode 100644 modules/nf-core/samtools/merge/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/merge/tests/tags.yml create mode 100644 modules/nf-core/samtools/sort/environment.yml create mode 100644 modules/nf-core/samtools/sort/main.nf create mode 100644 modules/nf-core/samtools/sort/meta.yml create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/sort/tests/nextflow.config create mode 100644 modules/nf-core/samtools/sort/tests/nextflow_cram.config create mode 100644 modules/nf-core/samtools/sort/tests/tags.yml diff --git a/main.nf b/main.nf index 96a8a81..3b7bca7 100644 --- a/main.nf +++ b/main.nf @@ -16,8 +16,6 @@ nextflow.enable.dsl = 2 */ include { EAR } from './workflows/ear' -include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_ear_pipeline' -include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_ear_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -42,8 +40,6 @@ workflow SANGERTOL_EAR { samplesheet ) - emit: - multiqc_report = EAR.out.multiqc_report // channel: /path/to/multiqc_report.html } /* @@ -56,38 +52,13 @@ workflow { main: - // - // SUBWORKFLOW: Run initialisation tasks - // - PIPELINE_INITIALISATION ( - params.version, - params.help, - params.validate_params, - params.monochrome_logs, - args, - params.outdir, - params.input - ) - // // WORKFLOW: Run main workflow // SANGERTOL_EAR ( - PIPELINE_INITIALISATION.out.samplesheet + params.input ) - // - // SUBWORKFLOW: Run completion tasks - // - PIPELINE_COMPLETION ( - params.email, - params.email_on_fail, - params.plaintext_email, - params.outdir, - params.monochrome_logs, 
- params.hook_url, - SANGERTOL_EAR.out.multiqc_report - ) } /* diff --git a/modules.json b/modules.json index 99a74d8..3b0db89 100644 --- a/modules.json +++ b/modules.json @@ -5,15 +5,45 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "busco/busco": { + "branch": "master", + "git_sha": "17486961b8b1ab1aae258c83a7e947b40d8ab670", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", "installed_by": ["modules"] }, + "gfastats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "merquryfk/merquryfk": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] + }, + "samtools/merge": { + "branch": "master", + "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", + "installed_by": ["modules"] + }, + "samtools/sort": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["modules"] } } }, diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf new file mode 100644 index 0000000..018f7ec --- /dev/null +++ b/modules/local/generate_samplesheet.nf @@ -0,0 +1,43 @@ +process GENERATE_SAMPLESHEET { + tag "$meta.id" + label "process_low" + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(pacbio_path) + + output: + tuple val(meta), path("*csv"), emit: csv + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + """ + generate_samplesheet.py \\ + $prefix \\ + $pacbio_path + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/nextflow/run/main.nf b/modules/local/nextflow/run/main.nf new file mode 100644 index 0000000..cc522bc --- /dev/null +++ b/modules/local/nextflow/run/main.nf @@ -0,0 +1,38 @@ +import java.nio.file.Paths +import java.nio.file.Files + +process NEXTFLOW_RUN { + tag "$pipeline_name" + + input: + val pipeline_name // String + val nextflow_opts // String + val params_file // pipeline params-file + val samplesheet // pipeline samplesheet + val additional_config // custom configs + + when: + task.ext.when == null || task.ext.when + + exec: + // def args = task.ext.args ?: '' + def cache_dir = Paths.get(workflow.workDir.resolve(pipeline_name).toUri()) + Files.createDirectories(cache_dir) + def nxf_cmd = [ + 'nextflow run', + pipeline_name, + nextflow_opts, + params_file ? "-params-file $params_file" : '', + additional_config ? "-c $additional_config" : '', + samplesheet ? 
"--input $samplesheet" : '', + "--outdir $task.workDir/results", + ] + def builder = new ProcessBuilder(nxf_cmd.join(" ").tokenize(" ")) + builder.directory(cache_dir.toFile()) + process = builder.start() + assert process.waitFor() == 0: process.text + + output: + path "results" , emit: output + val process.text, emit: log +} \ No newline at end of file diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml new file mode 100644 index 0000000..06a5d93 --- /dev/null +++ b/modules/nf-core/busco/busco/environment.yml @@ -0,0 +1,7 @@ +name: busco_busco +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::busco=5.7.1 diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf new file mode 100644 index 0000000..f7c1a66 --- /dev/null +++ b/modules/nf-core/busco/busco/main.nf @@ -0,0 +1,107 @@ +process BUSCO_BUSCO { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/busco:5.7.1--pyhdfd78af_0': + 'biocontainers/busco:5.7.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta, stageAs:'tmp_input/*') + val mode // Required: One of genome, proteins, or transcriptome + val lineage // Required: lineage to check against, "auto" enables --auto-lineage instead + path busco_lineages_path // Recommended: path to busco lineages - downloads if not set + path config_file // Optional: busco configuration file + + output: + tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary + tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt , optional: true + tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json , optional: true + tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table , optional: true + tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv") , emit: missing_busco_list , optional: true + tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa") , emit: single_copy_proteins , optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences") , emit: seq_dir + tuple val(meta), path("*-busco/*/translated_proteins") , emit: translated_dir , optional: true + tuple val(meta), path("*-busco") , emit: busco_dir + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + if ( mode !in [ 'genome', 'proteins', 'transcriptome' ] ) { + error "Mode must be one of 'genome', 'proteins', or 'transcriptome'." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}-${lineage}" + def busco_config = config_file ? "--config $config_file" : '' + def busco_lineage = lineage.equals('auto') ? '--auto-lineage' : "--lineage_dataset ${lineage}" + def busco_lineage_dir = busco_lineages_path ? 
"--download_path ${busco_lineages_path}" : '' + """ + # Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute) + # Check for container variable initialisation script and source it. + if [ -f "/usr/local/env-activate.sh" ]; then + set +u # Otherwise, errors out because of various unbound variables + . "/usr/local/env-activate.sh" + set -u + fi + + # If the augustus config directory is not writable, then copy to writeable area + if [ ! -w "\${AUGUSTUS_CONFIG_PATH}" ]; then + # Create writable tmp directory for augustus + AUG_CONF_DIR=\$( mktemp -d -p \$PWD ) + cp -r \$AUGUSTUS_CONFIG_PATH/* \$AUG_CONF_DIR + export AUGUSTUS_CONFIG_PATH=\$AUG_CONF_DIR + echo "New AUGUSTUS_CONFIG_PATH=\${AUGUSTUS_CONFIG_PATH}" + fi + + # Ensure the input is uncompressed + INPUT_SEQS=input_seqs + mkdir "\$INPUT_SEQS" + cd "\$INPUT_SEQS" + for FASTA in ../tmp_input/*; do + if [ "\${FASTA##*.}" == 'gz' ]; then + gzip -cdf "\$FASTA" > \$( basename "\$FASTA" .gz ) + else + ln -s "\$FASTA" . + fi + done + cd .. + + busco \\ + --cpu $task.cpus \\ + --in "\$INPUT_SEQS" \\ + --out ${prefix}-busco \\ + --mode $mode \\ + $busco_lineage \\ + $busco_lineage_dir \\ + $busco_config \\ + $args + + # clean up + rm -rf "\$INPUT_SEQS" + + # Move files to avoid staging/publishing issues + mv ${prefix}-busco/batch_summary.txt ${prefix}-busco.batch_summary.txt + mv ${prefix}-busco/*/short_summary.*.{json,txt} . || echo "Short summaries were not available: No genes were found." 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}-${lineage}" + def fasta_name = files(fasta).first().name - '.gz' + """ + touch ${prefix}-busco.batch_summary.txt + mkdir -p ${prefix}-busco/$fasta_name/run_${lineage}/busco_sequences + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/busco/busco/meta.yml b/modules/nf-core/busco/busco/meta.yml new file mode 100644 index 0000000..29745d2 --- /dev/null +++ b/modules/nf-core/busco/busco/meta.yml @@ -0,0 +1,98 @@ +name: busco_busco +description: Benchmarking Universal Single Copy Orthologs +keywords: + - quality control + - genome + - transcriptome + - proteome +tools: + - busco: + description: BUSCO provides measures for quantitative assessment of genome assembly, gene set, and transcriptome completeness based on evolutionarily informed expectations of gene content from near-universal single-copy orthologs selected from OrthoDB. + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + tool_dev_url: https://gitlab.com/ezlab/busco + doi: "10.1007/978-1-4939-9173-0_14" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Nucleic or amino acid sequence file in FASTA format. + pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" + - mode: + type: string + description: The mode to run Busco in. One of genome, proteins, or transcriptome + pattern: "{genome,proteins,transcriptome}" + - lineage: + type: string + description: The BUSCO lineage to use, or "auto" to automatically select lineage + - busco_lineages_path: + type: directory + description: Path to local BUSCO lineages directory. 
+ - config_file: + type: file + description: Path to BUSCO config file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - batch_summary: + type: file + description: Summary of all sequence files analyzed + pattern: "*-busco.batch_summary.txt" + - short_summaries_txt: + type: file + description: Short Busco summary in plain text format + pattern: "short_summary.*.txt" + - short_summaries_json: + type: file + description: Short Busco summary in JSON format + pattern: "short_summary.*.json" + - busco_dir: + type: directory + description: BUSCO lineage specific output + pattern: "*-busco" + - full_table: + type: file + description: Full BUSCO results table + pattern: "full_table.tsv" + - missing_busco_list: + type: file + description: List of missing BUSCOs + pattern: "missing_busco_list.tsv" + - single_copy_proteins: + type: file + description: Fasta file of single copy proteins (transcriptome mode) + pattern: "single_copy_proteins.faa" + - seq_dir: + type: directory + description: BUSCO sequence directory + pattern: "busco_sequences" + - translated_dir: + type: directory + description: Six frame translations of each transcript made by the transcriptome mode + pattern: "translated_dir" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@priyanka-surana" + - "@charles-plessy" + - "@mahesh-panchal" + - "@muffato" + - "@jvhagey" + - "@gallvp" +maintainers: + - "@priyanka-surana" + - "@charles-plessy" + - "@mahesh-panchal" + - "@muffato" + - "@jvhagey" + - "@gallvp" diff --git a/modules/nf-core/busco/busco/tests/main.nf.test b/modules/nf-core/busco/busco/tests/main.nf.test new file mode 100644 index 0000000..16b708b --- /dev/null +++ b/modules/nf-core/busco/busco/tests/main.nf.test @@ -0,0 +1,419 @@ +nextflow_process { + + name "Test Process BUSCO_BUSCO" + script "../main.nf" + process "BUSCO_BUSCO" + + tag "modules" + tag 
"modules_nfcore" + tag "busco" + tag "busco/busco" + + test("test_busco_genome_single_fasta") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file( params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'bacteria_odb10' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed from test due to memory issues + input[3] = [] // Download busco lineage + input[4] = [] // No config + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + } + + test("test_busco_genome_multi_fasta") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file( params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], 
checkIfExists: true), + file( params.test_data['candidatus_portiera_aleyrodidarum']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + input[1] = 'genome' + input[2] = 'bacteria_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1][0]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_txt[0][1][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1][0]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + with(path(process.out.short_summaries_json[0][1][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1][0]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(file(process.out.seq_dir[0][1][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + 
assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_eukaryote_metaeuk") { + + config './nextflow.metaeuk.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file( params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'eukaryota_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains("'use_augustus', 'False'") + assert contains("'use_metaeuk', 'True'") // METAEUK + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_eukaryote_augustus") { + + config './nextflow.augustus.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // 
meta map + file( params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'eukaryota_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + assert snapshot( + process.out.batch_summary[0][1], + process.out.versions[0] + ).match() + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains("'use_augustus', 'True'") + assert contains("'use_metaeuk', 'False'") // AUGUSTUS + assert contains('Augustus did not recognize any genes') + + } + + assert process.out.short_summaries_json == [] + assert process.out.short_summaries_txt == [] + assert process.out.missing_busco_list == [] + assert process.out.full_table == [] + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_protein") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file( params.test_data['candidatus_portiera_aleyrodidarum']['genome']['proteome_fasta'], checkIfExists: true) + ] + input[1] = 'proteins' + input[2] = 'bacteria_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert 
contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite BUSCO') + } + + assert process.out.single_copy_proteins == [] + assert process.out.translated_dir == [] + } + + } + + test("test_busco_transcriptome") { + + config './nextflow.config' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file( params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true) + ] + input[1] = 'transcriptome' + input[2] = 'bacteria_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assert process.success + + with(path(process.out.short_summaries_txt[0][1]).text) { + assert contains('BUSCO version') + assert contains('The lineage dataset is') + assert contains('BUSCO was run in mode') + assert contains('Complete BUSCOs') + assert contains('Missing BUSCOs') + assert contains('Dependencies and versions') + } + + with(path(process.out.short_summaries_json[0][1]).text) { + assert contains('one_line_summary') + assert contains('mode') + assert contains('dataset') + } + + assert snapshot( + process.out.batch_summary[0][1], + process.out.full_table[0][1], + process.out.missing_busco_list[0][1], + process.out.translated_dir[0][1], + process.out.single_copy_proteins[0][1], + process.out.versions[0] + ).match() + + with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { + assert contains('single_copy_busco_sequences.tar.gz') + assert contains('multi_copy_busco_sequences.tar.gz') + assert contains('fragmented_busco_sequences.tar.gz') + } + + with(path("${process.out.busco_dir[0][1]}/logs/busco.log").text) { + assert contains('DEBUG:busco.run_BUSCO') + assert contains('Results from dataset') + assert contains('how to cite 
BUSCO') + } + } + + } + + test("minimal-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file( params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'bacteria_odb10' + input[3] = [] + input[4] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/busco/busco/tests/main.nf.test.snap b/modules/nf-core/busco/busco/tests/main.nf.test.snap new file mode 100644 index 0000000..1b6411b --- /dev/null +++ b/modules/nf-core/busco/busco/tests/main.nf.test.snap @@ -0,0 +1,230 @@ +{ + "minimal-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + [ + { + "id": "test" + }, + [ + + ] + ] + ], + "7": [ + + ], + "8": [ + [ + { + "id": "test" + }, + [ + [ + [ + [ + + ] + ] + ] + ] + ] + ], + "9": [ + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "batch_summary": [ + [ + { + "id": "test" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "busco_dir": [ + [ + { + "id": "test" + }, + [ + [ + [ + [ + + ] + ] + ] + ] + ] + ], + "full_table": [ + + ], + "missing_busco_list": [ + + ], + "seq_dir": [ + [ + { + "id": "test" + }, + [ + + ] + ] + ], + "short_summaries_json": [ + + ], + "short_summaries_txt": [ + + ], + "single_copy_proteins": [ + + ], + "translated_dir": [ + + ], + "versions": [ + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:28:04.451297" + }, + "test_busco_eukaryote_augustus": { + "content": [ + "test-eukaryota_odb10-busco.batch_summary.txt:md5,3ea3bdc423a461dae514d816bdc61c89", 
+ "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:26:36.974986" + }, + "test_busco_genome_single_fasta": { + "content": [ + "test-bacteria_odb10-busco.batch_summary.txt:md5,21b3fb771cf36be917cc451540d999be", + "full_table.tsv:md5,638fe7590f442c57361554dae330eca1", + "missing_busco_list.tsv:md5,1530af4fe7673a6d001349537bcd410a", + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:22:45.07816" + }, + "test_busco_genome_multi_fasta": { + "content": [ + "test-bacteria_odb10-busco.batch_summary.txt:md5,fcd3c208913e8abda3d6742c43fec5fa", + [ + "full_table.tsv:md5,c657edcc7d0de0175869717551df6e83", + "full_table.tsv:md5,638fe7590f442c57361554dae330eca1" + ], + [ + "missing_busco_list.tsv:md5,aceb66e347a353cb7fca8e2a725f9112", + "missing_busco_list.tsv:md5,1530af4fe7673a6d001349537bcd410a" + ], + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:23:50.255602" + }, + "test_busco_eukaryote_metaeuk": { + "content": [ + "test-eukaryota_odb10-busco.batch_summary.txt:md5,ff6d8277e452a83ce9456bbee666feb6", + "full_table.tsv:md5,92b1b1d5cb5ea0e2093d16f00187e8c7", + "missing_busco_list.tsv:md5,0352e563de290bf804c708323c35a9e3", + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:25:38.159041" + }, + "test_busco_transcriptome": { + "content": [ + "test-bacteria_odb10-busco.batch_summary.txt:md5,8734b3f379c4c0928e5dd4ea1873dc64", + "full_table.tsv:md5,1b2ce808fdafa744c56b5f781551272d", + "missing_busco_list.tsv:md5,a6931b6470262b997b8b99ea0f1d14a4", + [ + "1024388at2.faa:md5,797d603d262a6595a112e25b73e878b0", + "1054741at2.faa:md5,cd4b928cba6b19b4437746ba507e7195", + 
"1093223at2.faa:md5,df9549708e5ffcfaee6a74dd70a0e5dc", + "1151822at2.faa:md5,12726afc1cdc40c13392e1596e93df3a", + "143460at2.faa:md5,d887431fd988a5556a523440f02d9594", + "1491686at2.faa:md5,d03362d19979b27306c192f1c74a84e5", + "1504821at2.faa:md5,4f5f6e5c57bac0092c1d85ded73d7e67", + "1574817at2.faa:md5,1153e55998c2929eacad2aed7d08d248", + "1592033at2.faa:md5,bb7a59e5f3a57ba12d10dabf4c77ab57", + "1623045at2.faa:md5,8fe38155feb1802beb97ef7714837bf5", + "1661836at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", + "1674344at2.faa:md5,bb41b44e53565a54cadf0b780532fe08", + "1698718at2.faa:md5,f233860000028eb00329aa85236c71e5", + "1990650at2.faa:md5,34a2d29c5f8b6253159ddb7a43fa1829", + "223233at2.faa:md5,dec6705c7846c989296e73942f953cbc", + "402899at2.faa:md5,acc0f271f9a586d2ce1ee41669b22999", + "505485at2.faa:md5,aa0391f8fa5d9bd19b30d844d5a99845", + "665824at2.faa:md5,47f8ad43b6a6078206feb48c2e552793", + "776861at2.faa:md5,f8b90c13f7c6be828dea3bb920195e3d", + "874197at2.faa:md5,8d22a35a768debe6f376fc695d233a69", + "932854at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", + "95696at2.faa:md5,247bfd1aef432f7b5456307768e9149c" + ], + "single_copy_proteins.faa:md5,73e2c5d6a9b0f01f2deea3cc5f21b764", + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:27:53.992893" + }, + "test_busco_protein": { + "content": [ + "test-bacteria_odb10-busco.batch_summary.txt:md5,f5a782378f9f94a748aa907381fdef91", + "full_table.tsv:md5,812ab6a0496fccab774643cf40c4f2a8", + "missing_busco_list.tsv:md5,aceb66e347a353cb7fca8e2a725f9112", + "versions.yml:md5,3fc94714b95c2dc15399a4229d9dd1d9" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T13:27:12.724862" + } +} \ No newline at end of file diff --git a/modules/nf-core/busco/busco/tests/nextflow.augustus.config b/modules/nf-core/busco/busco/tests/nextflow.augustus.config new file mode 100644 index 0000000..84daa69 
--- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.augustus.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar --augustus' + } +} diff --git a/modules/nf-core/busco/busco/tests/nextflow.config b/modules/nf-core/busco/busco/tests/nextflow.config new file mode 100644 index 0000000..1ec3fec --- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar' + } +} diff --git a/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config b/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config new file mode 100644 index 0000000..c141844 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/nextflow.metaeuk.config @@ -0,0 +1,5 @@ +process { + withName: 'BUSCO_BUSCO' { + ext.args = '--tar --metaeuk' + } +} diff --git a/modules/nf-core/busco/busco/tests/old_test.yml b/modules/nf-core/busco/busco/tests/old_test.yml new file mode 100644 index 0000000..75177f5 --- /dev/null +++ b/modules/nf-core/busco/busco/tests/old_test.yml @@ -0,0 +1,624 @@ +- name: busco test_busco_genome_single_fasta + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_genome_single_fasta -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: bc2440f8a68d7fbf931ff911c1c3fdfa + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_out.log + md5sum: 9caf1a1434414c78562eb0bbb9c0e53f + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/full_table.tsv + md5sum: c56edab1dc1522e993c25ae2b730799f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/missing_busco_list.tsv + md5sum: b533ef30270f27160acce85a22d01bf5 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "lineage_dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_genome_multi_fasta + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_genome_multi_fasta -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - 
path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 8c64c1a28b086ef2ee444f99cbed5f7d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: 8f047bdb33264d22a83920bc2c63f29a + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/prodigal_err.log + md5sum: c1fdc6977332f53dfe7f632733bb4585 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/logs/prodigal_out.log + md5sum: 50752acb1c5a20be886bfdfc06635bcb + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/predicted.faa + md5sum: 8166471fc5f08c82fd5643ab42327f9d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/predicted.fna + md5sum: ddc508a18f60e7f3314534df50cdf8ca + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 8166471fc5f08c82fd5643ab42327f9d + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: ddc508a18f60e7f3314534df50cdf8ca + - path: 
output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: c1fdc6977332f53dfe7f632733bb4585 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 50752acb1c5a20be886bfdfc06635bcb + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4.faa + md5sum: e56fd59c38248dc21ac94355dca98121 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4.fna + md5sum: b365f84bf99c68357952e0b98ed7ce42 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4_err.log + md5sum: e5f14d7925ba14a0f9850542f3739894 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_4_out.log + md5sum: d41971bfc1b621d4ffd2633bc47017ea + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/full_table.tsv + md5sum: c9651b88b10871abc260ee655898e828 + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/missing_busco_list.tsv + md5sum: 9939309df2da5419de88c32d1435c779 + - 
path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fasta/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/bbtools_out.log + md5sum: 9caf1a1434414c78562eb0bbb9c0e53f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/logs/prodigal_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/.checkpoint + contains: + - "Tool: prodigal" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/predicted.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.faa + md5sum: 836e9a80d33d8b89168f07ddc13ee991 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11.fna + md5sum: 20eeb75f86842e6e136f02bca8b73a9f + - path: 
output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_err.log + md5sum: 538510cfc7483498210f01e53fe035ad + - path: output/busco/test-bacteria_odb10-busco/genome.fna/prodigal_output/predicted_genes/tmp/prodigal_mode_single_code_11_out.log + md5sum: 61050b0706addc9498b2088a2d6efa9a + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/full_table.tsv + md5sum: c56edab1dc1522e993c25ae2b730799f + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/missing_busco_list.tsv + md5sum: b533ef30270f27160acce85a22d01bf5 + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/genome.fna/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_eukaryote_metaeuk + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_eukaryote_metaeuk -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: 
output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt + md5sum: ff6d8277e452a83ce9456bbee666feb6 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: e63debaa653f18f7405d936050abc093 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/full_table.tsv + md5sum: bd880e90b9e5620a58943a3e0f9ff16b + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/hmmer_output.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/.checkpoint + contains: + - "Tool: metaeuk" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/combined_pred_proteins.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/refseq_db_rerun.faa + md5sum: d80b8fa4cb5ed0d47d63d6aa93635bc2 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/missing_busco_list.tsv + md5sum: 1e8e79c540fd2e69ba0d2659d9eb2988 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-eukaryota_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_eukaryote_augustus + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_eukaryote_augustus -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt + md5sum: ff6d8277e452a83ce9456bbee666feb6 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/bbtools_out.log + md5sum: e63debaa653f18f7405d936050abc093 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/hmmsearch_out.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run1_out.log + contains: + - "metaeuk" + - "easy-predict" + - "Compute score and coverage" + - "Time for processing:" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_err.log + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/logs/metaeuk_run2_out.log + contains: + - "metaeuk" + - "easy-predict" + - "Compute score and coverage" + - "Time for processing:" + - 
path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/.bbtools_output/.checkpoint + contains: + - "Tool: bbtools" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/full_table.tsv + md5sum: bd880e90b9e5620a58943a3e0f9ff16b + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/hmmer_output.tar.gz + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/.checkpoint + contains: + - "Tool: metaeuk" + - "Completed" + - "jobs" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/combined_pred_proteins.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.codon.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/initial_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/refseq_db_rerun.faa + md5sum: d80b8fa4cb5ed0d47d63d6aa93635bc2 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.codon.fas + - path: 
output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.fas + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.gff + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/metaeuk_output/rerun_results/genome.fasta.headersMap.tsv + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/missing_busco_list.tsv + md5sum: 1e8e79c540fd2e69ba0d2659d9eb2988 + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-eukaryota_odb10-busco/genome.fasta/run_eukaryota_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-eukaryota_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/versions.yml + +- name: busco test_busco_protein + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_protein -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 7a65e6cbb6c56a2ea4e739ae0aa3297d + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/logs/hmmsearch_err.log + - path: 
output/busco/test-bacteria_odb10-busco/proteome.fasta/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/full_table.tsv + md5sum: 0e34f1011cd83ea1d5d5103ec62b8922 + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/missing_busco_list.tsv + md5sum: 9939309df2da5419de88c32d1435c779 + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/proteome.fasta/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/versions.yml + +- name: busco test_busco_transcriptome + command: nextflow run ./tests/modules/nf-core/busco -entry test_busco_transcriptome -c ./tests/config/nextflow.config + tags: + - busco + files: + - path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.txt + contains: + - "BUSCO version" + - "The lineage dataset is" + - "BUSCO was run in mode" + - "Complete BUSCOs" + - "Missing BUSCOs" + - "Dependencies and versions" + - path: 
output/busco/test-bacteria_odb10-busco.batch_summary.txt + md5sum: 46118ecf60d1b87d22b96d80f4f03632 + - path: output/busco/test-bacteria_odb10-busco/logs/busco.log + contains: + - "DEBUG:busco.run_BUSCO" + - "Results from dataset" + - "how to cite BUSCO" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/.checkpoint + contains: + - "Tool: makeblastdb" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.ndb + md5sum: 3788c017fe5e6f0f58224e9cdd21822b + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nhr + md5sum: 8ecd2ce392bb5e25ddbe1d85f879582e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nin + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.njs + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.not + md5sum: 0c340e376c7e85d19f82ec1a833e6a6e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nsq + md5sum: 532d5c0a7ea00fe95ca3c97cb3be6198 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.ntf + md5sum: de1250813f0c7affc6d12dac9d0fb6bb + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/blast_db/test1.contigs.fa.nto + md5sum: ff74bd41f9cc9b011c63a32c4f7693bf + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/hmmsearch_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/hmmsearch_out.log + contains: + - "# hmmsearch :: search profile(s) against a sequence database" + - "# target sequence database:" + - "Internal pipeline statistics summary:" + - "[ok]" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/makeblastdb_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/makeblastdb_out.log + contains: + - "Building a new DB" + - 
"Adding sequences from FASTA" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/tblastn_err.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/logs/tblastn_out.log + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/.checkpoint + contains: + - "Tool: tblastn" + - "Completed" + - "jobs" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/coordinates.tsv + md5sum: cc30eed321944af293452bdbcfc24292 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_101.temp + md5sum: 73e9c65fc83fedc58f57f09b08f08238 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_119.temp + md5sum: 7fa4cc7955ec0cc36330a221c579b975 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_129.temp + md5sum: 6f1601c875d019e3f6f1f98ed8e988d4 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_138.temp + md5sum: 3f8e034686cd240c2330650d791bcae2 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_143.temp + md5sum: df3dfa8e9ba30ed70cf75b5e7abf2179 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_172.temp + md5sum: 7d463e0e6cf7169bc9077d8dc776dda1 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_178.temp + md5sum: 2288edf7fa4f88f51b4cf4d94086f77e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_188.temp + md5sum: 029906abbad6d87fc57830dd548cac24 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_195.temp + md5sum: 
4937f3b348774a31b1160a00297c29cc + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_210.temp + md5sum: afcb20ba4c466479d6b91c8c62251e1f + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_232.temp + md5sum: 2e1e823ce017345bd998191a39fa9924 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_268.temp + md5sum: 08c2d82c34ecffbe1c638b410349412e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_29.temp + md5sum: cd9b63cf93524284781535c888313764 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_44.temp + md5sum: d1929b742b24ebe379bf4801ca882dca + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_58.temp + md5sum: 69215765b010c05336538cb322c900b3 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_72.temp + md5sum: 6feaa1cc3b0899a147ea9d466878f3e3 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_80.temp + md5sum: 13625eae14e860a96ce17cd4e37e9d01 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_81.temp + md5sum: e14b2484649b0dbc8926815c207b806d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_93.temp + md5sum: 6902c93691df00e690faea914c71839e + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/sequences/k141_97.temp + md5sum: 0a0d9d38a83acbd5ad43c29cdf429988 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/blast_output/tblastn.tsv + contains: + - "TBLASTN" + - "BLAST processed" + 
- "queries" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/fragmented_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/multi_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/busco_sequences/single_copy_busco_sequences.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/full_table.tsv + md5sum: 24df25199e13c88bd892fc3e7b541ca0 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/hmmer_output.tar.gz + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/missing_busco_list.tsv + md5sum: e7232e2b8cca4fdfdd9e363b39ebbc81 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/short_summary.json + contains: + - "one_line_summary" + - "mode" + - "dataset" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/short_summary.txt + contains: + - "# BUSCO version is:" + - "Results:" + - "busco:" + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/run_bacteria_odb10/single_copy_proteins.faa + md5sum: e04b9465733577ae6e4bccb7aa01e720 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1024388at2.faa + md5sum: 7333c39a20258f20c7019ea0cd83157c + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1054741at2.faa + md5sum: ebb481e77a824685fbe04d8a2f3a0d7d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1093223at2.faa + md5sum: 34621c7d499034e8f8e6b92fd4020a93 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1151822at2.faa + md5sum: aa89ca381c1c70c9c4e1380351ca7c2a + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/143460at2.faa + md5sum: 
f2e91d78b8dd3722840378789f29e8c8 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1491686at2.faa + md5sum: 73c25aef5c9cba7f4151804941b146ea + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1504821at2.faa + md5sum: cda556018d1f84ebe517e89f6fc107d0 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1574817at2.faa + md5sum: a9096c9fb8b25c78a72871ab0463acdc + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1592033at2.faa + md5sum: e463d25ce186c0cebfd749474f3a4c64 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1623045at2.faa + md5sum: f2cfd241590c6d8377286d6135480937 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1661836at2.faa + md5sum: 586569546fb9861502468e3d9ba2775c + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1674344at2.faa + md5sum: 24c658bee14ad84b062d81ad96642eb8 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1698718at2.faa + md5sum: 0b8e26ddf5149bbd8805be7af125208d + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/1990650at2.faa + md5sum: 159320712ee01fb2ccb31a25df44eead + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/223233at2.faa + md5sum: 812629c0b06ac3d18661c2ca78de0c08 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/402899at2.faa + md5sum: f7ff4e1591342d30b77392a2e84b57d9 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/505485at2.faa + md5sum: 7b34a24fc49c540d46fcf96ff5129564 + - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/665824at2.faa + md5sum: 4cff2df64f6bcaff8bc19c234c8bcccd + - path: 
output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/776861at2.faa
+      md5sum: 613af7a3fea30ea2bece66f603b9284a
+    - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/874197at2.faa
+      md5sum: a7cd1b13c9ef91c7ef4e31614166f197
+    - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/932854at2.faa
+      md5sum: fe313ffd5efdb0fed887a04fba352552
+    - path: output/busco/test-bacteria_odb10-busco/test1.contigs.fa/translated_proteins/95696at2.faa
+      md5sum: 4e1f30a2fea4dfbf9bb7fae2700622a0
+    - path: output/busco/versions.yml
diff --git a/modules/nf-core/busco/busco/tests/tags.yml b/modules/nf-core/busco/busco/tests/tags.yml
new file mode 100644
index 0000000..7c4d283
--- /dev/null
+++ b/modules/nf-core/busco/busco/tests/tags.yml
@@ -0,0 +1,2 @@
+busco/busco:
+  - "modules/nf-core/busco/busco/**"
diff --git a/modules/nf-core/gfastats/environment.yml b/modules/nf-core/gfastats/environment.yml
new file mode 100644
index 0000000..1c875ce
--- /dev/null
+++ b/modules/nf-core/gfastats/environment.yml
@@ -0,0 +1,7 @@
+name: gfastats
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gfastats=1.3.6
diff --git a/modules/nf-core/gfastats/main.nf b/modules/nf-core/gfastats/main.nf
new file mode 100644
index 0000000..8db239a
--- /dev/null
+++ b/modules/nf-core/gfastats/main.nf
@@ -0,0 +1,66 @@
+process GFASTATS { // Runs gfastats on one assembly: stdout goes to <prefix>.assembly_summary, the converted assembly to <prefix>.<out_fmt>.gz
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gfastats:1.3.6--hdcf5f25_3':
+        'biocontainers/gfastats:1.3.6--hdcf5f25_3' }"
+
+    input:
+    tuple val(meta), path(assembly) // input.[fasta|fastq|gfa][.gz]
+    val out_fmt // output format (fasta/fastq/gfa)
+    val genome_size // estimated genome size for NG* statistics (optional).
+    val target // target specific sequence by header, optionally with coordinates (optional).
+    path agpfile // -a --agp-to-path converts input agp to path and replaces existing paths.
+    path include_bed // -i --include-bed generates output on a subset list of headers or coordinates in 0-based bed format.
+    path exclude_bed // -e --exclude-bed opposite of --include-bed. They can be combined (no coordinates).
+    path instructions // -k --swiss-army-knife set of instructions provided as an ordered list.
+
+    output:
+    tuple val(meta), path("*.assembly_summary"), emit: assembly_summary
+    tuple val(meta), path("*.${out_fmt}.gz") , emit: assembly
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: '' // extra gfastats options supplied through task.ext.args
+    def prefix = task.ext.prefix ?: "${meta.id}" // output basename; falls back to the sample id
+    def agp = agpfile ? "--agp-to-path $agpfile" : "" // fix: was "$agp" (the variable being defined), so the staged AGP file never reached the command line
+    def ibed = include_bed ? "--include-bed $include_bed" : "" // each optional path adds its flag only when a file was supplied
+    def ebed = exclude_bed ? "--exclude-bed $exclude_bed" : ""
+    def sak = instructions ? "--swiss-army-knife $instructions" : ""
+    """
+    gfastats \\
+        $args \\
+        --threads $task.cpus \\
+        $agp \\
+        $ibed \\
+        $ebed \\
+        $sak \\
+        --out-format ${prefix}.${out_fmt}.gz \\
+        $assembly \\
+        $genome_size \\
+        $target \\
+        > ${prefix}.assembly_summary
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gfastats: \$( gfastats -v | sed '1!d;s/.*v//' )
+    END_VERSIONS
+    """
+
+    stub: // creates empty placeholder outputs; still invokes gfastats -v for the version string
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.${out_fmt}.gz
+    touch ${prefix}.assembly_summary
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gfastats: \$( gfastats -v | sed '1!d;s/.*v//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gfastats/meta.yml b/modules/nf-core/gfastats/meta.yml
new file mode 100644
index 0000000..d0e97a8
--- /dev/null
+++ b/modules/nf-core/gfastats/meta.yml
@@ -0,0 +1,72 @@
+name: "gfastats"
+description: |
+  A single fast and exhaustive tool for summary statistics and simultaneous *fa*
+  (fasta, fastq, gfa [.gz]) genome assembly file manipulation.
+keywords:
+  - gfastats
+  - fasta
+  - genome assembly
+  - genome summary
+  - genome manipulation
+  - genome statistics
+tools:
+  - "gfastats":
+      description: "The swiss army knife for genome assembly."
+      homepage: "https://github.com/vgl-hub/gfastats"
+      documentation: "https://github.com/vgl-hub/gfastats/tree/main/instructions"
+      tool_dev_url: "https://github.com/vgl-hub/gfastats"
+      doi: "10.1093/bioinformatics/btac460"
+      licence: "['MIT']"
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - assembly:
+      type: file
+      description: Draft assembly file
+      pattern: "*.{fasta,fastq,gfa}(.gz)?"
+  - out_fmt:
+      type: string
+      description: Output format (fasta, fastq, gfa)
+  - genome_size:
+      type: integer
+      description: estimated genome size (bp) for NG* statistics (optional).
+  - target:
+      type: string
+      description: target specific sequence by header, optionally with coordinates (optional).
+  - agpfile:
+      type: file
+      description: converts input agp to path and replaces existing paths.
+  - include_bed:
+      type: file
+      description: generates output on a subset list of headers or coordinates in 0-based bed format.
+  - exclude_bed:
+      type: file
+      description: opposite of --include-bed. They can be combined (no coordinates).
+  - instructions:
+      type: file
+      description: set of instructions provided as an ordered list.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - assembly_summary:
+      type: file
+      description: Assembly summary statistics file
+      pattern: "*.assembly_summary"
+  - assembly:
+      type: file
+      description: The assembly as modified by gfastats
+      pattern: "*.{fasta,fastq,gfa}.gz"
+authors:
+  - "@mahesh-panchal"
+maintainers:
+  - "@mahesh-panchal"
diff --git a/modules/nf-core/merquryfk/merquryfk/environment.yml b/modules/nf-core/merquryfk/merquryfk/environment.yml
new file mode 100644
index 0000000..44a5ee9
--- /dev/null
+++ b/modules/nf-core/merquryfk/merquryfk/environment.yml
@@ -0,0 +1,5 @@
+name: merquryfk_merquryfk
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
diff --git a/modules/nf-core/merquryfk/merquryfk/main.nf b/modules/nf-core/merquryfk/merquryfk/main.nf
new file mode 100644
index 0000000..ac163da
--- /dev/null
+++ b/modules/nf-core/merquryfk/merquryfk/main.nf
@@ -0,0 +1,58 @@
+process MERQURYFK_MERQURYFK { // Runs MerquryFK on an assembly (plus haplotigs) against a FastK k-mer table; container-only, no conda directive
+    tag "$meta.id"
+    label 'process_medium'
+
+    // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
+    container 'ghcr.io/nbisweden/fastk_genescopefk_merquryfk:1.2'
+
+    input:
+    tuple val(meta), path(fastk_hist), path(fastk_ktab), path(assembly), path(haplotigs) // fastk_hist is staged but never named on the command line; of fastk_ktab only the entry ending in ".ktab" is passed
+
+    output:
+    tuple val(meta), path("${prefix}.completeness.stats") , emit: stats
+    tuple val(meta), path("${prefix}.*_only.bed") , emit: bed
+    tuple val(meta), path("${prefix}.*.qv") , emit: assembly_qv
+    tuple val(meta), path("${prefix}.*.spectra-cn.fl.png"), emit: spectra_cn_fl_png, optional: true
+    tuple val(meta), path("${prefix}.*.spectra-cn.fl.pdf"), emit: spectra_cn_fl_pdf, optional: true
+    tuple val(meta), path("${prefix}.*.spectra-cn.ln.png"), emit: spectra_cn_ln_png, optional: true
+    tuple val(meta), path("${prefix}.*.spectra-cn.ln.pdf"), emit: spectra_cn_ln_pdf, optional: true
+    tuple val(meta), path("${prefix}.*.spectra-cn.st.png"), emit: spectra_cn_st_png, optional: true
+    tuple val(meta), path("${prefix}.*.spectra-cn.st.pdf"), emit: spectra_cn_st_pdf, optional: true
+    tuple val(meta), path("${prefix}.qv") , emit: qv
+    tuple val(meta), path("${prefix}.spectra-asm.fl.png") , emit: spectra_asm_fl_png, optional: true
+    tuple val(meta), path("${prefix}.spectra-asm.fl.pdf") , emit: spectra_asm_fl_pdf, optional: true
+    tuple val(meta), path("${prefix}.spectra-asm.ln.png") , emit: spectra_asm_ln_png, optional: true
+    tuple val(meta), path("${prefix}.spectra-asm.ln.pdf") , emit: spectra_asm_ln_pdf, optional: true
+    tuple val(meta), path("${prefix}.spectra-asm.st.png") , emit: spectra_asm_st_png, optional: true
+    tuple val(meta), path("${prefix}.spectra-asm.st.pdf") , emit: spectra_asm_st_pdf, optional: true
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "MERQURYFK_MERQURYFK module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args = task.ext.args ?: '' // extra MerquryFK options supplied through task.ext.args
+    prefix = task.ext.prefix ?: "${meta.id}" // assigned without `def` so the ${prefix} patterns in the output: block can resolve it
+    def FASTK_VERSION = 'f18a4e6d2207539f7b84461daebc54530a9559b0' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    def MERQURY_VERSION = '8ae344092df5dcaf83cfb7f90f662597a9b1fc61' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    MerquryFK \\
+        $args \\
+        -T$task.cpus \\
+        ${fastk_ktab.find{ it.toString().endsWith(".ktab") }} \\
+        $assembly \\
+        $haplotigs \\
+        $prefix
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fastk: $FASTK_VERSION
+        merquryfk: $MERQURY_VERSION
+        r: \$( R --version | sed '1!d; s/.*version //; s/ .*//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/merquryfk/merquryfk/meta.yml b/modules/nf-core/merquryfk/merquryfk/meta.yml
new file mode 100644
index 0000000..7d4af79
--- /dev/null
+++ b/modules/nf-core/merquryfk/merquryfk/meta.yml
@@ -0,0 +1,112 @@
+name: "merquryfk_merquryfk"
+description: FastK based version of Merqury
+keywords:
+  - Merqury
+  - reference-free
+  - assembly evaluation
+tools:
+  - "merquryfk":
+      description: "FastK based version of Merqury"
+      homepage: "https://github.com/thegenemyers/MERQURY.FK"
+      tool_dev_url: "https://github.com/thegenemyers/MERQURY.FK"
+      licence: "https://github.com/thegenemyers/MERQURY.FK/blob/main/LICENSE"
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ]
+  - fastk_hist:
+      type: file
+      description: A histogram file from the program FastK
+      pattern: "*.hist"
+  - fastk_ktab:
+      type: file
+      description: Histogram ktab files from the program FastK (option -t)
+      pattern: "*.ktab*"
+  - assembly:
+      type: file
+      description: Genome (primary) assembly files (fasta format)
+      pattern: "*.fasta"
+  - haplotigs:
+      type: file
+      description: Assembly haplotigs (fasta format)
+      pattern: "*.fasta"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - stats:
+      type: file
+      description: Assembly statistics file
+      pattern: "*.completeness.stats"
+  - bed:
+      type: file
+      description: Assembly only kmer positions not supported by reads in bed format
+      pattern: "*_only.bed"
+  - spectra_cn_fl_png:
+      type: file
+      description: "Unstacked copy number spectra filled plot in PNG format"
+      pattern: "*.spectra-cn.fl.png"
+  - spectra_cn_ln_png:
+      type: file
+      description: "Unstacked copy number spectra line plot in PNG format"
+      pattern: "*.spectra-cn.ln.png"
+  - spectra_cn_st_png:
+      type: file
+      description: "Stacked copy number spectra line plot in PNG format"
+      pattern: "*.spectra-cn.st.png"
+  - spectra_asm_fl_png:
+      type: file
+      description: "Unstacked assembly spectra filled plot in PNG format"
+      pattern: "*.spectra-asm.fl.png"
+  - spectra_asm_ln_png:
+      type: file
+      description: "Unstacked assembly spectra line plot in PNG format"
+      pattern: "*.spectra-asm.ln.png"
+  - spectra_asm_st_png:
+      type: file
+      description: "Stacked assembly spectra line plot in PNG format"
+      pattern: "*.spectra-asm.st.png"
+  - spectra_cn_fl_pdf:
+      type: file
+      description: "Unstacked copy number spectra filled plot in PDF format"
+      pattern: "*.spectra-cn.fl.pdf"
+  - spectra_cn_ln_pdf:
+      type: file
+      description: "Unstacked copy number spectra line plot in PDF
format"
+      pattern: "*.spectra-cn.ln.pdf"
+  - spectra_cn_st_pdf:
+      type: file
+      description: "Stacked copy number spectra line plot in PDF format"
+      pattern: "*.spectra-cn.st.pdf"
+  - spectra_asm_fl_pdf:
+      type: file
+      description: "Unstacked assembly spectra filled plot in PDF format"
+      pattern: "*.spectra-asm.fl.pdf"
+  - spectra_asm_ln_pdf:
+      type: file
+      description: "Unstacked assembly spectra line plot in PDF format"
+      pattern: "*.spectra-asm.ln.pdf"
+  - spectra_asm_st_pdf:
+      type: file
+      description: "Stacked assembly spectra line plot in PDF format"
+      pattern: "*.spectra-asm.st.pdf"
+  - assembly_qv:
+      type: file
+      description: "error and qv table for each scaffold of the assembly"
+      pattern: "*.qv"
+  - qv:
+      type: file
+      description: "error and qv of each assembly as a whole"
+      pattern: "*.qv"
+authors:
+  - "@mahesh-panchal"
+maintainers:
+  - "@mahesh-panchal"
diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml
new file mode 100644
index 0000000..41e8fe9
--- /dev/null
+++ b/modules/nf-core/minimap2/align/environment.yml
@@ -0,0 +1,11 @@
+name: minimap2_align
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - bioconda::htslib=1.20
+  - bioconda::minimap2=2.28
+  - bioconda::samtools=1.20
diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf
new file mode 100644
index 0000000..d82dc14
--- /dev/null
+++ b/modules/nf-core/minimap2/align/main.nf
@@ -0,0 +1,78 @@
+process MINIMAP2_ALIGN { // Aligns reads with minimap2; emits either PAF, or a samtools-sorted BAM (optionally with an index)
+    tag "$meta.id"
+    label 'process_high'
+
+    // Note: the versions here need to match the versions used in the mulled container below and minimap2/index
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0' :
+        'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    tuple val(meta2), path(reference) // may be empty: `target` below then falls back to the reads themselves
+    val bam_format // truthy -> sorted BAM output, falsy -> PAF output
+    val bam_index_extension // when set (and BAM output is on), samtools also writes <prefix>.bam.<extension>
+    val cigar_paf_format // adds -c, but only for PAF output
+    val cigar_bam // adds -L, but only for BAM output
+
+    output:
+    tuple val(meta), path("*.paf") , optional: true, emit: paf
+    tuple val(meta), path("*.bam") , optional: true, emit: bam
+    tuple val(meta), path("*.bam.${bam_index_extension}"), optional: true, emit: index
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: '' // extra options for minimap2
+    def args2 = task.ext.args2 ?: '' // extra options for samtools sort
+    def args3 = task.ext.args3 ?: '' // extra options for samtools reset
+    def args4 = task.ext.args4 ?: '' // extra options for samtools fastq
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def bam_index = bam_index_extension ? "${prefix}.bam##idx##${prefix}.bam.${bam_index_extension} --write-index" : "${prefix}.bam" // value of `samtools sort -o`: the BAM name, plus "##idx##<index name>" and --write-index when an index is requested
+    def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus-1} -o ${bam_index} ${args2}" : "-o ${prefix}.paf" // BAM: minimap2 -a output piped into samtools sort; otherwise minimap2 writes the PAF file itself
+    def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
+    def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
+    def bam_input = "${reads.extension}".matches('sam|bam|cram') // true only when the stringified extension is exactly sam, bam or cram
+    def samtools_reset_fastq = bam_input ? "samtools reset --threads ${task.cpus-1} $args3 $reads | samtools fastq --threads ${task.cpus-1} $args4 |" : '' // alignment-format input goes through samtools reset | samtools fastq and is piped into minimap2
+    def query = bam_input ? "-" : reads // "-" takes the place of the reads file when the query arrives through the pipe above
+    def target = reference ?: (bam_input ? error("BAM input requires reference") : reads) // no reference: align the reads against themselves, or fail for sam/bam/cram input
+
+    """
+    $samtools_reset_fastq \\
+    minimap2 \\
+        $args \\
+        -t $task.cpus \\
+        $target \\
+        $query \\
+        $cigar_paf \\
+        $set_cigar_bam \\
+        $bam_output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub: // NOTE(review): the stub's versions.yml lists only minimap2, while the real script also records samtools — confirm this is intended
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def output_file = bam_format ? "${prefix}.bam" : "${prefix}.paf"
+    def bam_index = bam_index_extension ? "touch ${prefix}.bam.${bam_index_extension}" : "" // here a complete shell command (or nothing), unlike `bam_index` in the script block
+    def bam_input = "${reads.extension}".matches('sam|bam|cram')
+    def target = reference ?: (bam_input ? error("BAM input requires reference") : reads) // not used in the stub script; presumably kept so a sam/bam/cram input without a reference still raises the error
+
+    """
+    touch $output_file
+    ${bam_index}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml
new file mode 100644
index 0000000..8996f88
--- /dev/null
+++ b/modules/nf-core/minimap2/align/meta.yml
@@ -0,0 +1,84 @@
+name: minimap2_align
+description: A versatile pairwise aligner for genomic and spliced nucleotide sequences
+keywords:
+  - align
+  - fasta
+  - fastq
+  - genome
+  - paf
+  - reference
+tools:
+  - minimap2:
+      description: |
+        A versatile pairwise aligner for genomic and spliced nucleotide sequences.
+      homepage: https://github.com/lh3/minimap2
+      documentation: https://github.com/lh3/minimap2#uguide
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FASTA or FASTQ files of size 1 and 2 for single-end
+        and paired-end data, respectively.
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g.
[ id:'test_ref'] + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - bam_index_extension: + type: string + description: BAM alignment index extension (e.g. "bai") + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - index: + type: file + description: BAM alignment index + pattern: "*.bam.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" + - "@fellen31" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" + - "@fellen31" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 0000000..4072c17 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,441 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, [], false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta 
map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, 'bai', false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.index[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + 
process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, [], false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, 'bai', false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { 
assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.index[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.failed } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, 'bai', false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, false, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = false + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, 'bai', false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, [], true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.failed } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 0000000..12264a8 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,476 @@ +{ + "sarscov2 - bam, fasta, true, 'bai', false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta -", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam##idx##test.bam.bai --write-index" + ], + "5d426b9a5f5b2c54f1d7f1e4c238ae94", + "test.bam.bai", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-25T09:03:00.827260362" + }, + "sarscov2 - bam, fasta, true, 'bai', false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:21:37.92353539" + }, + "sarscov2 - fastq, fasta, true, 'bai', false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:29:44.669021368" + }, + "sarscov2 - fastq, fasta, false, [], false, false - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.paf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + + ], + "index": [ + + ], + "paf": [ + [ + { + "id": "test", + "single_end": true + }, + "test.paf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:15:52.738781039" + }, + "sarscov2 - fastq, fasta, true, [], false, false - stub": { + "content": 
[ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:15:23.033808223" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz test_2.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "1bc392244f228bf52cf0b5a8f6a654c9", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:18.964586894" + }, + "sarscov2 - fastq, fasta, true, [], false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "f194745c0ccfcb2a9c0aee094a08750", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:17:48.667488325" + }, + "sarscov2 - fastq, fasta, true, 'bai', false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz", + 
"@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam##idx##test.bam.bai --write-index" + ], + "f194745c0ccfcb2a9c0aee094a08750", + "test.bam.bai", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:02.517416733" + }, + "sarscov2 - bam, fasta, true, [], false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta -", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "5d426b9a5f5b2c54f1d7f1e4c238ae94", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-25T09:02:49.64829488" + }, + "sarscov2 - bam, fasta, true, [], false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:21:22.162291795" + }, + "sarscov2 - fastq, [], true, false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:ERR5069949.2151832\tLN:150", + "@SQ\tSN:ERR5069949.576388\tLN:77", + "@SQ\tSN:ERR5069949.501486\tLN:146", + "@SQ\tSN:ERR5069949.1331889\tLN:132", + "@SQ\tSN:ERR5069949.2161340\tLN:80", + "@SQ\tSN:ERR5069949.973930\tLN:79", + "@SQ\tSN:ERR5069949.2417063\tLN:150", + "@SQ\tSN:ERR5069949.376959\tLN:151", + 
"@SQ\tSN:ERR5069949.1088785\tLN:149", + "@SQ\tSN:ERR5069949.1066259\tLN:147", + "@SQ\tSN:ERR5069949.2832676\tLN:139", + "@SQ\tSN:ERR5069949.2953930\tLN:151", + "@SQ\tSN:ERR5069949.324865\tLN:151", + "@SQ\tSN:ERR5069949.2185111\tLN:150", + "@SQ\tSN:ERR5069949.937422\tLN:151", + "@SQ\tSN:ERR5069949.2431709\tLN:150", + "@SQ\tSN:ERR5069949.1246538\tLN:148", + "@SQ\tSN:ERR5069949.1189252\tLN:98", + "@SQ\tSN:ERR5069949.2216307\tLN:147", + "@SQ\tSN:ERR5069949.3273002\tLN:148", + "@SQ\tSN:ERR5069949.3277445\tLN:151", + "@SQ\tSN:ERR5069949.3022231\tLN:147", + "@SQ\tSN:ERR5069949.184542\tLN:151", + "@SQ\tSN:ERR5069949.540529\tLN:149", + "@SQ\tSN:ERR5069949.686090\tLN:150", + "@SQ\tSN:ERR5069949.2787556\tLN:106", + "@SQ\tSN:ERR5069949.2650879\tLN:150", + "@SQ\tSN:ERR5069949.2064910\tLN:149", + "@SQ\tSN:ERR5069949.2328704\tLN:150", + "@SQ\tSN:ERR5069949.1067032\tLN:150", + "@SQ\tSN:ERR5069949.3338256\tLN:151", + "@SQ\tSN:ERR5069949.1412839\tLN:147", + "@SQ\tSN:ERR5069949.1538968\tLN:150", + "@SQ\tSN:ERR5069949.147998\tLN:94", + "@SQ\tSN:ERR5069949.366975\tLN:106", + "@SQ\tSN:ERR5069949.1372331\tLN:151", + "@SQ\tSN:ERR5069949.1709367\tLN:129", + "@SQ\tSN:ERR5069949.2388984\tLN:150", + "@SQ\tSN:ERR5069949.1132353\tLN:150", + "@SQ\tSN:ERR5069949.1151736\tLN:151", + "@SQ\tSN:ERR5069949.479807\tLN:150", + "@SQ\tSN:ERR5069949.2176303\tLN:151", + "@SQ\tSN:ERR5069949.2772897\tLN:151", + "@SQ\tSN:ERR5069949.1020777\tLN:122", + "@SQ\tSN:ERR5069949.465452\tLN:151", + "@SQ\tSN:ERR5069949.1704586\tLN:149", + "@SQ\tSN:ERR5069949.1258508\tLN:151", + "@SQ\tSN:ERR5069949.986441\tLN:119", + "@SQ\tSN:ERR5069949.2674295\tLN:148", + "@SQ\tSN:ERR5069949.885966\tLN:79", + "@SQ\tSN:ERR5069949.2342766\tLN:151", + "@SQ\tSN:ERR5069949.3122970\tLN:127", + "@SQ\tSN:ERR5069949.3279513\tLN:72", + "@SQ\tSN:ERR5069949.309410\tLN:151", + "@SQ\tSN:ERR5069949.532979\tLN:149", + "@SQ\tSN:ERR5069949.2888794\tLN:151", + "@SQ\tSN:ERR5069949.2205229\tLN:150", + "@SQ\tSN:ERR5069949.786562\tLN:151", + 
"@SQ\tSN:ERR5069949.919671\tLN:151", + "@SQ\tSN:ERR5069949.1328186\tLN:151", + "@SQ\tSN:ERR5069949.870926\tLN:149", + "@SQ\tSN:ERR5069949.2257580\tLN:151", + "@SQ\tSN:ERR5069949.3249622\tLN:77", + "@SQ\tSN:ERR5069949.611123\tLN:125", + "@SQ\tSN:ERR5069949.651338\tLN:142", + "@SQ\tSN:ERR5069949.169513\tLN:92", + "@SQ\tSN:ERR5069949.155944\tLN:150", + "@SQ\tSN:ERR5069949.2033605\tLN:150", + "@SQ\tSN:ERR5069949.2730382\tLN:142", + "@SQ\tSN:ERR5069949.2125592\tLN:150", + "@SQ\tSN:ERR5069949.1062611\tLN:151", + "@SQ\tSN:ERR5069949.1778133\tLN:151", + "@SQ\tSN:ERR5069949.3057020\tLN:95", + "@SQ\tSN:ERR5069949.2972968\tLN:141", + "@SQ\tSN:ERR5069949.2734474\tLN:149", + "@SQ\tSN:ERR5069949.856527\tLN:151", + "@SQ\tSN:ERR5069949.2098070\tLN:151", + "@SQ\tSN:ERR5069949.1552198\tLN:150", + "@SQ\tSN:ERR5069949.2385514\tLN:150", + "@SQ\tSN:ERR5069949.2270078\tLN:151", + "@SQ\tSN:ERR5069949.114870\tLN:150", + "@SQ\tSN:ERR5069949.2668880\tLN:147", + "@SQ\tSN:ERR5069949.257821\tLN:139", + "@SQ\tSN:ERR5069949.2243023\tLN:150", + "@SQ\tSN:ERR5069949.2605155\tLN:146", + "@SQ\tSN:ERR5069949.1340552\tLN:151", + "@SQ\tSN:ERR5069949.1561137\tLN:150", + "@SQ\tSN:ERR5069949.2361683\tLN:149", + "@SQ\tSN:ERR5069949.2521353\tLN:150", + "@SQ\tSN:ERR5069949.1261808\tLN:149", + "@SQ\tSN:ERR5069949.2734873\tLN:98", + "@SQ\tSN:ERR5069949.3017828\tLN:107", + "@SQ\tSN:ERR5069949.573706\tLN:150", + "@SQ\tSN:ERR5069949.1980512\tLN:151", + "@SQ\tSN:ERR5069949.1014693\tLN:150", + "@SQ\tSN:ERR5069949.3184655\tLN:150", + "@SQ\tSN:ERR5069949.29668\tLN:89", + "@SQ\tSN:ERR5069949.3258358\tLN:151", + "@SQ\tSN:ERR5069949.1476386\tLN:151", + "@SQ\tSN:ERR5069949.2415814\tLN:150", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a test_1.fastq.gz test_1.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "16c1c651f8ec67383bcdee3c55aed94f", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", 
+ "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:34.246998277" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 0000000..39dba37 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml new file mode 100644 index 0000000..cd366d6 --- /dev/null +++ b/modules/nf-core/samtools/merge/environment.yml @@ -0,0 +1,8 @@ +name: samtools_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf new file mode 100644 index 0000000..693b1d8 --- /dev/null +++ b/modules/nf-core/samtools/merge/main.nf @@ -0,0 +1,61 @@ +process SAMTOOLS_MERGE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai") , optional:true, emit: crai + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? 
input_files[0].getExtension() : input_files.getExtension() + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ + $input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: (task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}") /* same naming as the script block: ext.prefix first; ext.suffix kept only as a legacy fallback */ + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + def index_type = file_type == "bam" ? "csi" : "crai" + def index = args.contains("--write-index") ? "touch ${prefix}.${index_type}" : "" + """ + touch ${prefix}.${file_type} + ${index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml new file mode 100644 index 0000000..2e8f3db --- /dev/null +++ b/modules/nf-core/samtools/merge/meta.yml @@ -0,0 +1,83 @@ +name: samtools_merge +description: Merge BAM or CRAM file +keywords: + - merge + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - input_files: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - cram: + type: file + description: CRAM file + pattern: "*.{cram}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" +authors: + - "@drpatelh" + - "@yuukiiwa" + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@yuukiiwa" + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/merge/tests/index.config b/modules/nf-core/samtools/merge/tests/index.config new file mode 100644 index 0000000..8c5668c --- /dev/null +++ b/modules/nf-core/samtools/merge/tests/index.config @@ -0,0 +1,3 @@ +process { + ext.args = "--write-index" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/merge/tests/main.nf.test b/modules/nf-core/samtools/merge/tests/main.nf.test new file mode 100644 index 0000000..40b36e8 --- /dev/null +++ b/modules/nf-core/samtools/merge/tests/main.nf.test @@ -0,0 +1,137 @@ +nextflow_process { + + name "Test Process SAMTOOLS_MERGE" + script "../main.nf" + process "SAMTOOLS_MERGE" + +
tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/merge" + + test("bams") { + + config "./index.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.methylated.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true) ] + ]) + input[1] = [[],[]] + input[2] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bams_bam") }, + { assert snapshot(process.out.cram).match("bams_cram") }, + { assert snapshot(file(process.out.csi[0][1]).name).match("bams_csi") }, + { assert snapshot(process.out.crai).match("bams_crai") }, + { assert snapshot(process.out.versions).match("bams_versions") } + ) + } + } + + test("crams") { + + config "./index.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(file(process.out.cram[0][1]).name).match("crams_cram") }, + { assert snapshot(process.out.bam).match("crams_bam") }, + { assert snapshot(file(process.out.crai[0][1]).name).match("crams_crai") }, + { assert snapshot(process.out.csi).match("crams_csi") }, + { assert snapshot(process.out.versions).match("crams_versions") } + ) + } + } + + test("bam") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.methylated.sorted.bam', checkIfExists: true) ] + ]) + input[1] = [[],[]] + input[2] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bam_bam") }, + { assert snapshot(process.out.cram).match("bam_cram") }, + { assert snapshot(process.out.crai).match("bam_crai") }, + { assert snapshot(process.out.csi).match("bam_csi") }, + { assert snapshot(process.out.versions).match("bam_versions") } + ) + } + } + + test("bams_stub") { + + config "./index.config" + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.methylated.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true) ] + ]) + input[1] = [[],[]] + input[2] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bams_stub_bam") }, + { assert snapshot(process.out.cram).match("bams_stub_cram") }, + { assert snapshot(file(process.out.csi[0][1]).name).match("bams_stub_csi") }, + { assert 
snapshot(process.out.crai).match("bams_stub_crai") }, + { assert snapshot(process.out.versions).match("bams_stub_versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/merge/tests/main.nf.test.snap b/modules/nf-core/samtools/merge/tests/main.nf.test.snap new file mode 100644 index 0000000..17bc846 --- /dev/null +++ b/modules/nf-core/samtools/merge/tests/main.nf.test.snap @@ -0,0 +1,228 @@ +{ + "crams_cram": { + "content": [ + "test.cram" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:00.647389" + }, + "bams_stub_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:19.937013" + }, + "bams_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:49:24.928616" + }, + "bams_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:49:24.923289" + }, + "bams_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:49:24.925716" + }, + "crams_csi": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:00.655959" + }, + "bam_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:10.319539" + }, + "bam_versions": { + "content": [ + [ + "versions.yml:md5,84dab54b9812780df48f5cecef690c34" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:46:35.851936597" + }, + "bams_csi": { + "content": [ + "test.bam.csi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:49:24.92719" + }, + "bams_stub_csi": { + "content": [ + "test.csi" + ], + "meta": { + "nf-test": 
"0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:19.940498" + }, + "bam_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:10.328852" + }, + "bams_stub_versions": { + "content": [ + [ + "versions.yml:md5,84dab54b9812780df48f5cecef690c34" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:46:41.405707643" + }, + "bam_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:10.324219" + }, + "bams_stub_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:19.933153" + }, + "bams_versions": { + "content": [ + [ + "versions.yml:md5,84dab54b9812780df48f5cecef690c34" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:45:51.695689923" + }, + "crams_bam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:00.650652" + }, + "crams_versions": { + "content": [ + [ + "versions.yml:md5,84dab54b9812780df48f5cecef690c34" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:46:30.185392319" + }, + "bam_csi": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:10.33292" + }, + "crams_crai": { + "content": [ + "test.cram.crai" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:00.653512" + }, + "bams_stub_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:50:19.943839" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/merge/tests/tags.yml 
b/modules/nf-core/samtools/merge/tests/tags.yml new file mode 100644 index 0000000..b869abc --- /dev/null +++ b/modules/nf-core/samtools/merge/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/merge: + - "modules/nf-core/samtools/merge/**" diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 0000000..36a12ea --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,8 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 0000000..8e01909 --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,73 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta) , path(bam) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + def reference = fasta ? "--reference ${fasta}" : "" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ + """ + samtools cat \\ + --threads $task.cpus \\ + ${bam} \\ + | \\ + samtools sort \\ + $args \\ + -T ${prefix} \\ + --threads $task.cpus \\ + ${reference} \\ + -o ${prefix}.${extension} \\ + - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + """ + touch ${prefix}.${extension} + if [ "${extension}" == "bam" ]; + then + touch ${prefix}.${extension}.csi + elif [ "${extension}" == "cram" ]; + then + touch ${prefix}.${extension}.crai + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 0000000..341a7d0 --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,71 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file(s) + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta,fna}" + optional: true +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Sorted CRAM file + pattern: "*.{cram}" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@matthdsm" +maintainers: + - "@drpatelh" + - "@ewels" + - "@matthdsm" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 0000000..c2ea9c7 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,128 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("bam") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.bam, + process.out.csi.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, + process.out.versions + ).match()} + ) + } + } + + test("cram") { + + config "./nextflow_cram.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.cram.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.crai.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.versions + ).match()} + ) + } + } + + test("bam - stub") { + + options "-stub" + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("cram - stub") { + + options "-stub" + config "./nextflow_cram.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { 
assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 0000000..da38d5d --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,192 @@ +{ + "cram": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai" + ] + ], + [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:19:37.196205" + }, + "bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + + ], + "cram": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:54:46.580756" + }, + "cram - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + + ], + "4": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ], + "bam": [ + + ], + 
"crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:57:30.505698" + }, + "bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,21c992d59615936b99f2ad008aa54400" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi" + ] + ], + [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:54:25.872954" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 0000000..f642771 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + ext.args = "--write-index" + } + +} diff --git a/modules/nf-core/samtools/sort/tests/nextflow_cram.config b/modules/nf-core/samtools/sort/tests/nextflow_cram.config new file mode 100644 index 0000000..3a8c018 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow_cram.config @@ -0,0 +1,8 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + ext.args = "--write-index --output-fmt cram" + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 0000000..cd63ea2 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - 
tests/modules/nf-core/samtools/sort/** diff --git a/nextflow.config b/nextflow.config index fc630a6..3fb2d75 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,14 +11,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options - input = null - - // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null + input = null // Boilerplate options outdir = null @@ -268,3 +261,25 @@ def check_max(obj, type) { } } } + +/** + * Returns a channel with the path if it's defined, otherwise returns a default channel. + * + * @param path The path to include into the channel + * @param default_channel A channel to use as the default if no path is defined. + * @return A channel with a path, or the default channel + */ +def readWithDefault( String path, Object default_channel ) { + path ? Channel.fromPath( path, checkIfExists: true ) : default_channel +} + +/** + * Returns a channel with the file defined by the path resolved against the directory. + * + * @param path The path of the file relative to the directory in dir + * @param dir A channel with a directory. 
+ * @return A channel with a path relative to the dir path + */ +def resolveFileFromDir ( String path, Object dir ){ + dir.map{ results -> file( results.resolve( path ) ) } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index b0fd6d6..55b26b3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -17,11 +17,11 @@ "format": "file-path", "exists": true, "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", + "mimetype": "text/yaml", + "pattern": "^\\S+\\.yaml$", + "description": "Path to yaml file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", - "fa_icon": "fas fa-file-csv" + "fa_icon": "fas fa-file-yaml" }, "outdir": { "type": "string", diff --git a/workflows/ear.nf b/workflows/ear.nf index 67d3ef6..b35a949 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -4,12 +4,20 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_ear_pipeline' +include { NEXTFLOW_RUN as CURATIONPRETEXT } from '../modules/local/nextflow/run' +include { NEXTFLOW_RUN as BLOBTOOLKIT } from '../modules/local/nextflow/run' + +include { YAML_INPUT } from '../subworkflows/local/yaml_input' +include { 
GENERATE_SAMPLESHEET } from '../modules/local/generate_samplesheet' +include { GFASTATS } from '../modules/nf-core/gfastats/main' +include { PE_MAPPING } from '../subworkflows/local/pe_mapping' +include { SE_MAPPING } from '../subworkflows/local/se_mapping' +include { SAMTOOLS_SORT } from '../modules/nf-core/samtools/sort/main' + +include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_ear_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -20,21 +28,180 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_ear_ workflow EAR { take: - ch_samplesheet // channel: samplesheet read in from --input + ch_input main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_align_bam = Channel.empty() + + // + // MODULE: YAML_INPUT + // + YAML_INPUT(ch_input) + reference = YAML_INPUT.out.reference + reference.view() + + // + // MODULE: Run Sanger-ToL/CurationPretext + // - This was built using: https://github.com/mahesh-panchal/nf-cascade + // + CURATIONPRETEXT( + "sanger-tol/curationpretext", + [ + "-r 1.0.0", + "--input", + reference, + "--longread", + YAML_INPUT.out.longread_dir, + "--cram", + YAML_INPUT.out.cpretext_hic_dir, + "$params.outdir/curationpretext", + "-profile singularity,sanger" + ].join(" ").trim(), // workflow opts + Channel.value([]), //readWithDefault( params.demo.params_file, Channel.value([]) ), // params file + Channel.value([]), // samplesheet - not used by this pipeline + Channel.value([]) //readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config + + ) + + // + // MODULE: ASSEMBLY STATISTICS FOR THE FASTA + // + GFASTATS( + YAML_INPUT.out.reference, 
+ "fasta", + [], + [], + [], + [], + [], + [] + ) + + // // + // // LOGIC: REFORMAT A BUNCH OF CHANNELS FOR MERQUERYFK + // // + // YAML_INPUT.out.reference + // .combine() + // .combine() + // .combine() + // .map{ meta, primary, haplotigs, fastk_hist, fastk_ktab -> + // tuple( meta, + // fastk_hist, + // fastk_ktab, + // primary, + // haplotigs + // ) + // } + // .set { merquryfk_input } + + // // + // // MODULE: MERQURYFK PLOTS OF GENOME + // // + + // MERQURYFK( + // merquryfk_input + // ) + + // + // LOGIC: SANGER-TOL/BLOBTOOLKIT expects the pacbio data to be already mapped + // + platform = YAML_INPUT.out.longread_type + + YAML_INPUT.out.sample_id + .combine(YAML_INPUT.out.longread_dir) + .set {pacbio_tuple} + + if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { + // + // SUBWORKFLOW: SINGLE END MAPPING FOR ALIGNING LONGREAD DATA + // + SE_MAPPING ( + YAML_INPUT.out.reference, + pacbio_tuple, + platform + ) + ch_versions = ch_versions.mix(SE_MAPPING.out.versions) + + ch_align_bam + .mix( SE_MAPPING.out.mapped_bam ) + .set { merged_bam } + } + else if ( platform.filter { it == "illumina" } ) { + // + // SUBWORKFLOW: PAIRED END MAPPING FOR ALIGNING LONGREAD DATA + // + PE_MAPPING ( + YAML_INPUT.out.reference, + pacbio_tuple, + platform + ) + ch_versions = ch_versions.mix(PE_MAPPING.out.versions) + + ch_align_bam + .mix( PE_MAPPING.out.mapped_bam ) + .set { merged_bam } + } // - // MODULE: Run FastQC + // MODULE: SORT MAPPED BAM // - FASTQC ( - ch_samplesheet + SAMTOOLS_SORT ( + merged_bam, + YAML_INPUT.out.reference + ) + ch_versions = ch_versions.mix( SAMTOOLS_SORT.out.versions ) + + // + // MODULE: GENERATE_SAMPLESHEET creates a csv for the blobtoolkit pipeline + // + YAML_INPUT.out.sample_id + .combine(merged_bam) + .map{ sample_id, pacbio_path -> + tuple( [id: sample_id], + pacbio_path + ) + } + .set { samplesheet_input } + + + GENERATE_SAMPLESHEET( + samplesheet_input ) - ch_multiqc_files = 
ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + // + // MODULE: Run Sanger-ToL/BlobToolKit + // - This was built using: https://github.com/mahesh-panchal/nf-cascade + // + // BLOBTOOLKIT( + // "sanger-tol/blobtoolkit", + // [ + // "-r 0.4.0", + // "--input", + // GENERATE_SAMPLESHEET.out.csv, + // "--fasta", + // reference, + // "--accession", + // YAML_INPUT.out.btk_gca_accession, + // "-taxon", + // YAML_INPUT.out.btk_taxid, + // "--taxdump", + // YAML_INPUT.out.btk_ncbi_taxonomy_path, + // "--blastp", + // YAML_INPUT.out.btk_nt_diamond_database, + // "--blastn", + // YAML_INPUT.out.btk_nt_database, + // "--blastx", + // YAML_INPUT.out.btk_nt_diamond_database, + // "$params.outdir/blobtoolkit", + // "-profile singularity,sanger" + // ].join(" ").trim(), // workflow opts + // Channel.value([]),//readWithDefault( params.demo.params_file, Channel.value([]) ), // params file + // Channel.value([]),//readWithDefault( params.demo.input, Channel.value([]) ), // samplesheet + // Channel.value([])//readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config + + // ) // // Collate and save software versions @@ -47,47 +214,13 @@ workflow EAR { newLine: true ).set { ch_collated_versions } - // - // MODULE: MultiQC - // - ch_multiqc_config = Channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() - summary_params = paramsSummaryMap( workflow, parameters_schema: "nextflow_schema.json") ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true - ) - ) - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) emit: - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } From d7b8491d6675823befeee1df0c006f1dae1e0b3f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 5 Aug 2024 10:16:41 +0100 Subject: [PATCH 02/52] Last weeks addition --- subworkflows/local/pe_mapping.nf | 116 +++++++++++++++++++++++++++++++ subworkflows/local/se_mapping.nf | 115 ++++++++++++++++++++++++++++++ subworkflows/local/yaml_input.nf | 44 ++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 subworkflows/local/pe_mapping.nf create mode 100644 subworkflows/local/se_mapping.nf create mode 100644 subworkflows/local/yaml_input.nf diff --git a/subworkflows/local/pe_mapping.nf b/subworkflows/local/pe_mapping.nf new file mode 100644 index 0000000..3c41670 --- /dev/null +++ b/subworkflows/local/pe_mapping.nf @@ -0,0 +1,116 @@ +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ILLUMINA } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' + +workflow PE_MAPPING { + + take: + reference_tuple // Channel [ val(meta), path(file) ] + pacbio_tuple // Channel [ val(meta), val( str ) ] + reads_type // Channel val( str ) + + main: + 
ch_versions = Channel.empty() + + + // + // PROCESS: GETS PACBIO READ PATHS FROM READS_PATH + // + ch_grabbed_reads_path = GrabFiles( pacbio_tuple ) + + ch_grabbed_reads_path + .map { meta, files -> + tuple( files ) + } + .flatten() + .set { ch_reads_path } + + // + // PROCESS: MAKE MINIMAP INPUT CHANNEL + // + reference_tuple + .combine( ch_reads_path ) + .combine( reads_type ) + .map { meta, ref, reads_path, reads_type -> + tuple( + [ id : meta.id, + single_end : false, + readtype: reads_type.toString() + ], + reads_path, + ref, + true, + false, + false, + reads_type + ) + } + .set { pe_input } + + // + // PROCESS: MULTIMAP TO MAKE BOOLEAN ARGUMENTS + // + pe_input + .multiMap { meta, reads_path, ref, bam_output, cigar_paf, cigar_bam, reads_type -> + read_tuple : tuple( meta, read_path) + ref : ref + bool_bam_ouput : bam_output + bool_cigar_paf : cigar_paf + bool_cigar_bam : cigar_bam + } + .set { illumina_input } + + // + // MODULE: PAIRED END READ MAPPING USING MINIMAP + // + MINIMAP2_ALIGN_ILLUMINA ( + illumina_input.read_tuple, + illumina_input.ref, + illumina_input.bool_bam_ouput, + [], + illumina_input.bool_cigar_paf, + illumina_input.bool_cigar_bam + ) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_ILLUMINA.out.versions) + + ch_bams = MINIMAP2_ALIGN_ILLUMINA.out.bam + + ch_bams + .map { meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ id : file[0].toString().split('/')[-1].split('_')[0] ], // Change sample ID + file + ) + } + .set { collected_files_for_merge } + + // + // MODULE: MERGE ALL OUTPUT BAM + // + SAMTOOLS_MERGE( + collected_files_for_merge, + reference_tuple, + [[],[]] + ) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions) + + emit: + versions = ch_versions.ifEmpty(null) + mapped_bam = SAMTOOLS_MERGE.out.bam +} + +process GrabFiles { + tag "${meta.id}" + executor 'local' + + input: + tuple val(meta), path("in") + + output: + tuple val(meta), path("in/*.{fa,fasta}.{gz}") + + "true" +} \ No newline at end of 
file diff --git a/subworkflows/local/se_mapping.nf b/subworkflows/local/se_mapping.nf new file mode 100644 index 0000000..0340425 --- /dev/null +++ b/subworkflows/local/se_mapping.nf @@ -0,0 +1,115 @@ +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_SE } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' + +workflow SE_MAPPING { + + take: + reference_tuple // Channel [ val(meta), path(file) ] + pacbio_tuple // Channel [ val(meta), path(file) ] + reads_type // Channel val( str ) + + main: + ch_versions = Channel.empty() + ch_align_bams = Channel.empty() + + // + // PROCESS: GETS PACBIO READ PATHS FROM READS_PATH + // + ch_grabbed_reads_path = GrabFiles( pacbio_tuple ) + + ch_grabbed_reads_path + .map { meta, files -> + tuple( files ) + } + .flatten() + .set { ch_reads_path } + + // + // PROCESS: MAKE MINIMAP INPUT CHANNEL AND MAKE BRANCHES BASED ON INPUT READ TYPE + // + reference_tuple + .combine( ch_reads_path ) + .combine( reads_type ) + .map { meta, ref, reads_path, reads_type -> + tuple( + [ id : meta.id, + single_end : true, + readtype : reads_type.toString() + ], + reads_path, + ref, + true, + false, + false, + reads_type + ) + } + .set { minimap_se_input } + + // + // PROCESS: MULTIMAP TO MAKE BOOLEAN ARGUMENTS FOR MINIMAP HIFI MAPPING INPUT + // + minimap_se_input + .multiMap { meta, reads_path, ref, bam_output, cigar_paf, cigar_bam, reads_type -> + read_tuple : tuple( meta, reads_path) + ref : ref + bool_bam_ouput : bam_output + bool_cigar_paf : cigar_paf + bool_cigar_bam : cigar_bam + } + .set { se_input } + + // + // MOUDLES: MAPPING DIFFERENT TYPE OF READ AGAINIST REFERENCE + // + + MINIMAP2_ALIGN_SE ( + se_input.read_tuple, + se_input.ref, + se_input.bool_bam_ouput, + [], + se_input.bool_cigar_paf, + se_input.bool_cigar_bam + ) + ch_bams = MINIMAP2_ALIGN_SE.out.bam + + ch_bams + .map { meta, file -> + tuple( file ) + } + .collect() + .map { file -> + tuple ( + [ id : 
file[0].toString().split('/')[-1].split('_')[0] ], // Change sample ID + file + ) + } + .set { collected_files_for_merge } + + // + // MODULE: MERGE ALL OUTPUT BAM + // + SAMTOOLS_MERGE( + collected_files_for_merge, + reference_tuple, + [[],[]] + ) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions) + + emit: + versions = ch_versions.ifEmpty(null) + mapped_bam = SAMTOOLS_MERGE.out.bam +} + +process GrabFiles { + tag "${meta.id}" + executor 'local' + + input: + tuple val(meta), path("in") + + output: + tuple val(meta), path("in/*.{fa,fasta,fna}.{gz}") + + "true" +} \ No newline at end of file diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf new file mode 100644 index 0000000..da75cb7 --- /dev/null +++ b/subworkflows/local/yaml_input.nf @@ -0,0 +1,44 @@ +#!/usr/bin/env nextflow + +import groovy.yaml.YamlSlurper + +workflow YAML_INPUT { + take: + input_file // params.input + + main: + ch_versions = Channel.empty() + + inputs = new YamlSlurper().parse(file(params.input)) + + emit: + // + // LOGIC: Building generic channels + // + sample_id = Channel.of(inputs.assembly_id) + longread_type = Channel.of(inputs.longread.type) + longread_dir = Channel.of(inputs.longread.dir) + reference = Channel.fromPath([inputs.assembly_id], inputs.reference_file, checkIfExists: true) + + // + // LOGIC: Building CurationPretext specific channels + // + cpretext_aligner = Channel.of(inputs.curationpretext.aligner) + cpretext_telomere_motif = Channel.of([inputs.assembly_id], inputs.curationpretext.telomere_motif) + cpretext_hic_dir = Channel.of([inputs.assembly_id], inputs.curationpretext.hic_dir) + + // + // LOGIC: Building BlobToolKit specific channels + // + btk_nt_database = Channel.of([inputs.assembly_id], inputs.btk.nt_database) + btk_nt_database_prefix = Channel.of(inputs.btk.nt_database_prefix) + btk_nt_diamond_database = Channel.of(inputs.btk.diamond_nt_database_path) + btk_un_diamond_database = 
Channel.of(inputs.btk.diamond_uniprot_database_path) + btk_ncbi_taxonomy_path = Channel.of(inputs.btk.ncbi_taxonomy_path) + btk_ncbi_lineage_path = Channel.of(inputs.btk.ncbi_rankedlineage_path) + btk_btk_yaml = Channel.of(inputs.btk.btk_yaml) + btk_taxid = Channel.of([inputs.assembly_id], inputs.btk.taxid) + btk_gca_accession = Channel.of(inputs.btk.gca_accession) + + versions = ch_versions.ifEmpty(null) +} From ab69ccd5c212dbd4dbb671d5676b875b4b476ca6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 5 Aug 2024 10:16:58 +0100 Subject: [PATCH 03/52] Last weeks addition --- bin/generate_samplesheet.py | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 bin/generate_samplesheet.py diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py new file mode 100755 index 0000000..12af705 --- /dev/null +++ b/bin/generate_samplesheet.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse + +""" +A simple script to generate a csv file required for the sanger-tol/blobtoolkit pipeline-module. 
+ +Required input include the sample ID and the mapped BAM file generated with PacBio data and input FASTA assembly + +Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate a csv file for BTK") + parser.add_argument("sample_name", type=str, help="Name of sample") + parser.add_argument( + "mapped_bam_file", + type=str, + help="Path containing the mapped BAM generated with PacBio data and the ASCC input assembly", + ) + parser.add_argument("-v", "--version", action="version", version="1.0.0") + return parser.parse_args() + + +def main(): + args = parse_args() + + data_list = [] + + data_list.append("sample,datatype,datafile\n") + if args.mapped_bam_file.endswith(".bam"): + data_list.append(f"{args.sample_name},pacbio,{args.mapped_bam_file}\n") + else: + sys.exit("I was expecting a mapped BAM file") + + with open("samplesheet.csv", "w") as file: + file.write("".join(data_list)) + + +if __name__ == "__main__": + main() From c458efb415a4be7a8cb8a3783025036f9baff2cb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 5 Aug 2024 10:17:12 +0100 Subject: [PATCH 04/52] Last weeks addition --- assets/btk_draft.yaml | 17 +++++++++++++++++ assets/test.yaml | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 assets/btk_draft.yaml create mode 100755 assets/test.yaml diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml new file mode 100644 index 0000000..0e02351 --- /dev/null +++ b/assets/btk_draft.yaml @@ -0,0 +1,17 @@ +assembly: + level: bar +settings: + foo: 0 +similarity: + diamond_blastx: + foo: 0 +taxon: + class: class_name + family: family_name + genus: genus_name + kingdom: kingdom_name + name: species_name + order: order_name + phylum: phylum_name + superkingdom: superkingdom_name + taxid: 0 diff --git a/assets/test.yaml b/assets/test.yaml new file mode 100755 index 0000000..e2a9c79 --- /dev/null +++ b/assets/test.yaml @@ -0,0 +1,19 @@ +assembly_id: 
Oscheius_DF5033 +reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta +longread: + type: hifi + dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/ +curationpretext: + aligner: minimap2 + telomere_motif: TTAGG + hic_dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/ +btk: + nt_database: /home/runner/work/ascc/ascc/blastdb/ + nt_database_prefix: tiny_plasmodium_blastdb.fa + diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd + diamond_nt_database_path: /home/runner/work/ascc/ascc/diamond.dmnd + ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ + ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp + btk_yaml: /nfs/users/nfs_d/dp24/sanger-tol-ear/assets/btk_draft.yaml + taxid: 352914 + gca_accession: GCA_0001 From 04adc75a35d6d7cbbcfc508fe13536d016fc03fa Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 7 Aug 2024 14:20:13 +0100 Subject: [PATCH 05/52] Adding 2 pipeline nesting method for btk --- modules/local/sanger_tol_btk.nf | 107 ++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 modules/local/sanger_tol_btk.nf diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf new file mode 100644 index 0000000..fec146c --- /dev/null +++ b/modules/local/sanger_tol_btk.nf @@ -0,0 +1,107 @@ +process SANGER_TOL_BTK { + tag "$meta.id" + label 'process_low' + + input: + tuple val(meta), path(reference, stageAs: "REFERENCE.fa") + tuple val(meta1), path(bam) // Name needs to remain the same as previous process as they are referenced in the samplesheet + tuple val(meta2), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") + path blastp, stageAs: 
"blastp.dmnd" + path blastn + path blastx + path btk_config_file + path tax_dump + path btk_yaml, stageAs: "BTK.yaml" + val busco_lineages + val taxon + val gca_accession + + output: + tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots + path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/busco"), emit: busco_data + path("${meta.id}_btk_out/multiqc"), emit: multiqc_report + path("blobtoolkit_pipeline_info"), emit: pipeline_info + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def executor = task.ext.executor ?: "" + def profiles = task.ext.profiles ?: "" + def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + def btk_config = btk_config_file ? "-c $btk_config_file" : "" + def pipeline_version = task.ext.version ?: "main" + // YAML used to avoid the use of GCA accession number + // https://github.com/sanger-tol/blobtoolkit/issues/77 + + // Seems to be an issue where a nested pipeline can't see the files in the same directory + // Running realpath gets around this but the files copied into the folder are + // now just wasted space. 
+ + // outdir should be an arg + + // --accession draft \\ + + // blastx and blastp use the same database hence the StageAs + + + """ + $executor 'nextflow run sanger-tol/blobtoolkit \\ + -r $pipeline_version \\ + -profile $profiles \\ + --input "\$(realpath $samplesheet_csv)" \\ + --outdir ${prefix}_btk_out \\ + --fasta "\$(realpath REFERENCE.fa)" \\ + --yaml "\$(realpath BTK.yaml)" \\ + --busco_lineages $busco_lineages \\ + --taxon $taxon \\ + --taxdump "\$(realpath $tax_dump)" \\ + --blastp "\$(realpath blastp.dmnd)" \\ + --blastn "\$(realpath $blastn)" \\ + --blastx "\$(realpath $blastx)" \\ + $btk_config \\ + $args' + + mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Blobtoolkit: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def pipeline_version = task.ext.version ?: "main" + + """ + mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession + touch ${prefix}_btk_out/blobtoolkit/$gca_accession/test.json.gz + + mkdir ${prefix}_btk_out/blobtoolkit/plots + touch ${prefix}_btk_out/blobtoolkit/plots/test.png + + mkdir ${prefix}_btk_out/busco + touch ${prefix}_btk_out/busco/test.batch_summary.txt + touch ${prefix}_btk_out/busco/test.fasta.txt + touch ${prefix}_btk_out/busco/test.json + + mkdir ${prefix}_btk_out/multiqc + mkdir ${prefix}_btk_out/multiqc/multiqc_data + mkdir ${prefix}_btk_out/multiqc/multiqc_plots + touch ${prefix}_btk_out/multiqc/multiqc_report.html + + mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Blobtoolkit: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ +} From ba74c101264753f2b6df53c07f036191f0db3642 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 7 Aug 2024 14:20:31 +0100 Subject: [PATCH 06/52] 
Updates to add BTK --- assets/test.yaml | 17 ++-- conf/modules.config | 13 +-- subworkflows/local/se_mapping.nf | 18 ++--- subworkflows/local/yaml_input.nf | 77 +++++++++++++++--- workflows/ear.nf | 135 ++++++++++++++++++++----------- 5 files changed, 178 insertions(+), 82 deletions(-) diff --git a/assets/test.yaml b/assets/test.yaml index e2a9c79..f0a0fd5 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,5 +1,6 @@ assembly_id: Oscheius_DF5033 -reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta +reference_hap1: /nfs/users/nfs_d/dp24/sanger-tol-ear/test.fa +reference_hap2: /nfs/users/nfs_d/dp24/sanger-tol-ear/test.fa longread: type: hifi dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/ @@ -7,13 +8,17 @@ curationpretext: aligner: minimap2 telomere_motif: TTAGG hic_dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/ +merquryfk: + fastk_hist: "./" + fastk_ktab: "./" btk: - nt_database: /home/runner/work/ascc/ascc/blastdb/ + nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/ nt_database_prefix: tiny_plasmodium_blastdb.fa - diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd - diamond_nt_database_path: /home/runner/work/ascc/ascc/diamond.dmnd - ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ - ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp + diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd + diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd + ncbi_taxonomy_path: 
/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump + ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp btk_yaml: /nfs/users/nfs_d/dp24/sanger-tol-ear/assets/btk_draft.yaml taxid: 352914 gca_accession: GCA_0001 + lineages: "diptera_odb10,insecta_odb10" diff --git a/conf/modules.config b/conf/modules.config index d203d2b..388b183 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,17 +18,12 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: FASTQC { - ext.args = '--quiet' + withName: GFASTATS { + ext.args = '--nstar-report' } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: MERQURYFK_MERQURYFK { + ext.args = "-P." } } diff --git a/subworkflows/local/se_mapping.nf b/subworkflows/local/se_mapping.nf index 0340425..8c7ad52 100644 --- a/subworkflows/local/se_mapping.nf +++ b/subworkflows/local/se_mapping.nf @@ -15,22 +15,22 @@ workflow SE_MAPPING { // // PROCESS: GETS PACBIO READ PATHS FROM READS_PATH // - ch_grabbed_reads_path = GrabFiles( pacbio_tuple ) + ch_grabbed_reads_path = GrabFiles(pacbio_tuple) ch_grabbed_reads_path - .map { meta, files -> - tuple( files ) + .map {meta, files -> + tuple(files) } .flatten() - .set { ch_reads_path } + .set {ch_reads_path} // // PROCESS: MAKE MINIMAP INPUT CHANNEL AND MAKE BRANCHES BASED ON INPUT READ TYPE // - reference_tuple - .combine( ch_reads_path ) + ch_reads_path + .combine( reference_tuple ) .combine( reads_type ) - .map { meta, ref, reads_path, reads_type -> + .map { reads_path, meta, ref, reads_type -> tuple( [ id : meta.id, single_end : true, @@ -44,7 +44,7 @@ workflow SE_MAPPING { reads_type ) } - .set { minimap_se_input } + .set {minimap_se_input} // // 
PROCESS: MULTIMAP TO MAKE BOOLEAN ARGUMENTS FOR MINIMAP HIFI MAPPING INPUT @@ -52,7 +52,7 @@ workflow SE_MAPPING { minimap_se_input .multiMap { meta, reads_path, ref, bam_output, cigar_paf, cigar_bam, reads_type -> read_tuple : tuple( meta, reads_path) - ref : ref + ref : tuple( meta, ref) bool_bam_ouput : bam_output bool_cigar_paf : cigar_paf bool_cigar_bam : cigar_bam diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index da75cb7..51350dd 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -7,25 +7,81 @@ workflow YAML_INPUT { input_file // params.input main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() - inputs = new YamlSlurper().parse(file(params.input)) + inputs = new YamlSlurper().parse(file(params.input)) + + sample_id = Channel.of(inputs.assembly_id) + longread_type = Channel.of(inputs.longread.type) + longread_dir = Channel.of(inputs.longread.dir) + + sample_id + .combine(longread_dir) + .map{sample, dir -> + tuple([id: sample], + dir + ) + } + .set {pacbio_tuple} + + reference_1 = Channel.fromPath(inputs.reference_hap1, checkIfExists: true) + reference_2 = Channel.fromPath(inputs.reference_hap2, checkIfExists: true) + + reference_1 + .combine(sample_id) + .map{ref, sample_id -> + tuple([id:sample_id], ref) + } + .set{reference_hap1} + + + + cpretext_aligner = Channel.of(inputs.curationpretext.aligner) + cpretext_telomere_motif_raw = Channel.of(inputs.curationpretext.telomere_motif) + cpretext_hic_dir_raw = Channel.of(inputs.curationpretext.hic_dir) + + sample_id + .combine(cpretext_telomere_motif_raw) + .map{sample, dir -> + tuple([id: sample], + dir + ) + } + .set {cpretext_telomere_motif} + + sample_id + .combine(cpretext_hic_dir_raw) + .map{sample, dir -> + tuple([id: sample], + dir + ) + } + .set {cpretext_hic_dir} emit: // // LOGIC: Building generic channels // - sample_id = Channel.of(inputs.assembly_id) - longread_type = Channel.of(inputs.longread.type) - 
longread_dir = Channel.of(inputs.longread.dir) - reference = Channel.fromPath([inputs.assembly_id], inputs.reference_file, checkIfExists: true) + sample_id + longread_type // val(data) + longread_dir = inputs.longread.dir // DataVariable + pacbio_tuple // tuple (meta), path(file) + reference_hap1 = reference_hap1 // tuple (meta), path(file) + reference_hap2 = reference_2 // DataVariable + reference_path = inputs.reference_hap1 // DataVariable // // LOGIC: Building CurationPretext specific channels // - cpretext_aligner = Channel.of(inputs.curationpretext.aligner) - cpretext_telomere_motif = Channel.of([inputs.assembly_id], inputs.curationpretext.telomere_motif) - cpretext_hic_dir = Channel.of([inputs.assembly_id], inputs.curationpretext.hic_dir) + cpretext_aligner + cpretext_telomere_motif + cpretext_hic_dir_raw = inputs.curationpretext.hic_dir // DataVariable + + // + // LOGIC: MERQURY CHANNELS + // + fastk_hist = Channel.of(inputs.merquryfk.fastk_hist) + fastk_ktab = Channel.of(inputs.merquryfk.fastk_ktab) // // LOGIC: Building BlobToolKit specific channels @@ -36,9 +92,10 @@ workflow YAML_INPUT { btk_un_diamond_database = Channel.of(inputs.btk.diamond_uniprot_database_path) btk_ncbi_taxonomy_path = Channel.of(inputs.btk.ncbi_taxonomy_path) btk_ncbi_lineage_path = Channel.of(inputs.btk.ncbi_rankedlineage_path) - btk_btk_yaml = Channel.of(inputs.btk.btk_yaml) + btk_yaml = Channel.of(inputs.btk.btk_yaml) btk_taxid = Channel.of([inputs.assembly_id], inputs.btk.taxid) btk_gca_accession = Channel.of(inputs.btk.gca_accession) + busco_lineages = Channel.of(inputs.btk.lineages) versions = ch_versions.ifEmpty(null) } diff --git a/workflows/ear.nf b/workflows/ear.nf index b35a949..ac0193e 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -6,6 +6,7 @@ include { NEXTFLOW_RUN as CURATIONPRETEXT } from '../modules/local/nextflow/run' include { NEXTFLOW_RUN as BLOBTOOLKIT } from '../modules/local/nextflow/run' +include { SANGER_TOL_BTK } from 
'../modules/local/sanger_tol_btk' include { YAML_INPUT } from '../subworkflows/local/yaml_input' include { GENERATE_SAMPLESHEET } from '../modules/local/generate_samplesheet' @@ -39,13 +40,15 @@ workflow EAR { // MODULE: YAML_INPUT // YAML_INPUT(ch_input) - reference = YAML_INPUT.out.reference - reference.view() // // MODULE: Run Sanger-ToL/CurationPretext // - This was built using: https://github.com/mahesh-panchal/nf-cascade // + reference = YAML_INPUT.out.reference_path.get() + hic_dir = YAML_INPUT.out.cpretext_hic_dir_raw.get() + longread_dir = YAML_INPUT.out.longread_dir.get() + CURATIONPRETEXT( "sanger-tol/curationpretext", [ @@ -53,23 +56,23 @@ workflow EAR { "--input", reference, "--longread", - YAML_INPUT.out.longread_dir, + longread_dir, "--cram", - YAML_INPUT.out.cpretext_hic_dir, - "$params.outdir/curationpretext", + hic_dir, "-profile singularity,sanger" - ].join(" ").trim(), // workflow opts + ].join(" ").trim(), // workflow opts Channel.value([]), //readWithDefault( params.demo.params_file, Channel.value([]) ), // params file Channel.value([]), // samplesheet - not used by this pipeline Channel.value([]) //readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config - + //"$params.outdir/curationpretext", ) // // MODULE: ASSEMBLY STATISTICS FOR THE FASTA // + GFASTATS( - YAML_INPUT.out.reference, + YAML_INPUT.out.reference_hap1, "fasta", [], [], @@ -79,38 +82,45 @@ workflow EAR { [] ) - // // - // // LOGIC: REFORMAT A BUNCH OF CHANNELS FOR MERQUERYFK - // // - // YAML_INPUT.out.reference - // .combine() - // .combine() - // .combine() - // .map{ meta, primary, haplotigs, fastk_hist, fastk_ktab -> - // tuple( meta, - // fastk_hist, - // fastk_ktab, - // primary, - // haplotigs - // ) - // } - // .set { merquryfk_input } - - // // - // // MODULE: MERQURYFK PLOTS OF GENOME - // // - - // MERQURYFK( - // merquryfk_input - // ) + // + // LOGIC: REFORMAT A BUNCH OF CHANNELS FOR MERQUERYFK + // + + if (params.reference_hap2) { + 
YAML_INPUT.out.reference_hap1 + .combine(YAML_INPUT.out.reference_hap2) + .combine(YAML_INPUT.out.fastk_hist) + .combine(YAML_INPUT.out.fastk_ktab) + .map{ meta, primary, haplotigs, fastk_hist, fastk_ktab -> + tuple( meta, + fastk_hist, + fastk_ktab, + primary, + haplotigs + ) + } + .set { merquryfk_input } + + // + // MODULE: MERQURYFK PLOTS OF GENOME + // + + MERQURYFK( + merquryfk_input + ) + } // - // LOGIC: SANGER-TOL/BLOBTOOLKIT expects the pacbio data to be already mapped + // LOGIC: SANGER-TOL/BLOBTOOLKIT expects the pacbio data to be already mapped -> this has been changed but seeing as BTK and genomenote need it then we may as well keep it. + // This is also a requirement for genomenote // platform = YAML_INPUT.out.longread_type YAML_INPUT.out.sample_id .combine(YAML_INPUT.out.longread_dir) + .map{ sample, dir -> + tuple([id: sample], dir ) + } .set {pacbio_tuple} if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { @@ -118,8 +128,8 @@ workflow EAR { // SUBWORKFLOW: SINGLE END MAPPING FOR ALIGNING LONGREAD DATA // SE_MAPPING ( - YAML_INPUT.out.reference, - pacbio_tuple, + YAML_INPUT.out.reference_hap1, + YAML_INPUT.out.pacbio_tuple, platform ) ch_versions = ch_versions.mix(SE_MAPPING.out.versions) @@ -133,8 +143,8 @@ workflow EAR { // SUBWORKFLOW: PAIRED END MAPPING FOR ALIGNING LONGREAD DATA // PE_MAPPING ( - YAML_INPUT.out.reference, - pacbio_tuple, + YAML_INPUT.out.reference_hap1, + YAML_INPUT.out.pacbio_tuple, platform ) ch_versions = ch_versions.mix(PE_MAPPING.out.versions) @@ -149,7 +159,7 @@ workflow EAR { // SAMTOOLS_SORT ( merged_bam, - YAML_INPUT.out.reference + YAML_INPUT.out.reference_hap1 ) ch_versions = ch_versions.mix( SAMTOOLS_SORT.out.versions ) @@ -174,35 +184,50 @@ workflow EAR { // MODULE: Run Sanger-ToL/BlobToolKit // - This was built using: https://github.com/mahesh-panchal/nf-cascade // + // BLOBTOOLKIT( // "sanger-tol/blobtoolkit", // [ - // "-r 0.4.0", + // "-r 0.5.0", // 
"--input", // GENERATE_SAMPLESHEET.out.csv, // "--fasta", // reference, - // "--accession", - // YAML_INPUT.out.btk_gca_accession, + // "--yaml", + // btk_yaml, // "-taxon", - // YAML_INPUT.out.btk_taxid, + // btk_taxon, // "--taxdump", - // YAML_INPUT.out.btk_ncbi_taxonomy_path, + // btk_taxdump, // "--blastp", - // YAML_INPUT.out.btk_nt_diamond_database, + // btk_blastp, // "--blastn", - // YAML_INPUT.out.btk_nt_database, + // btk_blastn, // "--blastx", - // YAML_INPUT.out.btk_nt_diamond_database, - // "$params.outdir/blobtoolkit", + // btk_uniprot, // "-profile singularity,sanger" // ].join(" ").trim(), // workflow opts // Channel.value([]),//readWithDefault( params.demo.params_file, Channel.value([]) ), // params file // Channel.value([]),//readWithDefault( params.demo.input, Channel.value([]) ), // samplesheet // Channel.value([])//readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config - // ) + SANGER_TOL_BTK ( + YAML_INPUT.out.reference_hap1, + samplesheet_input, + GENERATE_SAMPLESHEET.out.csv, + YAML_INPUT.out.btk_un_diamond_database, + YAML_INPUT.out.btk_nt_diamond_database, + YAML_INPUT.out.btk_un_diamond_database, + [], + YAML_INPUT.out.btk_ncbi_taxonomy_path, + YAML_INPUT.out.btk_yaml, + YAML_INPUT.out.busco_lineages, + YAML_INPUT.out.btk_taxid, + 'GCA_0001' + ) + ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + // // Collate and save software versions // @@ -224,6 +249,20 @@ workflow EAR { versions = ch_versions // channel: [ path(versions.yml) ] } + +process RenameDatabase { + tag "Rename DMND Database" + executor 'local' + + input: + db_path + + output: + path "UN.dmnd" + + "true" +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END From 18c3e15ac6a6fe9d7253a53d7a44482abc146539 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 15:34:52 +0100 Subject: [PATCH 07/52] Updates and additions --- conf/modules.config | 4 ++++ workflows/ear.nf | 4 ++++ 2 files 
changed, 8 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 388b183..405d1d6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,4 +26,8 @@ process { ext.args = "-P." } + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}_sorted"} + } + } diff --git a/workflows/ear.nf b/workflows/ear.nf index ac0193e..1df63e5 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -134,6 +134,8 @@ workflow EAR { ) ch_versions = ch_versions.mix(SE_MAPPING.out.versions) + SE_MAPPING.out.mapped_bam.view() + ch_align_bam .mix( SE_MAPPING.out.mapped_bam ) .set { merged_bam } @@ -154,6 +156,8 @@ workflow EAR { .set { merged_bam } } + merged_bam.view() + // // MODULE: SORT MAPPED BAM // From 24700823fc282b9518ef2fde5a80ad727a56cb62 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 15:35:24 +0100 Subject: [PATCH 08/52] Updates and additions --- workflows/ear.nf | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/workflows/ear.nf b/workflows/ear.nf index 1df63e5..f068a6e 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -216,21 +216,21 @@ workflow EAR { // Channel.value([])//readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config // ) - SANGER_TOL_BTK ( - YAML_INPUT.out.reference_hap1, - samplesheet_input, - GENERATE_SAMPLESHEET.out.csv, - YAML_INPUT.out.btk_un_diamond_database, - YAML_INPUT.out.btk_nt_diamond_database, - YAML_INPUT.out.btk_un_diamond_database, - [], - YAML_INPUT.out.btk_ncbi_taxonomy_path, - YAML_INPUT.out.btk_yaml, - YAML_INPUT.out.busco_lineages, - YAML_INPUT.out.btk_taxid, - 'GCA_0001' - ) - ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + SANGER_TOL_BTK ( + YAML_INPUT.out.reference_hap1, + samplesheet_input, + GENERATE_SAMPLESHEET.out.csv, + YAML_INPUT.out.btk_un_diamond_database, + YAML_INPUT.out.btk_nt_diamond_database, + YAML_INPUT.out.btk_un_diamond_database, + [], + YAML_INPUT.out.btk_ncbi_taxonomy_path, + 
YAML_INPUT.out.btk_yaml, + YAML_INPUT.out.busco_lineages, + YAML_INPUT.out.btk_taxid, + 'GCA_0001' + ) + ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) // // Collate and save software versions From 56760f8f38f412727309ec22d195feb43ea7678e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 13:16:51 +0100 Subject: [PATCH 09/52] Updates to complete skeleton of pipeline --- assets/test.yaml | 4 +-- conf/modules.config | 8 +++++ modules/local/sanger_tol_btk.nf | 60 +++++++++++++++----------------- subworkflows/local/yaml_input.nf | 6 ++-- workflows/ear.nf | 52 ++++++++------------------- 5 files changed, 56 insertions(+), 74 deletions(-) diff --git a/assets/test.yaml b/assets/test.yaml index f0a0fd5..d4da164 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,6 +1,6 @@ assembly_id: Oscheius_DF5033 -reference_hap1: /nfs/users/nfs_d/dp24/sanger-tol-ear/test.fa -reference_hap2: /nfs/users/nfs_d/dp24/sanger-tol-ear/test.fa +reference_hap1: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa +reference_hap2: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa longread: type: hifi dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/ diff --git a/conf/modules.config b/conf/modules.config index 405d1d6..a96a69f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -30,4 +30,12 @@ process { ext.prefix = { "${meta.id}_sorted"} } + withName: SANGER_TOL_BTK { + ext.args = "--blastx_outext 'txt'" + ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" + ext.profiles = "singularity,sanger" + ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" + ext.version = "draft_assemblies" + } + } diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 
fec146c..27e3ca0 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -17,34 +17,32 @@ process SANGER_TOL_BTK { val gca_accession output: - tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset - path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots - path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: summary_json - path("${meta.id}_btk_out/busco"), emit: busco_data - path("${meta.id}_btk_out/multiqc"), emit: multiqc_report - path("blobtoolkit_pipeline_info"), emit: pipeline_info - path "versions.yml", emit: versions + tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/REFERENCE"), emit: dataset + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots + path("${meta.id}_btk_out/blobtoolkit/REFERENCE/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/busco"), emit: busco_data + path("${meta.id}_btk_out/multiqc"), emit: multiqc_report + path("blobtoolkit_pipeline_info"), emit: pipeline_info + path "versions.yml", emit: versions script: - def prefix = task.ext.prefix ?: "${meta.id}" def args = task.ext.args ?: "" def executor = task.ext.executor ?: "" def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" def btk_config = btk_config_file ? "-c $btk_config_file" : "" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 // Seems to be an issue where a nested pipeline can't see the files in the same directory // Running realpath gets around this but the files copied into the folder are - // now just wasted space. + // now just wasted space. 
Should be fixed with using Mahesh's method of nesting but + // this is proving a bit complicated with BTK // outdir should be an arg - // --accession draft \\ - - // blastx and blastp use the same database hence the StageAs + // blastx and blastp can use the same database hence the StageAs """ @@ -52,9 +50,8 @@ process SANGER_TOL_BTK { -r $pipeline_version \\ -profile $profiles \\ --input "\$(realpath $samplesheet_csv)" \\ - --outdir ${prefix}_btk_out \\ - --fasta "\$(realpath REFERENCE.fa)" \\ - --yaml "\$(realpath BTK.yaml)" \\ + --outdir ${meta.id}_btk_out \\ + --fasta ./REFERENCE.fa \\ --busco_lineages $busco_lineages \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ @@ -64,7 +61,7 @@ process SANGER_TOL_BTK { $btk_config \\ $args' - mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + mv ${meta.id}_btk_out/pipeline_info blobtoolkit_pipeline_info cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -75,27 +72,26 @@ process SANGER_TOL_BTK { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" """ - mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession - touch ${prefix}_btk_out/blobtoolkit/$gca_accession/test.json.gz + mkdir -p ${meta.id}_btk_out/blobtoolkit/${meta.id}_out + touch ${meta.id}_btk_out/blobtoolkit/${meta.id}_out/test.json.gz - mkdir ${prefix}_btk_out/blobtoolkit/plots - touch ${prefix}_btk_out/blobtoolkit/plots/test.png + mkdir ${meta.id}_btk_out/blobtoolkit/plots + touch ${meta.id}_btk_out/blobtoolkit/plots/test.png - mkdir ${prefix}_btk_out/busco - touch ${prefix}_btk_out/busco/test.batch_summary.txt - touch ${prefix}_btk_out/busco/test.fasta.txt - touch ${prefix}_btk_out/busco/test.json + mkdir ${meta.id}_btk_out/busco + touch ${meta.id}_btk_out/busco/test.batch_summary.txt + touch ${meta.id}_btk_out/busco/test.fasta.txt + touch ${meta.id}_btk_out/busco/test.json - mkdir ${prefix}_btk_out/multiqc - mkdir 
${prefix}_btk_out/multiqc/multiqc_data - mkdir ${prefix}_btk_out/multiqc/multiqc_plots - touch ${prefix}_btk_out/multiqc/multiqc_report.html + mkdir ${meta.id}_btk_out/multiqc + mkdir ${meta.id}_btk_out/multiqc/multiqc_data + mkdir ${meta.id}_btk_out/multiqc/multiqc_plots + touch ${meta.id}_btk_out/multiqc/multiqc_report.html - mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + mv ${meta.id}_btk_out/pipeline_info blobtoolkit_pipeline_info cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 51350dd..687c5db 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -86,14 +86,14 @@ workflow YAML_INPUT { // // LOGIC: Building BlobToolKit specific channels // - btk_nt_database = Channel.of([inputs.assembly_id], inputs.btk.nt_database) + btk_nt_database = Channel.of(inputs.btk.nt_database) btk_nt_database_prefix = Channel.of(inputs.btk.nt_database_prefix) - btk_nt_diamond_database = Channel.of(inputs.btk.diamond_nt_database_path) + btk_nt_diamond_database = Channel.of(inputs.btk.diamond_nr_database_path) btk_un_diamond_database = Channel.of(inputs.btk.diamond_uniprot_database_path) btk_ncbi_taxonomy_path = Channel.of(inputs.btk.ncbi_taxonomy_path) btk_ncbi_lineage_path = Channel.of(inputs.btk.ncbi_rankedlineage_path) btk_yaml = Channel.of(inputs.btk.btk_yaml) - btk_taxid = Channel.of([inputs.assembly_id], inputs.btk.taxid) + btk_taxid = Channel.of(inputs.btk.taxid) btk_gca_accession = Channel.of(inputs.btk.gca_accession) busco_lineages = Channel.of(inputs.btk.lineages) diff --git a/workflows/ear.nf b/workflows/ear.nf index f068a6e..9f90920 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -134,8 +134,6 @@ workflow EAR { ) ch_versions = ch_versions.mix(SE_MAPPING.out.versions) - SE_MAPPING.out.mapped_bam.view() - ch_align_bam .mix( SE_MAPPING.out.mapped_bam ) .set { merged_bam } @@ -156,8 +154,6 @@ workflow EAR { .set { merged_bam } } - 
merged_bam.view() - // // MODULE: SORT MAPPED BAM // @@ -172,56 +168,38 @@ workflow EAR { // YAML_INPUT.out.sample_id .combine(merged_bam) - .map{ sample_id, pacbio_path -> + .map{ sample_id, pacbio_meta, pacbio_path -> tuple( [id: sample_id], pacbio_path ) } - .set { samplesheet_input } + .set { mapped_bam } GENERATE_SAMPLESHEET( - samplesheet_input + mapped_bam ) + ch_versions = ch_versions.mix( GENERATE_SAMPLESHEET.out.versions ) // // MODULE: Run Sanger-ToL/BlobToolKit - // - This was built using: https://github.com/mahesh-panchal/nf-cascade // - - // BLOBTOOLKIT( - // "sanger-tol/blobtoolkit", - // [ - // "-r 0.5.0", - // "--input", - // GENERATE_SAMPLESHEET.out.csv, - // "--fasta", - // reference, - // "--yaml", - // btk_yaml, - // "-taxon", - // btk_taxon, - // "--taxdump", - // btk_taxdump, - // "--blastp", - // btk_blastp, - // "--blastn", - // btk_blastn, - // "--blastx", - // btk_uniprot, - // "-profile singularity,sanger" - // ].join(" ").trim(), // workflow opts - // Channel.value([]),//readWithDefault( params.demo.params_file, Channel.value([]) ), // params file - // Channel.value([]),//readWithDefault( params.demo.input, Channel.value([]) ), // samplesheet - // Channel.value([])//readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config - // ) + YAML_INPUT.out.reference_hap1.view{ it -> "Reference: $it"} + mapped_bam.view{ it -> "samplesheet: $it"} + GENERATE_SAMPLESHEET.out.csv.view{ it -> "samplesheetcsv: $it"} + YAML_INPUT.out.btk_un_diamond_database.view{ it -> "un diamond: $it"} + YAML_INPUT.out.btk_nt_database.view{ it -> "nt diamond: $it"} + YAML_INPUT.out.btk_ncbi_taxonomy_path.view{ it -> "Taxdump: $it"} + YAML_INPUT.out.btk_yaml.view{ it -> "btk_yaml: $it"} + YAML_INPUT.out.busco_lineages.view{ it -> "lineages: $it"} + YAML_INPUT.out.btk_taxid.view{ it -> "TAXID: $it"} SANGER_TOL_BTK ( YAML_INPUT.out.reference_hap1, - samplesheet_input, + mapped_bam, GENERATE_SAMPLESHEET.out.csv, YAML_INPUT.out.btk_un_diamond_database, 
- YAML_INPUT.out.btk_nt_diamond_database, + YAML_INPUT.out.btk_nt_database, YAML_INPUT.out.btk_un_diamond_database, [], YAML_INPUT.out.btk_ncbi_taxonomy_path, From e24bba1ecd3e549554b62a30da5e0f0a261ff9df Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 19 Aug 2024 11:02:31 +0100 Subject: [PATCH 10/52] Updating nesting and fixing stuff --- assets/idCulLati1.yaml | 26 +++ assets/real_pdf.yaml | 45 +++++ assets/template_pdf.yaml | 45 +++++ assets/test.yaml | 1 + conf/base.config | 4 + conf/modules.config | 22 ++- conf/sanger-tol-btk.config | 7 + modules.json | 47 +++-- modules/local/sanger_tol_btk.nf | 11 +- modules/local/sanger_tol_cpretext.nf | 50 ++++++ modules/nf-core/merquryfk/merquryfk/main.nf | 7 +- subworkflows/local/main_mapping.nf | 77 ++++++++ subworkflows/local/yaml_input.nf | 62 ++++--- workflows/ear.nf | 188 ++++++++------------ 14 files changed, 428 insertions(+), 164 deletions(-) create mode 100644 assets/idCulLati1.yaml create mode 100644 assets/real_pdf.yaml create mode 100644 assets/template_pdf.yaml create mode 100644 conf/sanger-tol-btk.config create mode 100644 modules/local/sanger_tol_cpretext.nf create mode 100644 subworkflows/local/main_mapping.nf diff --git a/assets/idCulLati1.yaml b/assets/idCulLati1.yaml new file mode 100644 index 0000000..85479be --- /dev/null +++ b/assets/idCulLati1.yaml @@ -0,0 +1,26 @@ +assembly_id: idCulLati1_ear +reference_hap1: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/primary.fa +reference_hap2: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/hap2.fa +mapped_bam: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/mapped_bam.bam +longread: + type: hifi + dir: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/fasta/ +curationpretext: + aligner: minimap2 + telomere_motif: TTAGG + hic_dir: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati2/hic-arima2/ +merquryfk: + fastk_hist: 
/lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/idCulLati1.k31.hist + fastk_ktab: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/ +btk: + nt_database: /data/blastdb/Supported/NT/current + nt_database_prefix: nt + diamond_uniprot_database_path: /lustre/scratch123/tol/resources/uniprot_reference_proteomes/latest/reference_proteomes.dmnd + diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd + ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump/ + ncbi_rankedlineage_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump/rankedlineage.dmp + btk_yaml: /nfs/users/nfs_d/dp24/sanger-tol-ear/assets/btk_draft.yaml + taxid: 1464561 + gca_accession: GCA_0001 + lineages: "insecta_odb10" + config: /nfs/treeoflife-01/teams/tola/users/dp24/ear/conf/sanger-tol-btk.config diff --git a/assets/real_pdf.yaml b/assets/real_pdf.yaml new file mode 100644 index 0000000..8f8d4a0 --- /dev/null +++ b/assets/real_pdf.yaml @@ -0,0 +1,45 @@ +# SAMPLE INFORMATION +ToLID: idCulLati1 +Species: Culex laticinctus +Sex: XX +Submitter: Michael Paulini +Affiliation: WSI +Tags: ERGA-BGE + +# SEQUENCING DATA +DATA: + - PacBio HiFi: 51x + - Arima v2: 152x + +# GENOME PROFILING DATA +PROFILING: + GenomeScope: + version: 2.0 + results_folder: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/ + +# ASSEMBLY DATA +ASSEMBLIES: + Pre-curation: + pipeline: [hifiasm_v0.19.8-r603|--primary, purge_dups_v1.2.5|-e, yahs_v1.2a.2|] + pri: + gfastats--nstar-report_txt: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.fa.gz.gfastats + busco_short_summary_txt: 
/lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.insecta_odb10.busco/short_summary.specific.insecta_odb10.out_scaffolds_final.insecta_odb10.busco.txt + merqury_folder: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk/ + + Curated: + pipeline: [hifiasm_v0.19.8-r603|--primary, purge_dups_v1.2.5|-e, yahs_v1.2a.2|, TreeVal_v1.1] + pri: + gfastats--nstar-report_txt: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1.primary.curated.fa.gfastats + busco_short_summary_txt: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1.primary.curated.insecta_odb10.busco/short_summary.specific.insecta_odb10.idCulLati1.1.primary.curated.insecta_odb10.busco.txt + merqury_folder: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1.primary.curated.ccs.merquryk/ + hic_FullMap_png: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1_normal_snapshots/idCulLati1.1_normal_FullMap.png + hic_FullMap_link: https://tolqc.cog.sanger.ac.uk/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1_normal.pretext + blobplot_cont_png: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1_primary_curated_btk_busco.blob.circle.png + +# CURATION NOTES +NOTES: + Obs_Haploid_num: 3 + Obs_Sex: XX + Interventions_per_Gb: 430 + Contamination_notes: "Total length of scaffolds removed: 989,717 (0.1 %)\nScaffolds removed: 1 (0.2 %)\nLargest scaffold removed: (989,717)\nFCS-GX contaminant species (number of scaffolds; total length of 
scaffolds):\nWolbachia endosymbiont (group B) of Melanostoma mellinum, a-proteobacteria (1; 989,717)" + Other_notes: "Chromosomes named by size" diff --git a/assets/template_pdf.yaml b/assets/template_pdf.yaml new file mode 100644 index 0000000..3779c19 --- /dev/null +++ b/assets/template_pdf.yaml @@ -0,0 +1,45 @@ +# SAMPLE INFORMATION +ToLID: +Species: +Sex: +Submitter: +Affiliation: WSI +Tags: ERGA-BGE + +# SEQUENCING DATA +DATA: + - PacBio HiFi: + - Arima v2: + +# GENOME PROFILING DATA +PROFILING: + GenomeScope: + version: 2.0 + results_folder: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/ + +# ASSEMBLY DATA +ASSEMBLIES: + Pre-curation: + pipeline: [hifiasm_v0.19.8-r603|--primary, purge_dups_v1.2.5|-e, yahs_v1.2a.2|] + pri: + gfastats--nstar-report_txt: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.fa.gz.gfastats + busco_short_summary_txt: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.insecta_odb10.busco/short_summary.specific.insecta_odb10.out_scaffolds_final.insecta_odb10.busco.txt + merqury_folder: /lustre/scratch123/tol/tolqc/data/erga-bge/insects/Culex_laticinctus/working/idCulLati1.hifiasm.20240430/scaffolding/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk/ + + Curated: + pipeline: [hifiasm_v0.19.8-r603|--primary, purge_dups_v1.2.5|-e, yahs_v1.2a.2|, TreeVal_v1.1] + pri: + gfastats--nstar-report_txt: idCulLati1.1.primary.curated.fa.gfastats + busco_short_summary_txt: short_summary.specific.insecta_odb10.idCulLati1.1.primary.curated.insecta_odb10.busco.txt + merqury_folder: + hic_FullMap_png: + hic_FullMap_link: https://tolqc.cog.sanger.ac.uk/erga-bge/insects/Culex_laticinctus/assembly/curated/idCulLati1.1/ear/idCulLati1.1_normal.pretext + blobplot_cont_png: 
idCulLati1.1_primary_curated_btk_busco.blob.circle.png + +# CURATION NOTES +NOTES: + Obs_Haploid_num: + Obs_Sex: + Interventions_per_Gb: + Contamination_notes: "Total length of scaffolds removed: 989,717 (0.1 %)\nScaffolds removed: 1 (0.2 %)\nLargest scaffold removed: (989,717)\nFCS-GX contaminant species (number of scaffolds; total length of scaffolds):\nWolbachia endosymbiont (group B) of Melanostoma mellinum, a-proteobacteria (1; 989,717)" + Other_notes: "Chromosomes named by size" diff --git a/assets/test.yaml b/assets/test.yaml index d4da164..6a5299a 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -4,6 +4,7 @@ reference_hap2: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/as longread: type: hifi dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/ +mapped_bam: idCulLati1/mapped_bam.bam curationpretext: aligner: minimap2 telomere_motif: TTAGG diff --git a/conf/base.config b/conf/base.config index 4136c84..e609a9e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -19,6 +19,10 @@ process { maxRetries = 1 maxErrors = '-1' + withName: "SANGER_TOL_CPRETEXT|SANGER_TOL_BTK" { + time = { check_max( 70.h * task.attempt, 'time' ) } + } + // Process-specific resource requirements // NOTE - Please try and re-use the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. diff --git a/conf/modules.config b/conf/modules.config index a96a69f..d31543e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,11 +12,13 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + withName: "GFASTATS|MERQURYFK_MERQURYFK|SANGER_TOL_BTK|SANGER_TOL_CPRETEXT|CURATION_PRETEXT" { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: GFASTATS { ext.args = '--nstar-report' @@ -31,11 +33,19 @@ process { } withName: SANGER_TOL_BTK { - ext.args = "--blastx_outext 'txt'" + ext.args = "" ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" ext.profiles = "singularity,sanger" ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" ext.version = "draft_assemblies" } + withName: SANGER_TOL_CPRETEXT { + ext.args = "" + ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" + ext.profiles = "singularity,sanger" + ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" + ext.version = "1.0.0" + } + } diff --git a/conf/sanger-tol-btk.config b/conf/sanger-tol-btk.config new file mode 100644 index 0000000..247dbbd --- /dev/null +++ b/conf/sanger-tol-btk.config @@ -0,0 +1,7 @@ +process { + withLabel:RUN_BLASTN:BLASTN_TAXON { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 10.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } +} \ No newline at end of file diff --git a/modules.json b/modules.json index 3b0db89..45499e0 100644 --- a/modules.json +++ b/modules.json @@ -8,42 +8,59 @@ "busco/busco": { "branch": "master", "git_sha": "17486961b8b1ab1aae258c83a7e947b40d8ab670", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gfastats": { "branch": "master", "git_sha": 
"3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "merquryfk/merquryfk": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ], + "patch": "modules/nf-core/merquryfk/merquryfk/merquryfk-merquryfk.diff" }, "minimap2/align": { "branch": "master", "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/merge": { "branch": "master", "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -52,20 +69,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 27e3ca0..4582179 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -9,7 +9,7 @@ process SANGER_TOL_BTK { path blastp, stageAs: "blastp.dmnd" path blastn path blastx - path btk_config_file + path config_file path tax_dump path btk_yaml, stageAs: "BTK.yaml" val 
busco_lineages @@ -19,7 +19,7 @@ process SANGER_TOL_BTK { output: tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/REFERENCE"), emit: dataset path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots - path("${meta.id}_btk_out/blobtoolkit/REFERENCE/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/blobtoolkit/REFERENCE/summary.json.gz"), emit: summary_json path("${meta.id}_btk_out/busco"), emit: busco_data path("${meta.id}_btk_out/multiqc"), emit: multiqc_report path("blobtoolkit_pipeline_info"), emit: pipeline_info @@ -30,7 +30,7 @@ process SANGER_TOL_BTK { def executor = task.ext.executor ?: "" def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" - def btk_config = btk_config_file ? "-c $btk_config_file" : "" + def config = config_file ? "-c $config_file" : "" def pipeline_version = task.ext.version ?: "draft_assemblies" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 @@ -58,8 +58,9 @@ process SANGER_TOL_BTK { --blastp "\$(realpath blastp.dmnd)" \\ --blastn "\$(realpath $blastn)" \\ --blastx "\$(realpath $blastx)" \\ - $btk_config \\ - $args' + $config \\ + $args \\ + -resume' mv ${meta.id}_btk_out/pipeline_info blobtoolkit_pipeline_info diff --git a/modules/local/sanger_tol_cpretext.nf b/modules/local/sanger_tol_cpretext.nf new file mode 100644 index 0000000..eec53ae --- /dev/null +++ b/modules/local/sanger_tol_cpretext.nf @@ -0,0 +1,50 @@ +process SANGER_TOL_CPRETEXT { + tag "$reference" + label 'process_low' + + input: + path(reference) + path(longread_dir) + path(cram_dir) + path(config_file) + + output: + tuple val(reference), path("*_out/*"), emit: dataset + path "versions.yml", emit: versions + + script: + def pipeline_name = "sanger-tol/curationpretext" // should be a task.ext.args + def (pipeline_prefix,pipeline_suffix) = pipeline_name.split('/') + def args = task.ext.args ?: "" + def executor = task.ext.executor ?: "" 
+ def profiles = task.ext.profiles ?: "" + def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + def config = config_file ? "-c $config_file" : "" + def pipeline_version = task.ext.version ?: "draft_assemblies" + + // Seems to be an issue where a nested pipeline can't see the files in the same directory + // Running realpath gets around this but the files copied into the folder are + // now just wasted space. Should be fixed with using Mahesh's method of nesting but + // this is proving a bit complicated with BTK + + // outdir should be an arg + """ + $executor 'nextflow run $pipeline_name \\ + -r $pipeline_version \\ + -profile $profiles \\ + --input "\$(realpath $reference)" \\ + --outdir ${reference}_${pipeline_suffix}_out \\ + --longread "\$(realpath $longread_dir)" \\ + --cram "\$(realpath $cram_dir)" \\ + $args \\ + $config \\ + -resume' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + $pipeline_suffix: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ +} diff --git a/modules/nf-core/merquryfk/merquryfk/main.nf b/modules/nf-core/merquryfk/merquryfk/main.nf index ac163da..f0e78cc 100644 --- a/modules/nf-core/merquryfk/merquryfk/main.nf +++ b/modules/nf-core/merquryfk/merquryfk/main.nf @@ -39,11 +39,16 @@ process MERQURYFK_MERQURYFK { prefix = task.ext.prefix ?: "${meta.id}" def FASTK_VERSION = 'f18a4e6d2207539f7b84461daebc54530a9559b0' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. def MERQURY_VERSION = '8ae344092df5dcaf83cfb7f90f662597a9b1fc61' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + // Passing in the link through FASTK works, however passing in through YAML_INPUT results in being unable to find file + // seems as though it is because it is in a folder rather directly in the folder merqury is running in. 
""" + cp ${fastk_ktab}/*ktab . && cp ${fastk_ktab}/.*ktab.* . + MerquryFK \\ $args \\ -T$task.cpus \\ - ${fastk_ktab.find{ it.toString().endsWith(".ktab") }} \\ + *.ktab \\ $assembly \\ $haplotigs \\ $prefix diff --git a/subworkflows/local/main_mapping.nf b/subworkflows/local/main_mapping.nf new file mode 100644 index 0000000..28c100f --- /dev/null +++ b/subworkflows/local/main_mapping.nf @@ -0,0 +1,77 @@ +include { SE_MAPPING } from './se_mapping' +include { PE_MAPPING } from './pe_mapping' + +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' + + +workflow MAIN_MAPPING { + + take: + sample_id // val(sample_id) + platform // val(data_type) + reference_hap1 // tuple val(meta), path(reference) + pacbio_tuple // tuple val(meta), path(longread_path) + + main: + ch_align_bam = Channel.empty() + ch_versions = Channel.empty() + + // + // LOGIC: SANGER-TOL/BLOBTOOLKIT expects the pacbio data to be already mapped -> this has been changed but seeing as BTK and genomenote need it then we may as well keep it. 
+ // This is also a requirement for genomenote + // + + if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { + // + // SUBWORKFLOW: SINGLE END MAPPING FOR ALIGNING LONGREAD DATA + // + SE_MAPPING ( + reference_hap1, + pacbio_tuple, + platform + ) + ch_versions = ch_versions.mix(SE_MAPPING.out.versions) + + ch_align_bam + .mix( SE_MAPPING.out.mapped_bam ) + .set { merged_bam } + } + else if ( platform.filter { it == "illumina" } ) { + // + // SUBWORKFLOW: PAIRED END MAPPING FOR ALIGNING LONGREAD DATA + // + PE_MAPPING ( + reference_hap1, + pacbio_tuple, + platform + ) + ch_versions = ch_versions.mix(PE_MAPPING.out.versions) + + ch_align_bam + .mix( PE_MAPPING.out.mapped_bam ) + .set { merged_bam } + } + + // + // MODULE: SORT MAPPED BAM + // + SAMTOOLS_SORT ( + merged_bam, + reference_hap1 + ) + ch_versions = ch_versions.mix( SAMTOOLS_SORT.out.versions ) + + sample_id + .combine(merged_bam) + .map{ sample_id, pacbio_meta, pacbio_path -> + tuple( [id: sample_id], + pacbio_path + ) + } + .set { mapped_bam } + + emit: + mapped_bam // channel: tuple val(meta), path(mapped_bam) + versions = ch_versions // channel: [ path(versions.yml) ] + +} \ No newline at end of file diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 687c5db..6561d27 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -4,7 +4,7 @@ import groovy.yaml.YamlSlurper workflow YAML_INPUT { take: - input_file // params.input + input_file // params.input main: ch_versions = Channel.empty() @@ -58,44 +58,62 @@ workflow YAML_INPUT { } .set {cpretext_hic_dir} + + if (params.mapped) { + bam_path = Channel.of(inputs.mapped_bam) + + sample_id + .combine(bam_path) + .map{ sample, dir -> + tuple([id: sample], + dir + ) + } + .set {mapped_bam} + } else { + mapped_bam = [[],[]] + } + emit: // // LOGIC: Building generic channels // sample_id - longread_type // val(data) - longread_dir = 
inputs.longread.dir // DataVariable - pacbio_tuple // tuple (meta), path(file) - reference_hap1 = reference_hap1 // tuple (meta), path(file) - reference_hap2 = reference_2 // DataVariable - reference_path = inputs.reference_hap1 // DataVariable + longread_type // val(data) + longread_dir = inputs.longread.dir // DataVariable + pacbio_tuple // tuple (meta), path(file) + reference_hap1 // tuple (meta), path(file) + reference_hap2 = reference_2 // DataVariable + reference_path = inputs.reference_hap1 // DataVariable + mapped_bam // // LOGIC: Building CurationPretext specific channels // cpretext_aligner cpretext_telomere_motif - cpretext_hic_dir_raw = inputs.curationpretext.hic_dir // DataVariable + cpretext_hic_dir_raw = inputs.curationpretext.hic_dir // DataVariable // // LOGIC: MERQURY CHANNELS // - fastk_hist = Channel.of(inputs.merquryfk.fastk_hist) - fastk_ktab = Channel.of(inputs.merquryfk.fastk_ktab) + fastk_hist = Channel.fromPath(inputs.merquryfk.fastk_hist) + fastk_ktab = Channel.fromPath(inputs.merquryfk.fastk_ktab, hidden: true) // // LOGIC: Building BlobToolKit specific channels // - btk_nt_database = Channel.of(inputs.btk.nt_database) - btk_nt_database_prefix = Channel.of(inputs.btk.nt_database_prefix) - btk_nt_diamond_database = Channel.of(inputs.btk.diamond_nr_database_path) - btk_un_diamond_database = Channel.of(inputs.btk.diamond_uniprot_database_path) - btk_ncbi_taxonomy_path = Channel.of(inputs.btk.ncbi_taxonomy_path) - btk_ncbi_lineage_path = Channel.of(inputs.btk.ncbi_rankedlineage_path) - btk_yaml = Channel.of(inputs.btk.btk_yaml) - btk_taxid = Channel.of(inputs.btk.taxid) - btk_gca_accession = Channel.of(inputs.btk.gca_accession) - busco_lineages = Channel.of(inputs.btk.lineages) - - versions = ch_versions.ifEmpty(null) + btk_nt_database = Channel.of(inputs.btk.nt_database) + btk_nt_database_prefix = Channel.of(inputs.btk.nt_database_prefix) + btk_nt_diamond_database = Channel.of(inputs.btk.diamond_nr_database_path) + btk_un_diamond_database = 
Channel.of(inputs.btk.diamond_uniprot_database_path) + btk_ncbi_taxonomy_path = Channel.of(inputs.btk.ncbi_taxonomy_path) + btk_ncbi_lineage_path = Channel.of(inputs.btk.ncbi_rankedlineage_path) + btk_yaml = Channel.of(inputs.btk.btk_yaml) + btk_taxid = Channel.of(inputs.btk.taxid) + btk_gca_accession = Channel.of(inputs.btk.gca_accession) + busco_lineages = Channel.of(inputs.btk.lineages) + btk_config = Channel.fromPath(inputs.btk.config) + + versions = ch_versions.ifEmpty(null) } diff --git a/workflows/ear.nf b/workflows/ear.nf index 9f90920..9062d84 100644 --- a/workflows/ear.nf +++ b/workflows/ear.nf @@ -4,16 +4,16 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { NEXTFLOW_RUN as CURATIONPRETEXT } from '../modules/local/nextflow/run' -include { NEXTFLOW_RUN as BLOBTOOLKIT } from '../modules/local/nextflow/run' +// include { NEXTFLOW_RUN as CURATIONPRETEXT } from '../modules/local/nextflow/run' +// include { NEXTFLOW_RUN as BLOBTOOLKIT } from '../modules/local/nextflow/run' include { SANGER_TOL_BTK } from '../modules/local/sanger_tol_btk' +include { SANGER_TOL_CPRETEXT } from '../modules/local/sanger_tol_cpretext' include { YAML_INPUT } from '../subworkflows/local/yaml_input' include { GENERATE_SAMPLESHEET } from '../modules/local/generate_samplesheet' include { GFASTATS } from '../modules/nf-core/gfastats/main' -include { PE_MAPPING } from '../subworkflows/local/pe_mapping' -include { SE_MAPPING } from '../subworkflows/local/se_mapping' -include { SAMTOOLS_SORT } from '../modules/nf-core/samtools/sort/main' +include { MAIN_MAPPING } from '../subworkflows/local/main_mapping' +include { MERQURYFK_MERQURYFK } from '../modules/nf-core/merquryfk/merquryfk/main' include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -32,7 +32,7 @@ workflow EAR { ch_input main: - + params.mapped = false ch_versions = Channel.empty() 
ch_align_bam = Channel.empty() @@ -45,32 +45,39 @@ workflow EAR { // MODULE: Run Sanger-ToL/CurationPretext // - This was built using: https://github.com/mahesh-panchal/nf-cascade // - reference = YAML_INPUT.out.reference_path.get() - hic_dir = YAML_INPUT.out.cpretext_hic_dir_raw.get() - longread_dir = YAML_INPUT.out.longread_dir.get() - - CURATIONPRETEXT( - "sanger-tol/curationpretext", - [ - "-r 1.0.0", - "--input", - reference, - "--longread", - longread_dir, - "--cram", - hic_dir, - "-profile singularity,sanger" - ].join(" ").trim(), // workflow opts - Channel.value([]), //readWithDefault( params.demo.params_file, Channel.value([]) ), // params file - Channel.value([]), // samplesheet - not used by this pipeline - Channel.value([]) //readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config - //"$params.outdir/curationpretext", + reference = YAML_INPUT.out.reference_path.get() + hic_dir = YAML_INPUT.out.cpretext_hic_dir_raw.get() + longread_dir = YAML_INPUT.out.longread_dir.get() + + // CURATIONPRETEXT( + // "sanger-tol/curationpretext", + // [ + // "-r 1.0.0", + // "--input", + // reference, + // "--longread", + // longread_dir, + // "--cram", + // hic_dir, + // "-profile singularity,sanger" + // ].join(" ").trim(), // workflow opts + // Channel.value([]), //readWithDefault( params.demo.params_file, Channel.value([]) ), // params file + // Channel.value([]), // samplesheet - not used by this pipeline + // Channel.value([]) //readWithDefault( params.demo.add_config, Channel.value([]) ), // custom config + // ) + + SANGER_TOL_CPRETEXT( + reference, + longread_dir, + hic_dir, + [] ) + ch_versions = ch_versions.mix( SANGER_TOL_CPRETEXT.out.versions ) + // // MODULE: ASSEMBLY STATISTICS FOR THE FASTA // - GFASTATS( YAML_INPUT.out.reference_hap1, "fasta", @@ -81,127 +88,70 @@ workflow EAR { [], [] ) + ch_versions = ch_versions.mix( GFASTATS.out.versions ) + // // LOGIC: REFORMAT A BUNCH OF CHANNELS FOR MERQUERYFK // - - if 
(params.reference_hap2) { - YAML_INPUT.out.reference_hap1 - .combine(YAML_INPUT.out.reference_hap2) - .combine(YAML_INPUT.out.fastk_hist) - .combine(YAML_INPUT.out.fastk_ktab) - .map{ meta, primary, haplotigs, fastk_hist, fastk_ktab -> - tuple( meta, - fastk_hist, - fastk_ktab, - primary, - haplotigs - ) - } - .set { merquryfk_input } - - // - // MODULE: MERQURYFK PLOTS OF GENOME - // - - MERQURYFK( - merquryfk_input - ) - } - + YAML_INPUT.out.reference_hap1 + .combine(YAML_INPUT.out.reference_hap2) + .combine(YAML_INPUT.out.fastk_hist) + .combine(YAML_INPUT.out.fastk_ktab) + .map{ meta, primary, haplotigs, fastk_hist, fastk_ktab -> + tuple( meta, + fastk_hist, + fastk_ktab, + primary, + haplotigs + ) + } + .set { merquryfk_input } // - // LOGIC: SANGER-TOL/BLOBTOOLKIT expects the pacbio data to be already mapped -> this has been changed but seeing as BTK and genomenote need it then we may as well keep it. - // This is also a requirement for genomenote + // MODULE: MERQURYFK PLOTS OF GENOME // - platform = YAML_INPUT.out.longread_type - - YAML_INPUT.out.sample_id - .combine(YAML_INPUT.out.longread_dir) - .map{ sample, dir -> - tuple([id: sample], dir ) - } - .set {pacbio_tuple} + MERQURYFK_MERQURYFK( + merquryfk_input + ) + ch_versions = ch_versions.mix( MERQURYFK_MERQURYFK.out.versions ) - if ( platform.filter { it == "hifi" } || platform.filter { it == "clr" } || platform.filter { it == "ont" } ) { - // - // SUBWORKFLOW: SINGLE END MAPPING FOR ALIGNING LONGREAD DATA - // - SE_MAPPING ( - YAML_INPUT.out.reference_hap1, - YAML_INPUT.out.pacbio_tuple, - platform - ) - ch_versions = ch_versions.mix(SE_MAPPING.out.versions) - ch_align_bam - .mix( SE_MAPPING.out.mapped_bam ) - .set { merged_bam } - } - else if ( platform.filter { it == "illumina" } ) { + ch_mapped_bam = YAML_INPUT.out.mapped_bam + if (!params.mapped) { // - // SUBWORKFLOW: PAIRED END MAPPING FOR ALIGNING LONGREAD DATA + // SUBWORKFLOW: MAIN_MAPPING CONTAINS ALL THE MAPPING LOGIC + // This allows us to 
more esily bypass the mapping if we already have a sorted and mapped bam // - PE_MAPPING ( + MAIN_MAPPING ( + YAML_INPUT.out.sample_id, + YAML_INPUT.out.longread_type, YAML_INPUT.out.reference_hap1, YAML_INPUT.out.pacbio_tuple, - platform ) - ch_versions = ch_versions.mix(PE_MAPPING.out.versions) - - ch_align_bam - .mix( PE_MAPPING.out.mapped_bam ) - .set { merged_bam } + ch_versions = ch_versions.mix( MAIN_MAPPING.out.versions ) + ch_mapped_bam = MAIN_MAPPING.out.mapped_bam } - // - // MODULE: SORT MAPPED BAM - // - SAMTOOLS_SORT ( - merged_bam, - YAML_INPUT.out.reference_hap1 - ) - ch_versions = ch_versions.mix( SAMTOOLS_SORT.out.versions ) - // // MODULE: GENERATE_SAMPLESHEET creates a csv for the blobtoolkit pipeline // - YAML_INPUT.out.sample_id - .combine(merged_bam) - .map{ sample_id, pacbio_meta, pacbio_path -> - tuple( [id: sample_id], - pacbio_path - ) - } - .set { mapped_bam } - GENERATE_SAMPLESHEET( - mapped_bam + ch_mapped_bam ) ch_versions = ch_versions.mix( GENERATE_SAMPLESHEET.out.versions ) // // MODULE: Run Sanger-ToL/BlobToolKit // - YAML_INPUT.out.reference_hap1.view{ it -> "Reference: $it"} - mapped_bam.view{ it -> "samplesheet: $it"} - GENERATE_SAMPLESHEET.out.csv.view{ it -> "samplesheetcsv: $it"} - YAML_INPUT.out.btk_un_diamond_database.view{ it -> "un diamond: $it"} - YAML_INPUT.out.btk_nt_database.view{ it -> "nt diamond: $it"} - YAML_INPUT.out.btk_ncbi_taxonomy_path.view{ it -> "Taxdump: $it"} - YAML_INPUT.out.btk_yaml.view{ it -> "btk_yaml: $it"} - YAML_INPUT.out.busco_lineages.view{ it -> "lineages: $it"} - YAML_INPUT.out.btk_taxid.view{ it -> "TAXID: $it"} - SANGER_TOL_BTK ( YAML_INPUT.out.reference_hap1, - mapped_bam, + ch_mapped_bam, GENERATE_SAMPLESHEET.out.csv, YAML_INPUT.out.btk_un_diamond_database, YAML_INPUT.out.btk_nt_database, YAML_INPUT.out.btk_un_diamond_database, - [], + YAML_INPUT.out.btk_config, YAML_INPUT.out.btk_ncbi_taxonomy_path, YAML_INPUT.out.btk_yaml, YAML_INPUT.out.busco_lineages, @@ -225,13 +175,15 @@ workflow 
EAR { workflow, parameters_schema: "nextflow_schema.json") ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - - emit: versions = ch_versions // channel: [ path(versions.yml) ] } +// +// MODULE: THERE ARE TWO DATABASES WHICH ARE FREQUENTLY THE SAME DATABASE +// THIS STOPS NAME CONFLICTS BEFORE THEY ARE COPIED TO THE SAME PLACE +// process RenameDatabase { tag "Rename DMND Database" executor 'local' From 66a5f0911d92a11f29d541befd8132c5d6301db9 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 19 Aug 2024 11:04:12 +0100 Subject: [PATCH 11/52] adding merqury_fk --- .../merquryfk/merquryfk-merquryfk.diff | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 modules/nf-core/merquryfk/merquryfk/merquryfk-merquryfk.diff diff --git a/modules/nf-core/merquryfk/merquryfk/merquryfk-merquryfk.diff b/modules/nf-core/merquryfk/merquryfk/merquryfk-merquryfk.diff new file mode 100644 index 0000000..751b30b --- /dev/null +++ b/modules/nf-core/merquryfk/merquryfk/merquryfk-merquryfk.diff @@ -0,0 +1,23 @@ +Changes in module 'nf-core/merquryfk/merquryfk' +--- modules/nf-core/merquryfk/merquryfk/main.nf ++++ modules/nf-core/merquryfk/merquryfk/main.nf +@@ -39,11 +39,16 @@ + prefix = task.ext.prefix ?: "${meta.id}" + def FASTK_VERSION = 'f18a4e6d2207539f7b84461daebc54530a9559b0' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + def MERQURY_VERSION = '8ae344092df5dcaf83cfb7f90f662597a9b1fc61' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. ++ ++ // Passing in the link through FASTK works, however passing in through YAML_INPUT results in being unable to find file ++ // seems as though it is because it is in a folder rather directly in the folder merqury is running in. + """ ++ cp ${fastk_ktab}/*ktab . && cp ${fastk_ktab}/.*ktab.* . 
++ + MerquryFK \\ + $args \\ + -T$task.cpus \\ +- ${fastk_ktab.find{ it.toString().endsWith(".ktab") }} \\ ++ *.ktab \\ + $assembly \\ + $haplotigs \\ + $prefix + +************************************************************ From 2fe82afa28e04889e2919b0ab2ba0b0043976edc Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 21 Aug 2024 13:21:40 +0100 Subject: [PATCH 12/52] Updating documentation --- CHANGELOG.md | 34 +++++++++++++++++--- README.md | 91 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 84 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3d90bc..3173f7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,15 +2,39 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +Naming based on: [Mythical creatures](https://en.wikipedia.org/wiki/List_of_legendary_creatures_by_type). -## v1.0dev - [date] +## v1.0.0 - Aquatic Bahamut [21/08/2024] Initial release of sanger-tol/ear, created with the [nf-core](https://nf-co.re/) template. +The current pipeline means the MVP for ear. -### `Added` +### Added +GFASTATS to generate statistics on the input primary genome. +MERQURY_FK to generate kmer graphs and analyses of the primary, haplotype and merged assembly. +BLOBTOOLKIT to generate busco files and blobtoolkit dataset/plots. +CURATIONPRETEXT to generate pretext plots and pngs. 
-### `Fixed` +### Parameters -### `Dependencies` +| Old parameter | New parameter | +| --------------- | ------------- | +| | --mapped | -### `Deprecated` +### Software dependencies + +| Dependency | Old version | New version | +| ----------- | ------------- | ------------- | +| sanger-tol/blobtoolkit* | | draft_assemblies | +| sanger-tol/curationpretext* | | 1.0.0 (UNSC Cradle) | +| GFASTATS | | 1.3.6--hdcf5f25_3 | +| MERQUERY_FK | | 1.2 | +| MINIMAP2_ALIGN | | 2.28 | +| SAMTOOLS_MERGE | | 1.20--h50ea8bc_0 | +| SAMTOOLS_SORT | | 1.20--h50ea8bc_0 | +| + +- Note: for pipelines, please check their own CHANGELOG file for a full list of software dependencies. + +### Dependencies +The pipeline depends on a number of databases which are noted in [README](README.md) and [USAGE](docs/usage.md). diff --git a/README.md b/README.md index 506512d..652eba6 100644 --- a/README.md +++ b/README.md @@ -10,51 +10,74 @@ ## Introduction -**sanger-tol/ear** is a bioinformatics pipeline that ... +**sanger-tol/ear** is a bioinformatics pipeline that generates the data files required for the the generation of ERGA Assembly Reports. Sanger-tol/ear nests two other sanger-tol pipelines (blobtoolkit and curationpretext). - - - - - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Read the input yaml file (YAML_INPUT) +2. Run GFASTATS (GFASTARS) +3. Run MERQURYFK_MERQURYFK (MERQURYFK) +4. Run MAIN_MAPPING, longread single-end/paired-end mapping +5. Run GENERATE_SAMPLESHEET, generate a csv file required for SANGER_TOL_BTK. +6. Run SANGER_TOL_BTK, also known as SANGER-TOL/BLOBTOOLKIT a subpipline for SANGER-TOL/EAR +7. Run SANGER_TOL_CPRETEXT, also known as SANGER-TOL/CURATIONPRETEXT a subpipeline for SANGER-TOL/EAR. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - Now, you can run the pipeline using: - - ```bash -nextflow run sanger-tol/ear \ - -profile \ - --input samplesheet.csv \ - --outdir +nextflow run sanger-tol/ear -profile \\ + --input assets/idCulLati1.yaml \\ + --mapped TRUE \\ # OPTIONAL + --outdir test-truth ``` > [!WARNING] @@ -65,10 +88,6 @@ nextflow run sanger-tol/ear \ sanger-tol/ear was originally written by DLBPointon. -We thank the following people for their extensive assistance in the development of this pipeline: - - - ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). From 03b074c04b4e8368ed3c935819ce62bc9cb7c66a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 21 Aug 2024 13:21:49 +0100 Subject: [PATCH 13/52] Updating documentation --- docs/output.md | 70 ++++++++++++------ docs/usage.md | 192 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 195 insertions(+), 67 deletions(-) diff --git a/docs/output.md b/docs/output.md index 335ec21..f5a9c8b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,54 +6,80 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
- - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [GFASTATS](#gfastats) - Collect statistics on the curated primary assembly +- [MERQURYFK](#merquryfk) - Generate kmer plots for the curated assembly using previous run information +- [SANGER_TOL_BTK](#sanger_tol_btk) - Run Blobtoolkit to generate plots and short_summary.txt from BUSCO. +- [SANGER_TOL_CPRETEXT](#sanger_tol_cpretext) - Run Curationpretext to generate Pretext files and accessory tracks. - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### GFASTATS + +
+Output files + +- `gfastats/` + - `*.assembly.summary`: Assembly metrics of the input primary file. + - `*_fasta.gz`: GZipped primary assembly file. + +
+ +[GFASTATS](https://github.com/vgl-hub/gfastats) is a single fast and exhaustive tool for summary statistics and simultaneous *fa* (fasta, fastq, gfa [.gz]) genome assembly file manipulation. + +### MERQURYFK
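Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `merquryfk/` + - `*.completeness.stats`: K-mer completeness statistics for each assembly and for the combined assembly. + - `*{"primary","haplotype",""}_only.bed`: Positions of k-mers found only in the assembly and not in the read set. + - `*{"primary","haplotype",""}.qv`: Consensus quality value (QV) estimates. + - `*.spectra-asm.{fl,ln,st}.png`: Assembly spectra plots (filled, line and stacked styles). + - `*{"primary","haplotype"}.spectra-cn.{fl,ln,st}.png`: Copy-number spectra plots (filled, line and stacked styles).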
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +[MERQURYFK](https://github.com/thegenemyers/MERQURY.FK) is a FastK-based version of Merqury. + +Merqury is a novel tool for reference-free assembly evaluation based on efficient k-mer set operations. By comparing k-mers in a de novo assembly to those found in unassembled high-accuracy reads, Merqury estimates base-level accuracy and completeness. + + +### SANGER_TOL_BTK -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +
+Output files + +- `sanger/*_blobtoolkit_out/` + - `blobtoolkit/plots/*png`: Blobtoolkit plots + - `blobtoolkit/{ASSEMBLY_NAME}/*.json.gz`: Blobtoolkit dataset for use in BTK_viewer. + - `busco/*_odb10/*.{tsv,tar.gz,json,txt}`: Busco output + - `multiqc/`: MultiQC plots/data and report.html. + - [`pipeline_info`](#pipeline-information) + +
+ -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +[SANGER_TOL_BTK](https://pipelines.tol.sanger.ac.uk/blobtoolkit) is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: -### MultiQC +### SANGER_TOL_CPRETEXT
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `sanger/*_curationpretext_out/` + - `accessory_files/*.{bigWig,bed,bedgraph}`: Track files describing Telomere, gap, coverage data across the genome. + - `pretext_maps_raw`: Pre-accessory file ingestion pretext files. + - `pretext_maps_processed`: Post-accessory file ingestion pretext files, e.g. the final output. + - [`pipeline_info`](#pipeline-information)
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +[SANGER_TOL_CPRETEXT](https://pipelines.tol.sanger.ac.uk/curationpretext) is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://pipelines.tol.sanger.ac.uk/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index 42521d3..b703d3e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,60 +6,179 @@ -## Samplesheet input +## Yaml input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a yaml with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. ```bash --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample +The structure of this file should be as follows: -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. 
Below is an example for the same sample sequenced across 3 lanes: +```yaml +# General Values for all subpipelines and modules +assembly_id: +reference_hap1: +reference_hap2: +reference_haplotigs: + +# If a mapped bam already exists use the below + --mapped TRUE on the nextflow command else ignore it and the pipeline will create it. +mapped_bam: + +merquryfk: + fastk_hist: + fastk_ktab: