From e5b60b54c00eff45ff2ddd26b3bff7c503ab4a2b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 20 Sep 2024 12:44:02 +0100 Subject: [PATCH 1/3] Adding MINIMAP2 resource fix --- conf/base.config | 54 +++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/conf/base.config b/conf/base.config index e609a9e..aa5a770 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,16 +11,22 @@ process { // TODO nf-core: Check the defaults for all processes - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 - maxErrors = '-1' + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' withName: "SANGER_TOL_CPRETEXT|SANGER_TOL_BTK" { - time = { check_max( 70.h * task.attempt, 'time' ) } + time = { check_max( 70.h * task.attempt, 'time' ) } + } + + withName: "MINIMAP2_ALIGN_SE" { + cpus = { check_max( 16 , 'cpus' ) } + memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 40 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * task.attempt ) , 'memory') } + time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48), 'time' ) } } // Process-specific resource requirements @@ -31,36 +37,36 @@ process { // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } } withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } + time = { check_max( 20.h * task.attempt, 'time' ) } } withLabel:process_high_memory { - memory = { check_max( 200.GB * task.attempt, 'memory' ) } + memory = { check_max( 200.GB * task.attempt, 'memory' ) } } withLabel:error_ignore { - errorStrategy = 'ignore' + errorStrategy = 'ignore' } withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 + errorStrategy = 'retry' + maxRetries = 2 } } From 870bf41e0c54e18949edd016d53cbf4dc31824c0 
Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 20 Sep 2024 12:44:36 +0100 Subject: [PATCH 2/3] Updating documentation --- CHANGELOG.md | 6 +- CITATIONS.md | 24 ++++++-- LICENSE | 4 +- README.md | 8 ++- assets/samplesheet.csv | 3 - assets/schema_input.json | 130 ++++++++++++++++++++++++++++++++++++--- 6 files changed, 154 insertions(+), 21 deletions(-) delete mode 100644 assets/samplesheet.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 9106bfd..9959669 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ The current pipeline means the MVP for ear. GFASTATS to generate statistics on the input primary genome. MERQURY_FK to generate kmer graphs and analyses of the primary, haplotype and merged assembly. +MAIN_MAPPING which is a small mapping subworkflow, that can work with single and paired reads. BLOBTOOLKIT to generate busco files and blobtoolkit dataset/plots. CURATIONPRETEXT to generate pretext plots and pngs. @@ -21,12 +22,13 @@ CURATIONPRETEXT to generate pretext plots and pngs. | Old parameter | New parameter | | ------------- | ------------- | | | --mapped | +| | --steps | ### Software dependencies | Dependency | Old version | New version | | ---------------------------- | ----------- | ------------------- | -| sanger-tol/blobtoolkit\* | | draft_assemblies | +| sanger-tol/blobtoolkit\* | | 0.6.0 (Bellsprout) | | sanger-tol/curationpretext\* | | 1.0.0 (UNSC Cradle) | | GFASTATS | | 1.3.6--hdcf5f25_3 | | MERQUERY_FK | | 1.2 | @@ -36,7 +38,7 @@ CURATIONPRETEXT to generate pretext plots and pngs. | -- Note: for pipelines, please check their own CHANGELOG file for a full list of software dependencies. +\* for pipelines, please check their own CHANGELOG file for a full list of software dependencies. 
### Dependencies diff --git a/CITATIONS.md b/CITATIONS.md index c0cf948..28e3ca8 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,13 +10,29 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [GFastar/GFastats](https://www.biorxiv.org/content/10.1101/2022.03.24.485682v1) - > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + > Formenti, G., Abueg, L., Brajuka, N., Gallardo, C., Giani, A., Fedrigo, O., Jarvis, ED. (2022). Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs. bioRxiv. doi: https://doi.org/10.1101/2022.03.24.485682 -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +- [Merqury_FK](https://github.com/thegenemyers/MERQURY.FK) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Myers, G., Rhie, A. (2024). MerquryFK & KatFK. [online]. https://github.com/thegenemyers/MERQURY.FK. (Accessed on 20 September 2024). + +- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/) + + > Li, H. 2021. ‘New strategies to improve MINIMAP2 alignment accuracy’, Bioinformatics, 37(23), pp. 4572–4574. doi:10.1093/bioinformatics/btab705. + +- [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/) + + > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. + +- [sanger-tol/blobtoolkit](https://zenodo.org/records/13758882) + + > Muffato, M., Butt, Z., Challis, R., Kumar, S., Qi, G., Ramos Díaz, A., Surana, P., & Yates, B. (2024). sanger-tol/blobtoolkit: v0.6.0 – Bellsprout (0.6.0). Zenodo. 
https://doi.org/10.5281/zenodo.13758882 + +- [sanger-tol/curationpretext](https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0) + + > Pointon, DLB. (2024). sanger-tol/curationpretext: v1.0.0 (UNSC Cradle). [online]. https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0. (Accessed on 20 September 2024). ## Software packaging/containerisation tools diff --git a/LICENSE b/LICENSE index 967fdcd..138ff19 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) DLBPointon +Copyright (c) 2022 - 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md index 5b61fe6..b8e17ab 100644 --- a/README.md +++ b/README.md @@ -59,8 +59,9 @@ curationpretext: hic_dir: btk: taxid: 1464561 - lineages: - gca_accession: GCA_0001 + lineages: < CSV LIST OF DATABASES TO USE: "insecta_odb10,diptera_odb10"> + gca_accession: GCA_0001 + nt_database: nt_database_prefix: diamond_uniprot_database_path: @@ -76,7 +77,8 @@ Now, you can run the pipeline using: nextflow run sanger-tol/ear -profile \\ --input assets/idCulLati1.yaml \\ --mapped TRUE \\ # OPTIONAL - --outdir test-truth + --steps ["", "btk", "cpretext", "merquryfk"] \\ # OPTIONAL CSV LIST OF STEPS TO EXCLUDE FROM EXECUTION + --outdir test ``` > [!WARNING] diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab..0000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index 8012bf6..61d2b74 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,21 +13,137 @@ "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "reference_hap1": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.f(ast)?a$", + "errorMessage": "Primary assembly input file, decompressed" }, - "fastq_2": { + "reference_hap2": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": 
"^\\S+\\.f(ast)?a$", + "errorMessage": "Haplotype assembly input file, decompressed" + }, + "reference_haplotigs": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?a$", + "errorMessage": "Haplotigs removed from Primary Assembly input file during curation, decompressed" + }, + "mapped_bam": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.bam$", + "errorMessage": "Optional mapped bam file used to skip mapping of pacbio files" + }, + "merquryfk": { + "type": "object", + "properties": { + "fastk_hist": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.hist$", + "errorMessage": "Path to hist file" + }, + "fastk_ktab": { + "type": "string", + "errorMessage": "Directory containing ktab files" + } + } + }, + "longread": { + "type": "object", + "properties": { + "dir": { + "type": "string", + "errorMessage": "Path to folder containing fasta.gz files" + }, + "type": { + "type": "string", + "errorMessage": "type of longread data" + } + } + }, + "curationpretext": { + "type": "object", + "properties": { + "aligner": { + "type": "string", + "errorMessage": "Aligner" + }, + "telomere_motif": { + "type": "string", + "errorMessage": "Telomere motif for telomere search" + }, + "hic_dir": { + "type": "string", + "errorMessage": "Directory of the cram data" + } + } + }, + "btk": { + "type": "object", + "properties": { + "taxid": { + "type": "string", + "errorMessage": "NCBI Taxid of organism" + }, + "lineages": { + "type": "string", + "errorMessage": "CSV list of BUSCO lineages to run against" + }, + "gca_accession": { + "type": "string", + "errorMessage": "gca_accession if applicable" + }, + "nt_database": { + "type": "string", + "errorMessage": "nt database directory" + }, + "nt_database_prefix": { + "type": "string", + "errorMessage": "Prefix for nt database" + }, + "diamond_uniprot_database_path": { + "type": "string", + "format": "file-path", + "exists": true, + 
"pattern": "^\\S+\\.dmnd$", + "errorMessage": "Diamond protein database" + }, + "diamond_nr_database_path": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.dmnd$", + "errorMessage": "Nuclear diamond database" + }, + "ncbi_taxonomy_path": { + "type": "string", + "errorMessage": "Directory for tax2taxid" + }, + "ncbi_rankedlineage_path": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.dmp$", + "errorMessage": "Taxonomy dump" + }, + "config": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.config$", + "errorMessage": "Extra configuration file for Blobtoolkit pipeline" + } + } } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "reference_hap1", "reference_hap2"] } } From 93d17c240a0ab2dd1bf1ff35c2359b9a74d87068 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 20 Sep 2024 12:53:03 +0100 Subject: [PATCH 3/3] Fix LICENSE lint --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index d9fe12b..407734e 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -10,6 +10,7 @@ lint: - .github/workflows/awsfulltest.yml - conf/igenomes.config files_unchanged: + - LICENSE - CODE_OF_CONDUCT.md - assets/nf-core-ear_logo_light.png - docs/images/nf-core-ear_logo_light.png