Merge pull request #7 from sanger-tol/dp24_bug_fix
Dp24 bug fix
DLBPointon authored Sep 20, 2024
2 parents 74b0025 + 93d17c2 commit 539666d
Showing 8 changed files with 185 additions and 45 deletions.
1 change: 1 addition & 0 deletions .nf-core.yml
@@ -10,6 +10,7 @@ lint:
- .github/workflows/awsfulltest.yml
- conf/igenomes.config
files_unchanged:
- LICENSE
- CODE_OF_CONDUCT.md
- assets/nf-core-ear_logo_light.png
- docs/images/nf-core-ear_logo_light.png
6 changes: 4 additions & 2 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ The current pipeline means the MVP for ear.

GFASTATS to generate statistics on the input primary genome.
MERQURY_FK to generate kmer graphs and analyses of the primary, haplotype and merged assembly.
MAIN_MAPPING, a small mapping subworkflow that can work with single and paired reads.
BLOBTOOLKIT to generate busco files and blobtoolkit dataset/plots.
CURATIONPRETEXT to generate pretext plots and pngs.

@@ -21,12 +22,13 @@ CURATIONPRETEXT to generate pretext plots and pngs.
| Old parameter | New parameter |
| ------------- | ------------- |
| | --mapped |
| | --steps |

### Software dependencies

| Dependency | Old version | New version |
| ---------------------------- | ----------- | ------------------- |
| sanger-tol/blobtoolkit\* | | draft_assemblies |
| sanger-tol/blobtoolkit\* | | 0.6.0 (Bellsprout) |
| sanger-tol/curationpretext\* | | 1.0.0 (UNSC Cradle) |
| GFASTATS | | 1.3.6--hdcf5f25_3 |
| MERQUERY_FK | | 1.2 |
@@ -36,7 +38,7 @@ CURATIONPRETEXT to generate pretext plots and pngs.

|

- Note: for pipelines, please check their own CHANGELOG file for a full list of software dependencies.
\* for pipelines, please check their own CHANGELOG file for a full list of software dependencies.

### Dependencies

24 changes: 20 additions & 4 deletions CITATIONS.md
@@ -10,13 +10,29 @@
## Pipeline tools

- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
- [GFastar/GFastats](https://www.biorxiv.org/content/10.1101/2022.03.24.485682v1)

> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online].
> Formenti, G., Abueg, L., Brajuka, N., Gallardo, C., Giani, A., Fedrigo, O., Jarvis, ED. (2022). Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs. bioRxiv. doi: https://doi.org/10.1101/2022.03.24.485682
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
- [Merqury_FK](https://github.com/thegenemyers/MERQURY.FK)

> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
> Myers, G., Rhie, A. (2024). MerquryFK & KatFK. [online]. https://github.com/thegenemyers/MERQURY.FK. (Accessed on 20 September 2024).
- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)

> Li, H. 2021. ‘New strategies to improve MINIMAP2 alignment accuracy’, Bioinformatics, 37(23), pp. 4572–4574. doi:10.1093/bioinformatics/btab705.
- [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/)

> Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819.
- [sanger-tol/blobtoolkit](https://zenodo.org/records/13758882)

> Muffato, M., Butt, Z., Challis, R., Kumar, S., Qi, G., Ramos Díaz, A., Surana, P., & Yates, B. (2024). sanger-tol/blobtoolkit: v0.6.0 – Bellsprout (0.6.0). Zenodo. https://doi.org/10.5281/zenodo.13758882
- [sanger-tol/curationpretext](https://zenodo.org/records/13758882)

> Pointon, DLB. (2024). sanger-tol/curationpretext: v1.0.0 (UNSC Cradle). [online]. https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0. (Accessed on 20 September 2024).
## Software packaging/containerisation tools

4 changes: 2 additions & 2 deletions LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) DLBPointon
Copyright (c) 2022 - 2023 Genome Research Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.
8 changes: 5 additions & 3 deletions README.md
@@ -59,8 +59,9 @@ curationpretext:
hic_dir: <DIRECTORY OF HIC READ FILES .CRAM AND .CRAI>
btk:
taxid: 1464561
lineages: <CSV LIST OF DATABASES TO USE: "insecta_odb10,diptera_odb10">
gca_accession: GCA_0001 <DEFAULT, DO NOT CHANGE UNLESS YOU HAVE A GCA_ACCESSION FOR YOUR SPECIES>
lineages: < CSV LIST OF DATABASES TO USE: "insecta_odb10,diptera_odb10">
gca_accession: GCA_0001 <DEFAULT, DO NOT CHANGE UNLESS YOU HAVE A GCA_ACCESSION FOR YOUR SPECIES >

nt_database: <DIRECTORY CONTAINING BLAST DB>
nt_database_prefix: <BLASTDB PREFIX>
diamond_uniprot_database_path: <PATH TO reference_proteomes.dmnd FROM UNIPROT>
@@ -76,7 +77,8 @@ Now, you can run the pipeline using:
nextflow run sanger-tol/ear -profile <singularity,docker> \\
--input assets/idCulLati1.yaml \\
--mapped TRUE \\ # OPTIONAL
--outdir test-truth
--steps ["", "btk", "cpretext", "merquryfk"] # OPTIONAL CSV LIST OF STEPS TO EXCLUDE FROM EXECUTION
--outdir test
```
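
For example, a sketch of excluding the BlobToolKit and CurationPretext subworkflows, assuming `--steps` accepts a plain comma-separated list of the step names shown above:

```bash
# Hypothetical invocation: skip the btk and cpretext steps only.
nextflow run sanger-tol/ear -profile singularity \
    --input assets/idCulLati1.yaml \
    --steps btk,cpretext \
    --outdir test
```

Leaving `--steps` unset should run every subworkflow.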

> [!WARNING]
3 changes: 0 additions & 3 deletions assets/samplesheet.csv

This file was deleted.

130 changes: 123 additions & 7 deletions assets/schema_input.json
@@ -13,21 +13,137 @@
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
},
"fastq_1": {
"reference_hap1": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
"pattern": "^\\S+\\.f[ast]a$",
"errorMessage": "Primary assembly input file, decompressed"
},
"fastq_2": {
"reference_hap2": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
"pattern": "^\\S+\\.f[ast]a$",
"errorMessage": "Haplotype assembly input file, decompressed"
},
"reference_haplotigs": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.f[ast]a$",
"errorMessage": "Haplotigs removed from Primary Assembly input file during curation, decompressed"
},
"mapped_bam": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.bam$",
"errorMessage": "Optional mapped bam file used to skip mapping of pacbio files"
},
"merquryfk": {
"type": "object",
"properties": {
"fastk_hist": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.hist$",
"errorMessage": "Path to hist file"
},
"fastk_ktab": {
"type": "string",
"errorMessage": "Directory containing ktab files"
}
}
},
"longread": {
"type": "object",
"properties": {
"dir": {
"type": "string",
"errorMessage": "Path to folder containing fasta.gz files"
},
"type": {
"type": "string",
"errorMessage": "type of longread data"
}
}
},
"curationpretext": {
"type": "object",
"properties": {
"aligner": {
"type": "string",
"errorMessage": "Aligner"
},
"telomere_motif": {
"type": "string",
"errorMessage": "Telomere motif for telomere search"
},
"hic_dir": {
"type": "string",
"errorMessage": "Directory of the cram data"
}
}
},
"btk": {
"type": "object",
"properties": {
"taxid": {
"type": "string",
"errorMessage": "NCBI Taxid of organism"
},
"lineages": {
"type": "string",
"errorMessage": "CSV list of BUSCO lineages to run against"
},
"gca_accession": {
"type": "string",
"errorMessage": "gca_accession if applicable"
},
"nt_database": {
"type": "string",
"errorMessage": "nt database directory"
},
"nt_database_prefix": {
"type": "string",
"errorMessage": "Prefix for nt database"
},
"diamond_uniprot_database_path": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.dmnd$",
"errorMessage": "Diamond protein database"
},
"diamond_nr_database_path": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.dmnd$",
"errorMessage": "Nuclear diamond database"
},
"ncbi_taxonomy_path": {
"type": "string",
"errorMessage": "Directory for tax2taxid"
},
"ncbi_rankedlineage_path": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.dmp$",
"errorMessage": "Taxonomy dump"
},
"config": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.config$",
"errorMessage": "Extra configuration file for Blobtoolkit pipeline"
}
}
}
},
"required": ["sample", "fastq_1"]
"required": ["sample", "reference_hap1", "reference_hap2"]
}
}
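
Taken together, the updated schema corresponds to a per-sample input YAML along these lines — a minimal sketch in which every path and value is an illustrative placeholder (only `sample`, `reference_hap1` and `reference_hap2` are required):

```yaml
# Minimal sketch against the updated assets/schema_input.json.
# All paths and values are hypothetical placeholders.
sample: idCulLati1
reference_hap1: /path/to/primary.fa
reference_hap2: /path/to/haplotype.fa
reference_haplotigs: /path/to/haplotigs.fa   # optional
mapped_bam: /path/to/mapped.bam              # optional, skips mapping of the PacBio data
merquryfk:
  fastk_hist: /path/to/sample.hist
  fastk_ktab: /path/to/ktab_dir/
longread:
  dir: /path/to/pacbio_fasta_gz/
  type: hifi                                 # assumption: long-read data type label
curationpretext:
  aligner: minimap2                          # placeholder aligner name
  telomere_motif: TTAGG                      # placeholder motif
  hic_dir: /path/to/hic_cram/
btk:
  taxid: 1464561
  lineages: "insecta_odb10,diptera_odb10"
  gca_accession: GCA_0001
  nt_database: /path/to/nt_db/
  nt_database_prefix: nt
  diamond_uniprot_database_path: /path/to/reference_proteomes.dmnd
  diamond_nr_database_path: /path/to/nr.dmnd
  ncbi_taxonomy_path: /path/to/taxdump/
  ncbi_rankedlineage_path: /path/to/rankedlineage.dmp
  config: /path/to/extra_btk.config          # optional extra BlobToolKit config
```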
54 changes: 30 additions & 24 deletions conf/base.config
@@ -11,16 +11,22 @@
process {

// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'
errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'

withName: "SANGER_TOL_CPRETEXT|SANGER_TOL_BTK" {
time = { check_max( 70.h * task.attempt, 'time' ) }
time = { check_max( 70.h * task.attempt, 'time' ) }
}

withName: "MINIMAP2_ALIGN_SE" {
cpus = { check_max( 16 , 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 40 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * task.attempt ) , 'memory') }
time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48), 'time' ) }
}

// Process-specific resource requirements
@@ -31,36 +37,36 @@ process {
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
}
withLabel:error_ignore {
errorStrategy = 'ignore'
errorStrategy = 'ignore'
}
withLabel:error_retry {
errorStrategy = 'retry'
maxRetries = 2
errorStrategy = 'retry'
maxRetries = 2
}
}
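
As a rough worked example of the new `MINIMAP2_ALIGN_SE` resource rules above (hypothetical reference sizes, first attempt):

```groovy
// reference.size() = 3e9 bytes (~3 GB), task.attempt = 1
//   cpus   = 16
//   memory = 1.GB * ( Math.ceil( (3e9 / 1e9) * 20 ) * 1 )   // = 60.GB
//   time   = 1.h  * 30                                      // 1e9 <= size < 10e9 -> 30.h
// reference.size() = 1.5e9 bytes (< 2e9), task.attempt = 1
//   memory = 1.GB * 40                                      // flat 40.GB
//   time   = 1.h  * 30                                      // size >= 1e9 -> 30.h
```

Note that, as written, `task.attempt` scales the memory only when the reference is 2 GB or larger; smaller references always request a flat 40 GB.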
