Merge pull request #11 from sanger-tol/main

Back merge
sanger-tol · Oct 8, 2024 · 0b4aaeb · 0b4aaeb
2 parents 37de128 + 20cb5a4
commit 0b4aaeb
Show file tree

Hide file tree

Showing 80 changed files with 4,691 additions and 971 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -10,6 +10,8 @@ on:
 
 env:
   NXF_ANSI_LOG: false
+  NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
+  NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity
 
 concurrency:
   group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
@@ -24,9 +26,32 @@ jobs:
     strategy:
       matrix:
         NXF_VER:
-          - "23.04.0"
+          - "24.04.0"
           - "latest-everything"
     steps:
+      - name: Get branch names
+        # Pulls the names of current branches in repo
+        # steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM, not the branch it targets
+        id: branch-names
+        uses: tj-actions/branch-names@v8
+
+      - name: Setup apptainer
+        uses: eWaterCycle/setup-apptainer@main
+
+      - name: Set up Singularity
+        run: |
+          mkdir -p $NXF_SINGULARITY_CACHEDIR
+          mkdir -p $NXF_SINGULARITY_LIBRARYDIR
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install nf-core
+        run: |
+          pip install nf-core
+
       - name: Check out pipeline code
         uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
 
@@ -35,12 +60,26 @@ jobs:
         with:
           version: "${{ matrix.NXF_VER }}"
 
-      - name: Disk space cleanup
-        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+      # This will only download the main pipeline containers, subpipelines need their own nf-download
+      - name: NF-Core Download - download singularity containers
+        run: |
+          nf-core download sanger-tol/ear --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-ear --container-cache-utilisation amend --container-system singularity
+
+      - name: Download Tiny test data
+        # Download a fungal test data set that is full enough to show some real output.
+        # Needs a kmer db for merqury
+        run: |
+          curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
+          cp TreeValTinyData/assembly/draft/grTriPseu1.fa TreeValTinyData/assembly/draft/grTriPseu1-hap.fa
+          cp TreeValTinyData/assembly/draft/grTriPseu1.fa TreeValTinyData/assembly/draft/grTriPseu1-all_hap.fa
+
+      # - name: Disk space cleanup
+      #   uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
 
       - name: Run pipeline with test data
         # TODO nf-core: You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters
         # Remember that you can parallelise this by using strategy.matrix
+        # Skip BTK and CPRETEXT as they are already tested on their repos; MERQURYFK is also skipped via --steps.
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --steps btk,cpretext,merquryfk
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -4,11 +4,13 @@ lint:
     - assets/nf-core-ear_logo_light.png
     - docs/images/nf-core-ear_logo_light.png
     - docs/images/nf-core-ear_logo_dark.png
+    - lib/nfcore_external_java_deps.jar
     - .github/ISSUE_TEMPLATE/config.yml
     - .github/workflows/awstest.yml
     - .github/workflows/awsfulltest.yml
     - conf/igenomes.config
   files_unchanged:
+    - LICENSE
     - CODE_OF_CONDUCT.md
     - assets/nf-core-ear_logo_light.png
     - docs/images/nf-core-ear_logo_light.png

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,15 +2,47 @@
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+Naming based on: [Mythical creatures](https://en.wikipedia.org/wiki/List_of_legendary_creatures_by_type).
 
-## v1.0dev - [date]
+## v0.6.1 - Robert Beiny H1 [08/10/2024]
+
+- Blobtoolkit version was specified in the wrong location, so it defaulted to a development branch, "draft_assemblies"; this has now been updated to v0.6.0.
+- Zenodo DOI has now been added to the repo.
+
+## v0.6.0 - Robert Beiny [20/09/2024]
 
 Initial release of sanger-tol/ear, created with the [nf-core](https://nf-co.re/) template.
+The current pipeline represents the MVP for ear.
+
+### Added
+
+- GFASTATS to generate statistics on the input primary genome.
+- MERQURY_FK to generate kmer graphs and analyses of the primary, haplotype and merged assembly.
+- MAIN_MAPPING, a small mapping subworkflow that can work with single and paired reads.
+- BLOBTOOLKIT to generate busco files and blobtoolkit dataset/plots.
+- CURATIONPRETEXT to generate pretext plots and pngs.
+
+### Parameters
+
+| Old parameter | New parameter |
+| ------------- | ------------- |
+|               | --mapped      |
+|               | --steps       |
+
+### Software dependencies
 
-### `Added`
+| Dependency                   | Old version | New version         |
+| ---------------------------- | ----------- | ------------------- |
+| sanger-tol/blobtoolkit\*     |             | 0.6.0 (Bellsprout)  |
+| sanger-tol/curationpretext\* |             | 1.0.0 (UNSC Cradle) |
+| GFASTATS                     |             | 1.3.6--hdcf5f25_3   |
+| MERQURY_FK                   |             | 1.2                 |
+| MINIMAP2_ALIGN               |             | 2.28                |
+| SAMTOOLS_MERGE               |             | 1.20--h50ea8bc_0    |
+| SAMTOOLS_SORT                |             | 1.20--h50ea8bc_0    |
 
-### `Fixed`
+\* for pipelines, please check their own CHANGELOG file for a full list of software dependencies.
 
-### `Dependencies`
+### Dependencies
 
-### `Deprecated`
+The pipeline depends on a number of databases which are noted in [README](README.md) and [USAGE](docs/usage.md).
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,13 +10,29 @@
 
 ## Pipeline tools
 
-- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- [GFastar/GFastats](https://www.biorxiv.org/content/10.1101/2022.03.24.485682v1)
 
-  > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online].
+  > Formenti, G., Abueg, L., Brajuka, N., Gallardo, C., Giani, A., Fedrigo, O., Jarvis, ED. (2022). Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs. bioRxiv. doi: https://doi.org/10.1101/2022.03.24.485682
 
-- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
+- [Merqury_FK](https://github.com/thegenemyers/MERQURY.FK)
 
-  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+  > Myers, G., Rhie, A. (2024). MerquryFK & KatFK. [online]. https://github.com/thegenemyers/MERQURY.FK. (Accessed on 20 September 2024).
+
+- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)
+
+  > Li, H. 2021. ‘New strategies to improve MINIMAP2 alignment accuracy’, Bioinformatics, 37(23), pp. 4572–4574. doi:10.1093/bioinformatics/btab705.
+
+- [Samtools](https://pubmed.ncbi.nlm.nih.gov/33590861/)
+
+  > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819.
+
+- [sanger-tol/blobtoolkit](https://zenodo.org/records/13758882)
+
+  > Muffato, M., Butt, Z., Challis, R., Kumar, S., Qi, G., Ramos Díaz, A., Surana, P., & Yates, B. (2024). sanger-tol/blobtoolkit: v0.6.0 – Bellsprout (0.6.0). Zenodo. https://doi.org/10.5281/zenodo.13758882
+
+- [sanger-tol/curationpretext](https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0)
+
+  > Pointon, DLB. (2024). sanger-tol/curationpretext: v1.0.0 (UNSC Cradle). [online]. https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0. (Accessed on 20 September 2024).
 
 ## Software packaging/containerisation tools
 

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) DLBPointon
+Copyright (c) 2022 - 2023 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1,60 +1,84 @@
 [![GitHub Actions CI Status](https://github.com/sanger-tol/ear/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/ear/actions/workflows/ci.yml)
-[![GitHub Actions Linting Status](https://github.com/sanger-tol/ear/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/ear/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+[![GitHub Actions Linting Status](https://github.com/sanger-tol/ear/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/ear/actions/workflows/linting.yml)[![DOI](https://zenodo.org/badge/833605808.svg)](https://doi.org/10.5281/zenodo.13819520)
 [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)
-
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.0-23aa62.svg)](https://www.nextflow.io/)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/ear)
 
 ## Introduction
 
-**sanger-tol/ear** is a bioinformatics pipeline that ...
-
-<!-- TODO nf-core:
-   Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
-   major pipeline sections and the types of output it produces. You're giving an overview to someone new
-   to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
--->
-
-<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
-     workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples.   -->
-<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
+**sanger-tol/ear** is a bioinformatics pipeline that generates the data files required for the generation of ERGA Assembly Reports. Sanger-tol/ear nests two other sanger-tol pipelines (blobtoolkit and curationpretext).
 
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+1. Read the input yaml file (YAML_INPUT)
+2. Run GFASTATS (GFASTATS)
+3. Run MERQURYFK_MERQURYFK (MERQURYFK)
+4. Run MAIN_MAPPING, longread single-end/paired-end mapping
+5. Run GENERATE_SAMPLESHEET, generate a csv file required for SANGER_TOL_BTK.
+6. Run SANGER_TOL_BTK, also known as SANGER-TOL/BLOBTOOLKIT, a subpipeline for SANGER-TOL/EAR.
+7. Run SANGER_TOL_CPRETEXT, also known as SANGER-TOL/CURATIONPRETEXT a subpipeline for SANGER-TOL/EAR.
 
 ## Usage
 
 > [!NOTE]
 > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
 
-<!-- TODO nf-core: Describe the minimum required steps to execute the pipeline, e.g. how to prepare samplesheets.
-     Explain what rows and columns represent. For instance (please edit as appropriate):
-
-First, prepare a samplesheet with your input data that looks as follows:
-
-`samplesheet.csv`:
-
-```csv
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
+The sanger-tol/ear pipeline requires a number of databases in place in order to run the blobtoolkit pipeline.
+These include:
+
+- A blast nt database
+- A Diamond blast uniprot database
+- A Diamond blast nr database
+- An NCBI taxdump
+- An NCBI rankedlineage.dmp
+
+Next, a yaml file containing the following should then be completed:
+
+```yaml
+# General values for all subpipelines and modules
+assembly_id: <NAME OF ASSEMBLY>
+reference_hap1: <LOCATION OF PRIMARY ASSEMBLY FILE .FA>
+reference_hap2: <LOCATION OF HAPLOTYPE ASSEMBLY FILE .FA>
+reference_haplotigs: <LOCATION OF THE HAPLOTIGS FILE, REMOVED DURING CURATION .FA>
+
+# If a mapped bam already exists use the below + --mapped TRUE on the nextflow command else ignore it and the pipeline will create it.
+mapped_bam: <MAPPED BAM .BAM>
+
+merquryfk:
+  fastk_hist: <THE PATH TO THE .HIST FILE>
+  fastk_ktab: <PATH TO THE DIRECTORY CONTAINING THE KTAB FILES, ENSURE THE HIDDEN FILES ARE HERE TOO>
+
+# Used by both subpipelines
+longread:
+  type: <hifi|clr|ont|illumina>
+  dir: <DIRECTORY OF LONGREAD FILES .FASTA.GZ>
+curationpretext:
+  aligner: <minimap2|BWAMEM>
+  telomere_motif: <TELOMERE MOTIF OF SAMPLE>
+  hic_dir: <DIRECTORY OF HIC READ FILES .CRAM AND .CRAI>
+btk:
+  taxid: 1464561
+  lineages: < CSV LIST OF DATABASES TO USE: "insecta_odb10,diptera_odb10">
+  gca_accession: GCA_0001 <DEFAULT, DO NOT CHANGE UNLESS YOU HAVE A GCA_ACCESSION FOR YOUR SPECIES >
+
+  nt_database: <DIRECTORY CONTAINING BLAST DB>
+  nt_database_prefix: <BLASTDB PREFIX>
+  diamond_uniprot_database_path: <PATH TO reference_proteomes.dmnd FROM UNIPROT>
+  diamond_nr_database_path: <PATH TO nr.dmnd>
+  ncbi_taxonomy_path: <DIRECTORY CONTAINING THE TAXDUMP>
+  ncbi_rankedlineage_path: <FOLDER CONTAINING THE rankedlineage.dmp FILE>
+  config: <PATH TO ear/conf/sanger-tol-btk.config TO OVERWRITE PROCESS LIMITS>
 ```
 
-Each row represents a fastq file (single-end) or a pair of fastq files (paired end).
-
--->
-
 Now, you can run the pipeline using:
 
-<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
-
 ```bash
-nextflow run sanger-tol/ear \
-   -profile <docker/singularity/.../institute> \
-   --input samplesheet.csv \
-   --outdir <OUTDIR>
+nextflow run sanger-tol/ear -profile <singularity,docker> \\
+   --input assets/idCulLati1.yaml \\
+   --mapped TRUE \\ # OPTIONAL
+   --steps ["", "btk", "cpretext", "merquryfk"] # OPTIONAL CSV LIST OF STEPS TO EXCLUDE FROM EXECUTION
+   --outdir test
 ```
 
 > [!WARNING]
@@ -65,10 +89,6 @@ nextflow run sanger-tol/ear \
 
 sanger-tol/ear was originally written by DLBPointon.
 
-We thank the following people for their extensive assistance in the development of this pipeline:
-
-<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
-
 ## Contributions and Support
 
 If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).

diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml
@@ -0,0 +1,17 @@
+assembly:
+  level: bar
+settings:
+  foo: 0
+similarity:
+  diamond_blastx:
+    foo: 0
+taxon:
+  class: class_name
+  family: family_name
+  genus: genus_name
+  kingdom: kingdom_name
+  name: species_name
+  order: order_name
+  phylum: phylum_name
+  superkingdom: superkingdom_name
+  taxid: 0
diff --git a/assets/idCulLati1.yaml b/assets/idCulLati1.yaml
@@ -0,0 +1,33 @@
+# General values for all subpipelines and modules
+assembly_id: idCulLati1_ear
+reference_hap1: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/primary.fa
+reference_hap2: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/hap2.fa
+reference_haplotigs: /nfs/treeoflife-01/teams/tola/users/dp24/ear/haplotigs.fa
+
+# If a mapped bam already exists use the below + --mapped TRUE on the nextflow command else ignore.
+mapped_bam: /nfs/treeoflife-01/teams/tola/users/dp24/ear/idCulLati1/mapped_bam.bam
+
+merquryfk:
+  fastk_hist: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/idCulLati1.k31.hist
+  fastk_ktab: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/kmer/k31/
+
+# Used by both subpipelines
+longread:
+  type: hifi
+  dir: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati1/pacbio/fasta/
+curationpretext:
+  aligner: minimap2
+  telomere_motif: TTAGG
+  hic_dir: /lustre/scratch122/tol/data/a/5/e/1/6/d/Culex_laticinctus/genomic_data/idCulLati2/hic-arima2/
+btk:
+  taxid: 1464561
+  lineages: "insecta_odb10"
+  gca_accession: GCA_0001
+  nt_database: /data/blastdb/Supported/NT/current
+  nt_database_prefix: nt
+  diamond_uniprot_database_path: /lustre/scratch123/tol/resources/uniprot_reference_proteomes/latest/reference_proteomes.dmnd
+  diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
+  ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump/
+  ncbi_rankedlineage_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump/rankedlineage.dmp
+  btk_yaml: /nfs/users/nfs_d/dp24/sanger-tol-ear/assets/btk_draft.yaml
+  config: /nfs/treeoflife-01/teams/tola/users/dp24/ear/conf/sanger-tol-btk.config