diff --git a/Program_Licenses.md b/Program_Licenses.md index e576e07da..a8c6d80db 100644 --- a/Program_Licenses.md +++ b/Program_Licenses.md @@ -17,6 +17,7 @@ The licenses of the open-source software that is contained in these Docker image | bedtools | MIT | https://github.com/arq5x/bedtools2/blob/master/LICENSE | | blast+ | Public Domain | https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/blast/LICENSE | | bowtie2 | GNU GPLv3 | https://github.com/BenLangmead/bowtie2/blob/master/LICENSE | +| BUSCO | MIT | https://gitlab.com/ezlab/busco/-/raw/master/LICENSE | | BWA | GNU GPLv3 | https://github.com/lh3/bwa/blob/master/COPYING | | Canu
Racon
Minimap2 | GNU GPLv3 (Canu),
MIT (Racon),
MIT (Minimap2) | https://github.com/marbl/canu/blob/master/README.license.GPL https://github.com/isovic/racon/blob/master/LICENSE https://github.com/lh3/minimap2/blob/master/LICENSE.txt | | centroid | GitHub No License | https://github.com/https://github.com/stjacqrm/centroid | diff --git a/README.md b/README.md index 5375736fd..e72aafb13 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ To learn more about the docker pull rate limits and the open source software pro | [berrywood-report-env](https://hub.docker.com/r/staphb/berrywood-report-env/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/berrywood-report-env)](https://hub.docker.com/r/staphb/berrywood-report-env) | | none | | [blast+](https://hub.docker.com/r/staphb/blast/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/blast)](https://hub.docker.com/r/staphb/blast) | | https://www.ncbi.nlm.nih.gov/books/NBK279690/ | | [bowtie2](https://hub.docker.com/r/staphb/bowtie2/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bowtie2)](https://hub.docker.com/r/staphb/bowtie2) | | http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
https://github.com/BenLangmead/bowtie2 | +| [BUSCO](https://hub.docker.com/r/staphb/busco/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/busco)](https://hub.docker.com/r/staphb/busco) | | https://busco.ezlab.org/busco_userguide.html
https://gitlab.com/ezlab/busco | | [BWA](https://hub.docker.com/r/staphb/bwa)
[![docker pulls](https://badgen.net/docker/pulls/staphb/bwa)](https://hub.docker.com/r/staphb/bwa) | | https://github.com/lh3/bwa | | [Canu](https://hub.docker.com/r/staphb/canu)
[![docker pulls](https://badgen.net/docker/pulls/staphb/canu?)](https://hub.docker.com/r/staphb/canu)| | https://canu.readthedocs.io/en/latest/
https://github.com/marbl/canu | | [Canu-Racon](https://hub.docker.com/r/staphb/canu-racon/)
[![docker pulls](https://badgen.net/docker/pulls/staphb/canu-racon)](https://hub.docker.com/r/staphb/canu-racon) | | https://canu.readthedocs.io/en/latest/
https://github.com/lbcb-sci/racon
https://github.com/isovic/racon (ARCHIVED)
https://lh3.github.io/minimap2/ |
diff --git a/busco/5.4.7/Dockerfile b/busco/5.4.7/Dockerfile
new file mode 100644
index 000000000..99618b0cf
--- /dev/null
+++ b/busco/5.4.7/Dockerfile
@@ -0,0 +1,93 @@
+FROM ubuntu:focal AS app
+
+# tool versions; override with --build-arg to build against another release
+ARG BUSCO_VER="5.4.7"
+ARG BBMAP_VER="39.01"
+ARG BLAST_VER="2.14.0"
+# ARG rather than ENV so the noninteractive frontend is not baked into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+
+LABEL base.image="ubuntu:focal"
+LABEL dockerfile.version="1"
+LABEL software="BUSCO"
+LABEL software.version="${BUSCO_VER}"
+LABEL description="Assessing genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs"
+LABEL website="https://busco.ezlab.org/"
+LABEL license="https://gitlab.com/ezlab/busco/-/raw/master/LICENSE"
+LABEL maintainer="Kutluhan Incekara"
+LABEL maintainer.email="kutluhan.incekara@ct.gov"
+
+# install dependencies
+# apt caches are cleaned in the same layer that created them;
+# the python -> python3 symlink lets the sepp setup step below call `python`
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    wget \
+    python3-biopython \
+    python3-pandas \
+    python3-setuptools \
+    hmmer \
+    prodigal \
+    augustus \
+    r-cran-ggplot2 \
+    gcc-x86-64-linux-gnu \
+    default-jre \
+    libjenkins-json-java \
+    libgoogle-gson-java \
+    libjson-java \
+    && apt-get autoclean && rm -rf /var/lib/apt/lists/* \
+    && ln -s /usr/bin/python3 /usr/bin/python
+
+# install other necessary tools
+# blast (the URL directory follows BLAST_VER so it cannot drift from the archive name)
+RUN wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/${BLAST_VER}/ncbi-blast-${BLAST_VER}+-x64-linux.tar.gz &&\
+    tar -xvf ncbi-blast-${BLAST_VER}+-x64-linux.tar.gz && rm ncbi-blast-${BLAST_VER}+-x64-linux.tar.gz
+# sepp (greengenes version)
+RUN wget https://raw.githubusercontent.com/smirarab/sepp-refs/54415e8905c5fa26cdd631c526b21f2bcdba95b5/gg/sepp-package.tar.bz &&\
+    tar xvfj sepp-package.tar.bz && rm sepp-package.tar.bz &&\
+    cd sepp-package/sepp &&\
+    python setup.py config -c && chmod 755 run_*
+# bbtools
+RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_${BBMAP_VER}.tar.gz &&\
+    tar -xvf BBMap_${BBMAP_VER}.tar.gz && rm BBMap_${BBMAP_VER}.tar.gz &&\
+    mv /bbmap/* /usr/local/bin/
+# metaeuk
+RUN wget https://github.com/soedinglab/metaeuk/releases/download/6-a5d39d9/metaeuk-linux-sse41.tar.gz &&\
+    tar -xvf metaeuk-linux-sse41.tar.gz && rm metaeuk-linux-sse41.tar.gz &&\
+    mv /metaeuk/bin/* /usr/local/bin/
+
+# and finally busco
+RUN wget https://gitlab.com/ezlab/busco/-/archive/${BUSCO_VER}/busco-${BUSCO_VER}.tar.gz &&\
+    tar -xvf busco-${BUSCO_VER}.tar.gz && \
+    rm busco-${BUSCO_VER}.tar.gz &&\
+    cd busco-${BUSCO_VER} && \
+    python3 setup.py install
+
+# runtime environment: augustus config location, plus the blast, sepp and augustus script directories on PATH
+ENV AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"
+ENV PATH="${PATH}:/ncbi-blast-${BLAST_VER}+/bin:/sepp-package/sepp:/usr/share/augustus/scripts"
+ENV LC_ALL=C
+
+WORKDIR /data
+
+# default command prints the busco help; exec form runs busco directly instead of through /bin/sh -c
+CMD ["busco", "-h"]
+
+## Tests ##
+FROM app AS test
+# redeclared so the paths below resolve whether or not the builder inherits ARGs from the parent stage; keep the default in sync with the app stage
+ARG BUSCO_VER="5.4.7"
+# run tests for bacteria and eukaryota
+RUN busco -i /busco-${BUSCO_VER}/test_data/bacteria/genome.fna -c 8 -m geno -f --out test_bacteria
+RUN busco -i /busco-${BUSCO_VER}/test_data/eukaryota/genome.fna -c 8 -m geno -f --out test_eukaryota
+RUN busco -i /busco-${BUSCO_VER}/test_data/eukaryota/genome.fna -l eukaryota_odb10 -c 8 -m geno -f --out test_eukaryota_augustus --augustus
+
+# generate plot
+RUN mkdir my_summaries &&\
+    find . -name "short_summary.*.txt" -exec cp {} my_summaries \; &&\
+    python3 /busco-${BUSCO_VER}/scripts/generate_plot.py -wd my_summaries
+
+# using actual data (Salmonella genome)
+RUN wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/010/941/835/GCA_010941835.1_PDT000052640.3/GCA_010941835.1_PDT000052640.3_genomic.fna.gz && \
+    gzip -d GCA_010941835.1_PDT000052640.3_genomic.fna.gz && \
+    busco -m genome -i GCA_010941835.1_PDT000052640.3_genomic.fna -o busco_GCA_010941835.1 --cpu 4 --auto-lineage-prok && \
+    head busco_GCA_010941835.1/short_summary*.txt
diff --git a/busco/5.4.7/README.md b/busco/5.4.7/README.md new file mode 100644 index 000000000..414b7f640 --- /dev/null +++ b/busco/5.4.7/README.md @@ -0,0 +1,94 @@ +# Assessing genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs (BUSCO) container + +Main tool : [BUSCO](https://gitlab.com/ezlab/busco/) + +Additional tools: +- BBTools 39.01 +- HMMER 3.3 +- Prodigal 2.6.3 +- BLAST+ 2.14.0 +- AUGUSTUS 3.3.3 +- MetaEuk (Release 6-a5d39d9) +- SEPP 4.5.1 +- Python 3.8.10 +- BioPython 1.76 +- R 3.6.3 +- Perl 5.30.0 +- OpenJDK 11.0.20 + +Full documentation: https://busco.ezlab.org/busco_userguide.html + +This fully functional BUSCO docker image allows you to use all the program options. All additional tools were added to satisfy the requirements of those functions. This image does not contain any lineage dataset. BUSCO downloads the passed dataset name automatically while running. If a full path is given as lineage, this automated management will be disabled. The usage options are given below. Please refer to the BUSCO manual for further information. 
+## Example Usage +### Specific lineage +```bash +busco -i assembly.fasta -l bacteria_odb10 -o output -m genome +``` +or +```bash +busco -i assembly.fasta -l /path/to/folder/bacteria_odb10 -o output -m genome +``` +### Auto lineage selection: +```bash +busco -i assembly.fasta -o output -m genome --auto-lineage-prok +``` +### Additional options: +```bash + -i FASTA FILE, --in FASTA FILE + Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. + -o OUTPUT, --out OUTPUT + Give your analysis run a recognisable short name. Output folders and files will be labelled with this name. WARNING: do not provide a path + -m MODE, --mode MODE Specify which BUSCO analysis mode to run. + There are three valid modes: + - geno or genome, for genome assemblies (DNA) + - tran or transcriptome, for transcriptome assemblies (DNA) + - prot or proteins, for annotated gene sets (protein) + -l LINEAGE, --lineage_dataset LINEAGE + Specify the name of the BUSCO lineage to be used. + --auto-lineage Run auto-lineage to find optimum lineage path + --auto-lineage-prok Run auto-lineage just on non-eukaryote trees to find optimum lineage path + --auto-lineage-euk Run auto-placement just on eukaryote tree to find optimum lineage path + -c N, --cpu N Specify the number (N=integer) of threads/cores to use. + -f, --force Force rewriting of existing files. Must be used when output files with the provided name already exist. + -r, --restart Continue a run that had already partially completed. + -q, --quiet Disable the info logs, displays only errors + --out_path OUTPUT_PATH + Optional location for results folder, excluding results folder name. Default is current working directory. + --download_path DOWNLOAD_PATH + Specify local filepath for storing BUSCO dataset downloads + --datasets_version DATASETS_VERSION + Specify the version of BUSCO datasets, e.g. 
odb10 + --download_base_url DOWNLOAD_BASE_URL + Set the url to the remote BUSCO dataset location + --update-data Download and replace with last versions all lineages datasets and files necessary to their automated selection + --offline To indicate that BUSCO cannot attempt to download files + --metaeuk_parameters METAEUK_PARAMETERS + Pass additional arguments to Metaeuk for the first run. All arguments should be contained within a single pair of quotation marks, separated by commas. E.g. "--param1=1,--param2=2" + --metaeuk_rerun_parameters METAEUK_RERUN_PARAMETERS + Pass additional arguments to Metaeuk for the second run. All arguments should be contained within a single pair of quotation marks, separated by commas. E.g. "--param1=1,--param2=2" + -e N, --evalue N E-value cutoff for BLAST searches. Allowed formats, 0.001 or 1e-03 (Default: 1e-03) + --limit REGION_LIMIT How many candidate regions (contig or transcript) to consider per BUSCO (default: 3) + --augustus Use augustus gene predictor for eukaryote runs + --augustus_parameters AUGUSTUS_PARAMETERS + Pass additional arguments to Augustus. All arguments should be contained within a single pair of quotation marks, separated by commas. E.g. "--param1=1,--param2=2" + --augustus_species AUGUSTUS_SPECIES + Specify a species for Augustus training. + --long Optimization Augustus self-training mode (Default: Off); adds considerably to the run time, but can improve results for some non-model organisms + --config CONFIG_FILE Provide a config file + -v, --version Show this version and exit + -h, --help Show this help message and exit + --list-datasets Print the list of available BUSCO datasets +``` +### Plot +Example usage of plotting script: +```bash +# collect short summaries +mkdir my_summaries +cp SPEC1/short_summary.generic.lineage1_odb10.SPEC1.txt my_summaries/. +cp SPEC2/short_summary.generic.lineage2_odb10.SPEC2.txt my_summaries/. +cp SPEC3/short_summary.specific.lineage2_odb10.SPEC3.txt my_summaries/. 
+cp SPEC4/short_summary.generic.lineage3_odb10.SPEC4.txt my_summaries/. +cp SPEC5/short_summary.generic.lineage4_odb10.SPEC5.txt my_summaries/. +# plot via script +python3 scripts/generate_plot.py -wd my_summaries +```