diff --git a/run_plassembler.ipynb b/run_plassembler.ipynb new file mode 100644 index 0000000..b4602c3 --- /dev/null +++ b/run_plassembler.ipynb @@ -0,0 +1,571 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QGd2GEI3N-02" + }, + "source": [ + "##Plassembler\n", + "\n", + "[plassembler](https://github.com/gbouras13/plassembler) is a program that is designed for automated & fast assembly of plasmids in bacterial genomes that have been hybrid sequenced with long read & paired-end short read sequencing. It was originally designed for Oxford Nanopore Technologies long reads, but it will also work with Pacbio reads. As of v1.3.0, it also works well for long-read only assembled genomes.\n", + "\n", + "The full documentation for Plassembler can be found [here](https://plassembler.readthedocs.io/en/latest).\n", + "\n", + "**To run the code cells, press the play buttons on the top left of each block**\n", + "\n", + "Main Instructions\n", + "\n", + "* Cells 1 and 2 installs plassembler and download the databases. These must be run first.\n", + "* Once they have been run, you can run either Cell 3, Cell 4 or both as many times as you wish.\n", + "* To run `plassembler run` (if you have both long- and short-reads), run Cell 3.\n", + "* To run `plassembler long` (if you have only long-reads), run Cell 4.\n", + "\n", + "Other instructions\n", + "\n", + "* Please make sure you change the runtime to CPU (GPU is not required).\n", + "* To do this, go to the top toolbar, then to Runtime -> Change runtime type -> Hardware accelerator\n", + "* You may want to upload your FASTQ files first as this takes a while for large files\n", + "* Click on the folder icon to the left and use file upload button (with the upwards facing arrow)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ii39RG8eOZUx", + "outputId": "418838e7-33a4-4389-c35a-5c95b93389c4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3.10\n", + "installing mamba...\n", + "installing plassembler ...\n", + "CPU times: user 692 ms, sys: 89.9 ms, total: 782 ms\n", + "Wall time: 2min 28s\n" + ] + } + ], + "source": [ + "#@title 1. Install plassembler\n", + "\n", + "#@markdown This cell installs plassembler.\n", + "\n", + "%%time\n", + "import os\n", + "from sys import version_info\n", + "python_version = f\"{version_info.major}.{version_info.minor}\"\n", + "PYTHON_VERSION = python_version\n", + "PLASSEMBLER_VERSION = \"1.6.2\"\n", + "\n", + "print(PYTHON_VERSION)\n", + "\n", + "if not os.path.isfile(\"MAMBA_READY\"):\n", + " print(\"installing mamba...\")\n", + " os.system(\"wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh\")\n", + " os.system(\"bash Mambaforge-Linux-x86_64.sh -bfp /usr/local\")\n", + " os.system(\"mamba config --set auto_update_conda false\")\n", + " os.system(\"touch MAMBA_READY\")\n", + "\n", + "if not os.path.isfile(\"PLASSEMBLER_READY\"):\n", + " print(\"installing plassembler ...\")\n", + " os.system(f\"mamba create -n plassemblerENV -y -c conda-forge -c bioconda python=3.9 plassembler==1.6.2 unicycler==0.5.0\")\n", + " os.system(\"touch PLASSEMBLER_READY\")\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tfltpbZ_QLfZ", + "outputId": "cc682a99-a928-4ad4-9d88-c3293c833bfc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "|████████████████████████████████████████| 381.1M/381.1M [100%] in 17.2s (22.18M/s) \n", + "Downloading plassembler database. This will take some time. Please be patient :)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "2024-04-27 04:41:58.678 | INFO | plassembler:download:1123 - Checking database installation at plassembler_db\n", + "2024-04-27 04:41:58.679 | INFO | plassembler.utils.db:check_db_installation:60 - Database directory is missing file plassembler_db/plsdb_2023_11_03_v2.msh. Plassembler Database will be downloaded.\n", + "2024-04-27 04:41:58.679 | INFO | plassembler.utils.db:get_database_zenodo:72 - Downloading Plassembler Database to the directory plassembler_db\n", + "2024-04-27 04:42:17.517 | INFO | plassembler.utils.db:get_database_zenodo:95 - Database file download OK: 3a24bacc05bb857dc044fc6662b58db7\n", + "2024-04-27 04:42:17.517 | INFO | plassembler.utils.db:get_database_zenodo:101 - Extracting DB tarball: file=plassembler_db/201123_plassembler_v1.5.0_databases.tar.gz, output=plassembler_db\n", + "2024-04-27 04:42:23.488 | INFO | plassembler.utils.db:get_database_zenodo:104 - Plassembler Database download into plassembler_db successful.\n" + ] + } + ], + "source": [ + "#@title 2. Download plassembler database\n", + "\n", + "#@markdown This cell downloads the plassembler database.\n", + "#@markdown It will take some time (5-10 mins). Please be patient.\n", + "\n", + "%%bash\n", + "source activate plassemblerENV\n", + "python\n", + "import os\n", + "\n", + "print(\"Downloading plassembler database. This will take some time. Please be patient :)\")\n", + "os.system(\"plassembler download -d plassembler_db\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true + }, + "id": "Sjdpu6R-Kig9", + "outputId": "bb98e816-3b90-4a80-8642-7927045ba31e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input file E_faecalis_CI_1049.fastq.gz exists\n", + "Input file 1049_S46_R1_001.fastq.gz exists\n", + "Input file 1049_S46_R2_001.fastq.gz exists\n", + "Running plassembler run\n", + "plassembler run completed successfully.\n", + "Your output is in plassembler_output.\n", + "Zipping the output directory so you can download it all in one go.\n", + "Output directory has been zipped to plassembler_output.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-23 12:51:34.343 | INFO | plassembler:begin_plassembler:100 - You are using Plassembler version 1.6.2\n", + "2024-04-23 12:51:34.344 | INFO | plassembler:begin_plassembler:101 - Repository homepage is https://github.com/gbouras13/plassembler\n", + "2024-04-23 12:51:34.344 | INFO | plassembler:begin_plassembler:102 - Written by George Bouras: george.bouras@adelaide.edu.au\n", + "2024-04-23 12:51:34.345 | INFO | plassembler:run:405 - Database directory is plassembler_db\n", + "2024-04-23 12:51:34.345 | INFO | plassembler:run:406 - Longreads file is E_faecalis_CI_1049.fastq.gz\n", + "2024-04-23 12:51:34.345 | INFO | plassembler:run:407 - R1 fasta file is 1049_S46_R1_001.fastq.gz\n", + "2024-04-23 12:51:34.345 | INFO | plassembler:run:408 - R2 fasta file is 1049_S46_R2_001.fastq.gz\n", + "2024-04-23 12:51:34.346 | INFO | plassembler:run:409 - Chromosome length threshold is 1000000\n", + "2024-04-23 12:51:34.346 | INFO | plassembler:run:410 - Output directory is plassembler_output\n", + "2024-04-23 12:51:34.346 | INFO | plassembler:run:411 - Min long read length is 1000\n", + "2024-04-23 12:51:34.346 | INFO | plassembler:run:412 - Min long read quality is 9\n", + "2024-04-23 12:51:34.346 | INFO | plassembler:run:413 - Thread count is 2\n", + "2024-04-23 12:51:34.347 | INFO | plassembler:run:414 - --force is True\n", + "2024-04-23 12:51:34.347 | INFO | plassembler:run:415 - --skip_qc is False\n", + "2024-04-23 12:51:34.347 | INFO | plassembler:run:416 - --raw_flag is False\n", + "2024-04-23 12:51:34.347 | INFO | plassembler:run:417 - --pacbio_model is nothing\n", + "2024-04-23 12:51:34.348 | INFO | plassembler:run:418 - --keep_fastqs is False\n", + "2024-04-23 12:51:34.348 | INFO | plassembler:run:419 - --keep_chromosome is False\n", + "2024-04-23 12:51:34.348 | INFO | plassembler:run:420 - --flye_directory is nothing\n", + "2024-04-23 12:51:34.348 | INFO | plassembler:run:421 - --flye_assembly is nothing\n", + "2024-04-23 12:51:34.349 | INFO | plassembler:run:422 - --flye_info is nothing\n", + "2024-04-23 12:51:34.349 | INFO | plassembler:run:423 - --no_chromosome is False\n", + "2024-04-23 12:51:34.349 | INFO | plassembler:run:424 - --use_raven is False\n", + "2024-04-23 12:51:34.349 | INFO | plassembler:run:425 - --depth_filter is 0.25\n", + "2024-04-23 12:51:34.349 | INFO | plassembler:run:426 - --unicycler_options is None\n", + "2024-04-23 12:51:34.350 | INFO | plassembler:run:427 - --spades_options is None\n", + "2024-04-23 12:51:34.350 | INFO | plassembler:run:431 - Checking dependencies\n", + "2024-04-23 12:51:34.473 | INFO | plassembler.utils.input_commands:check_dependencies:199 - Flye version found is v2.9.3-b1797.\n", + "2024-04-23 12:51:34.474 | INFO | plassembler.utils.input_commands:check_dependencies:209 - Flye version is ok.\n", + "2024-04-23 12:51:34.490 | INFO | plassembler.utils.input_commands:check_dependencies:218 - Raven v1.8.3 found.\n", + "2024-04-23 12:51:34.491 | INFO | plassembler.utils.input_commands:check_dependencies:220 - Raven version is ok.\n", + "2024-04-23 12:51:34.639 | INFO | plassembler.utils.input_commands:check_dependencies:242 - Unicycler version found is v0.5.0.\n", + "2024-04-23 12:51:34.640 | INFO | plassembler.utils.input_commands:check_dependencies:255 - Unicycler version is ok.\n", + "2024-04-23 12:51:35.165 | INFO | plassembler.utils.input_commands:check_dependencies:265 - SPAdes v3.15.5 found.\n", + "2024-04-23 12:51:35.195 | INFO | plassembler.utils.input_commands:check_dependencies:278 - Samtools v1.20 found.\n", + "2024-04-23 12:51:35.209 | INFO | plassembler.utils.input_commands:check_dependencies:289 - minimap2 v2.28-r1209 found.\n", + "2024-04-23 12:51:35.233 | INFO | plassembler.utils.input_commands:check_dependencies:300 - fastp v0.23.4 found.\n", + "2024-04-23 12:51:35.263 | INFO | plassembler.utils.input_commands:check_dependencies:311 - chopper v0.8.0 found.\n", + "2024-04-23 12:51:35.292 | INFO | plassembler.utils.input_commands:check_dependencies:326 - mash v2.3 found.\n", + "2024-04-23 12:51:35.646 | INFO | plassembler.utils.input_commands:check_dependencies:337 - canu v2.2 found.\n", + "2024-04-23 12:51:36.810 | INFO | plassembler.utils.input_commands:check_dependencies:348 - dnaapler v0.7.0 found.\n", + "2024-04-23 12:51:36.965 | INFO | plassembler.utils.input_commands:check_dependencies:365 - BLAST version found is v2.15.0.\n", + "2024-04-23 12:51:36.966 | INFO | plassembler.utils.input_commands:check_dependencies:371 - All dependencies found.\n", + "2024-04-23 12:51:36.966 | INFO | plassembler:run:436 - Checking database installation.\n", + "2024-04-23 12:51:36.967 | INFO | plassembler.utils.db:check_db_installation:52 - PLSDB Database mash sketch at plassembler_db/plsdb_2023_11_03_v2.msh exists.\n", + "2024-04-23 12:51:36.967 | INFO | plassembler.utils.db:check_db_installation:53 - PLSDB Database tsv metadata file at plassembler_db/plsdb_2023_11_03_v2.tsv exists.\n", + "2024-04-23 12:51:36.967 | INFO | plassembler.utils.db:check_db_installation:54 - PLSDB Database at plassembler_db has already been downloaded\n", + "2024-04-23 12:51:36.967 | INFO | plassembler:run:439 - Database successfully checked.\n", + "2024-04-23 12:51:36.968 | INFO | plassembler:run:442 - Checking input fastqs.\n", + "2024-04-23 12:51:36.980 | INFO | plassembler.utils.input_commands:validate_fastq:24 - FASTQ E_faecalis_CI_1049.fastq.gz checked\n", + "2024-04-23 12:51:36.981 | INFO | plassembler.utils.input_commands:validate_fastq:24 - FASTQ 1049_S46_R1_001.fastq.gz checked\n", + "2024-04-23 12:51:36.981 | INFO | plassembler.utils.input_commands:validate_fastq:24 - FASTQ 1049_S46_R2_001.fastq.gz checked\n", + "2024-04-23 12:51:36.982 | INFO | plassembler:run:450 - FASTQ file E_faecalis_CI_1049.fastq.gz compression is True\n", + "2024-04-23 12:51:36.982 | INFO | plassembler:run:451 - FASTQ file 1049_S46_R1_001.fastq.gz compression is True\n", + "2024-04-23 12:51:36.982 | INFO | plassembler:run:452 - FASTQ file 1049_S46_R2_001.fastq.gz compression is True\n", + "2024-04-23 12:51:36.982 | INFO | plassembler:run:460 - Filtering long reads with chopper\n", + "2024-04-23 12:51:36.982 | INFO | plassembler.utils.qc:chopper:25 - Started running chopper\n", + "2024-04-23 12:52:45.773 | INFO | plassembler.utils.qc:chopper:82 - Finished running chopper\n", + "2024-04-23 12:52:45.775 | INFO | plassembler:run:484 - Running Flye.\n", + "2024-04-23 12:52:45.776 | INFO | plassembler.utils.external_tools:run:58 - Started running flye --nano-hq plassembler_output/chopper_long_reads.fastq.gz --out-dir plassembler_output --threads 2 ...\n", + "2024-04-23 13:11:02.932 | INFO | plassembler.utils.external_tools:run:60 - Done running flye --nano-hq plassembler_output/chopper_long_reads.fastq.gz --out-dir plassembler_output --threads 2\n", + "2024-04-23 13:11:02.934 | INFO | plassembler:run:535 - Counting Contigs.\n", + "2024-04-23 13:11:02.966 | INFO | plassembler.utils.plass_class:get_contig_count:87 - Assembled 8 contigs.\n", + "2024-04-23 13:11:02.966 | INFO | plassembler:run:736 - More than one contig was assembled with Flye.\n", + "2024-04-23 13:11:02.967 | INFO | plassembler:run:737 - Extracting Chromosome.\n", + "2024-04-23 13:11:03.059 | INFO | plassembler:run:772 - Chromosome Identified. Plassembler will now use long and short reads to assemble plasmids accurately.\n", + "2024-04-23 13:11:03.059 | INFO | plassembler:run:774 - Mapping long reads.\n", + "2024-04-23 13:11:03.060 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running minimap2 -ax map-ont -t 2 plassembler_output/flye_renamed.fasta plassembler_output/chopper_long_reads.fastq.gz ...\n", + "2024-04-23 13:12:28.265 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running minimap2 -ax map-ont -t 2 plassembler_output/flye_renamed.fasta plassembler_output/chopper_long_reads.fastq.gz\n", + "2024-04-23 13:12:28.266 | INFO | plassembler:run:789 - Trimming short reads.\n", + "2024-04-23 13:12:28.268 | INFO | plassembler.utils.external_tools:run:58 - Started running fastp --out1 plassembler_output/trimmed_R1.fastq --out2 plassembler_output/trimmed_R2.fastq --in1 1049_S46_R1_001.fastq.gz --in2 1049_S46_R2_001.fastq.gz ...\n", + "2024-04-23 13:13:36.724 | INFO | plassembler.utils.external_tools:run:60 - Done running fastp --out1 plassembler_output/trimmed_R1.fastq --out2 plassembler_output/trimmed_R2.fastq --in1 1049_S46_R1_001.fastq.gz --in2 1049_S46_R2_001.fastq.gz\n", + "2024-04-23 13:13:36.725 | INFO | plassembler:run:793 - Mapping short reads.\n", + "2024-04-23 13:13:36.726 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running minimap2 -ax sr -t 2 plassembler_output/flye_renamed.fasta plassembler_output/trimmed_R1.fastq plassembler_output/trimmed_R2.fastq ...\n", + "2024-04-23 13:17:18.115 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running minimap2 -ax sr -t 2 plassembler_output/flye_renamed.fasta plassembler_output/trimmed_R1.fastq plassembler_output/trimmed_R2.fastq\n", + "2024-04-23 13:17:18.116 | INFO | plassembler:run:800 - Processing Sam/Bam Files and extracting Fastqs.\n", + "2024-04-23 13:17:20.121 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running samtools view -h -@ 2 -b plassembler_output/short_read.sam ...\n", + "2024-04-23 13:18:19.739 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running samtools view -h -@ 2 -b plassembler_output/short_read.sam\n", + "2024-04-23 13:18:19.742 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running samtools view -b -h -@ 2 -L plassembler_output/non_chromosome.bed plassembler_output/short_read.bam ...\n", + "2024-04-23 13:18:30.906 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running samtools view -b -h -@ 2 -L plassembler_output/non_chromosome.bed plassembler_output/short_read.bam\n", + "2024-04-23 13:18:30.907 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running samtools view -b -h -f 4 -@ 2 plassembler_output/short_read.bam ...\n", + "2024-04-23 13:18:33.824 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running samtools view -b -h -f 4 -@ 2 plassembler_output/short_read.bam\n", + "2024-04-23 13:18:33.826 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running samtools view -b -h -@ 2 -L plassembler_output/chromosome.bed plassembler_output/short_read.bam ...\n", + "2024-04-23 13:19:28.383 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running samtools view -b -h -@ 2 -L plassembler_output/chromosome.bed plassembler_output/short_read.bam\n", + "2024-04-23 13:19:28.384 | INFO | plassembler.utils.external_tools:run:58 - Started running samtools fastq -@ 2 plassembler_output/unmapped_bam_file.bam -1 plassembler_output/unmapped_R1.fastq -2 plassembler_output/unmapped_R2.fastq -0 /dev/null -s /dev/null -n ...\n", + "2024-04-23 13:19:28.420 | INFO | plassembler.utils.external_tools:run:60 - Done running samtools fastq -@ 2 plassembler_output/unmapped_bam_file.bam -1 plassembler_output/unmapped_R1.fastq -2 plassembler_output/unmapped_R2.fastq -0 /dev/null -s /dev/null -n\n", + "2024-04-23 13:19:28.421 | INFO | plassembler.utils.external_tools:run:58 - Started running samtools fastq -@ 2 plassembler_output/non_chromosome.bam -1 plassembler_output/mapped_non_chromosome_R1.fastq -2 plassembler_output/mapped_non_chromosome_R2.fastq -0 /dev/null -s /dev/null -n ...\n", + "2024-04-23 13:19:30.721 | INFO | plassembler.utils.external_tools:run:60 - Done running samtools fastq -@ 2 plassembler_output/non_chromosome.bam -1 plassembler_output/mapped_non_chromosome_R1.fastq -2 plassembler_output/mapped_non_chromosome_R2.fastq -0 /dev/null -s /dev/null -n\n", + "2024-04-23 13:20:22.612 | INFO | plassembler:run:817 - Running Unicycler.\n", + "2024-04-23 13:20:22.613 | INFO | plassembler.utils.external_tools:run:58 - Started running unicycler -1 plassembler_output/short_read_concat_R1.fastq -2 plassembler_output/short_read_concat_R2.fastq -l plassembler_output/plasmid_long.fastq -t 2 -o plassembler_output/unicycler_output ...\n", + "2024-04-23 15:27:09.255 | INFO | plassembler.utils.external_tools:run:60 - Done running unicycler -1 plassembler_output/short_read_concat_R1.fastq -2 plassembler_output/short_read_concat_R2.fastq -l plassembler_output/plasmid_long.fastq -t 2 -o plassembler_output/unicycler_output\n", + "2024-04-23 15:27:09.258 | INFO | plassembler:run:842 - Unicycler identified plasmids. Calculating Plasmid Copy Numbers.\n", + "2024-04-23 15:27:09.460 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running minimap2 -ax map-ont -t 2 plassembler_output/combined.fasta plassembler_output/chopper_long_reads.fastq.gz ...\n", + "2024-04-23 15:29:09.453 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running minimap2 -ax map-ont -t 2 plassembler_output/combined.fasta plassembler_output/chopper_long_reads.fastq.gz\n", + "2024-04-23 15:29:09.456 | INFO | plassembler.utils.external_tools:run:58 - Started running samtools sort -@ 2 plassembler_output/combined_long.sam -o plassembler_output/combined_sorted_long.bam ...\n", + "2024-04-23 15:29:21.504 | INFO | plassembler.utils.external_tools:run:60 - Done running samtools sort -@ 2 plassembler_output/combined_long.sam -o plassembler_output/combined_sorted_long.bam\n", + "2024-04-23 15:29:21.507 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running minimap2 -ax sr -t 2 plassembler_output/combined.fasta plassembler_output/trimmed_R1.fastq plassembler_output/trimmed_R2.fastq ...\n", + "2024-04-23 15:33:04.675 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running minimap2 -ax sr -t 2 plassembler_output/combined.fasta plassembler_output/trimmed_R1.fastq plassembler_output/trimmed_R2.fastq\n", + "2024-04-23 15:33:04.677 | INFO | plassembler.utils.external_tools:run:58 - Started running samtools sort -@ 2 plassembler_output/combined_short.sam -o plassembler_output/combined_sorted_short.bam ...\n", + "2024-04-23 15:34:06.059 | INFO | plassembler.utils.external_tools:run:60 - Done running samtools sort -@ 2 plassembler_output/combined_short.sam -o plassembler_output/combined_sorted_short.bam\n", + "2024-04-23 15:34:56.744 | INFO | plassembler:run:850 - Calculating mash distances to PLSDB.\n", + "2024-04-23 15:34:56.755 | INFO | plassembler.utils.external_tools:run:58 - Started running mash sketch plassembler_output/plasmids.fasta -i ...\n", + "2024-04-23 15:34:57.021 | INFO | plassembler.utils.external_tools:run:60 - Done running mash sketch plassembler_output/plasmids.fasta -i\n", + "2024-04-23 15:34:57.024 | INFO | plassembler.utils.external_tools:run_to_stdout:67 - Started running mash dist plassembler_output/plasmids.fasta.msh plassembler_db/plsdb_2023_11_03_v2.msh -v 0.1 -d 0.1 -i ...\n", + "2024-04-23 15:35:05.574 | INFO | plassembler.utils.external_tools:run_to_stdout:69 - Done running mash dist plassembler_output/plasmids.fasta.msh plassembler_db/plsdb_2023_11_03_v2.msh -v 0.1 -d 0.1 -i\n", + "2024-04-23 15:35:07.113 | INFO | plassembler.utils.plass_class:combine_depth_mash_tsvs:751 - Filtering contigs below depth filter: 0.25.\n", + "2024-04-23 15:35:07.113 | INFO | plassembler.utils.plass_class:combine_depth_mash_tsvs:753 - All plasmids whose short and long read copy numbers are both below 0.25 will be removed.\n", + "2024-04-23 15:35:07.114 | INFO | plassembler.utils.plass_class:combine_depth_mash_tsvs:769 - No plasmids were filtered due to low depth.\n", + "2024-04-23 15:35:08.341 | INFO | plassembler:end_plassembler:117 - Plassembler has finished\n", + "2024-04-23 15:35:08.341 | INFO | plassembler:end_plassembler:118 - Elapsed time: 9814.03 seconds\n" + ] + } + ], + "source": [ + "#@title 3. Plassembler Run (Hybrid reads)\n", + "\n", + "#@markdown This will probably take a while (depends on your read sets - an hour or two probably: best to put it on over lunch) as the colab environment has limited resources.\n", + "\n", + "#@markdown First, upload your long-reads as a single input .fastq or .fastq.gz file\n", + "\n", + "#@markdown Click on the folder icon to the left and use file upload button.\n", + "\n", + "#@markdown Once it is uploaded, write the file name in the LONG_FASTQ field on the right.\n", + "\n", + "#@markdown Then, upload your short-reads as 2 input .fastq or .fastq.gz files\n", + "\n", + "#@markdown Click on the folder icon to the left and use file upload button.\n", + "\n", + "#@markdown Once they are uploaded, write the forward (R1) file name in the R1_FASTQ field on the right and the the reverse (R2) file name in the R2_FASTQ field on the right.\n", + "\n", + "#@markdown Then provide a directory for plassembler's output using PLASSEMBLER_OUT_DIR.\n", + "#@markdown The default is 'plassembler_output'.\n", + "\n", + "#@markdown Then provide an estimated chromosome size (as an integer) name using CHROMOSOME.\n", + "#@markdown The default is 1000000.\n", + "\n", + "#@markdown You can also provide a min_length for QC filtering the long read data with MIN_LENGTH.\n", + "#@markdown If you provide nothing it will default to 1000.\n", + "\n", + "#@markdown You can also provide a min_quality for QC filtering the long read data with MIN_QUALITY.\n", + "#@markdown If you provide nothing it will default to 9.\n", + "\n", + "#@markdown You can click SKIP_QC to turn off qc (fastp and filtlong).\n", + "#@markdown By default it is False.\n", + "\n", + "#@markdown You can click RAW to pass --nano-raw for Flye. Designed for Guppy fast\n", + "#@markdown configuration reads. By default, Flye will assume\n", + "#@markdown SUP or HAC reads and use --nano-hq.\n", + "\n", + "#@markdown If you have pacbio reads, please change PACBIO_MODEL\n", + "#@markdown from 'none' to one of 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw'. Use pacbio-raw for\n", + "#@markdown PacBio regular CLR reads (<20 percent error),\n", + "#@markdown pacbio-corr for PacBio reads that were corrected with other methods (<3 percent error) or pacbio-\n", + "#@markdown hifi for PacBio HiFi reads (<1 percent error).\n", + "\n", + "#@markdown You can click FORCE to overwrite the output directory.\n", + "#@markdown This may be useful if your earlier pharokka run has crashed for whatever reason.\n", + "\n", + "#@markdown The results of `plassembler run` will be in the folder icon on the left hand panel.\n", + "#@markdown Additionally, it will be zipped so you can download the whole directory.\n", + "\n", + "#@markdown The file to download is PLASSEMBLER_OUT_DIR.zip, where PLASSEMBLER_OUT_DIR is what you provided\n", + "\n", + "#@markdown If you do not see the output directory,\n", + "#@markdown refresh the window by either clicking the folder with the refresh icon below \"Files\"\n", + "#@markdown or double click and select \"Refresh\".\n", + "\n", + "%%bash\n", + "source activate plassemblerENV\n", + "\n", + "python\n", + "import os\n", + "import sys\n", + "import subprocess\n", + "import zipfile\n", + "\n", + "LONG_FASTQ = '' #@param {type:\"string\"}\n", + "R1_FASTQ = '' #@param {type:\"string\"}\n", + "R2_FASTQ = '' #@param {type:\"string\"}\n", + "THREADS = \"2\"\n", + "\n", + "if os.path.exists(LONG_FASTQ):\n", + " print(f\"Input file {LONG_FASTQ} exists\")\n", + "else:\n", + " print(f\"Error: File {LONG_FASTQ} does not exist\")\n", + " print(f\"Please check the spelling and that you have uploaded it correctly\")\n", + " sys.exit(1)\n", + "\n", + "if os.path.exists(R1_FASTQ):\n", + " print(f\"Input file {R1_FASTQ} exists\")\n", + "else:\n", + " print(f\"Error: File {R1_FASTQ} does not exist\")\n", + " print(f\"Please check the spelling and that you have uploaded it correctly\")\n", + " sys.exit(1)\n", + "\n", + "if os.path.exists(R2_FASTQ):\n", + " print(f\"Input file {R2_FASTQ} exists\")\n", + "else:\n", + " print(f\"Error: File {R2_FASTQ} does not exist\")\n", + " print(f\"Please check the spelling and that you have uploaded it correctly\")\n", + " sys.exit(1)\n", + "\n", + "PLASSEMBLER_OUT_DIR = 'plassembler_output' #@param {type:\"string\"}\n", + "CHROMOSOME = 1000000 #@param {type:\"integer\"}\n", + "MIN_LENGTH = 1000 #@param {type:\"integer\"}\n", + "MIN_QUALITY = 9 #@param {type:\"integer\"}\n", + "SKIP_QC = False #@param {type:\"boolean\"}\n", + "RAW = False #@param {type:\"boolean\"}\n", + "PACBIO_MODEL = 'none' #@param {type:\"string\"}\n", + "allowed_gene_predictors = ['none', 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw']\n", + "# Check if the input parameter is valid\n", + "if PACBIO_MODEL.lower() not in allowed_gene_predictors:\n", + " raise ValueError(\"Invalid PACBIO_MODEL. Please choose from: 'none', 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw'.\")\n", + "\n", + "FORCE = False #@param {type:\"boolean\"}\n", + "\n", + "\n", + "# Construct the command\n", + "command = f\"plassembler run -d plassembler_db -c {CHROMOSOME} -l {LONG_FASTQ} -1 {R1_FASTQ} -2 {R2_FASTQ} -o {PLASSEMBLER_OUT_DIR} -t {THREADS} --min_length {MIN_LENGTH} --min_quality {MIN_QUALITY}\"\n", + "\n", + "if SKIP_QC is True:\n", + " command = f\"{command} --skip_qc\"\n", + "\n", + "if RAW is True:\n", + " command = f\"{command} -r\"\n", + "\n", + "if FORCE is True:\n", + " command = f\"{command} -f\"\n", + "\n", + "if PACBIO_MODEL != 'none':\n", + " command = f\"{command} --pacbio_model {PACBIO_MODEL}\"\n", + "\n", + "\n", + "# Execute the command\n", + "try:\n", + " print(\"Running plassembler run\")\n", + " subprocess.run(command, shell=True, check=True)\n", + " print(\"plassembler run completed successfully.\")\n", + " print(f\"Your output is in {PLASSEMBLER_OUT_DIR}.\")\n", + " print(f\"Zipping the output directory so you can download it all in one go.\")\n", + "\n", + " zip_filename = f\"{PLASSEMBLER_OUT_DIR}.zip\"\n", + "\n", + " # Zip the contents of the output directory\n", + " with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:\n", + " for root, dirs, files in os.walk(PLASSEMBLER_OUT_DIR):\n", + " for file in files:\n", + " zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), PLASSEMBLER_OUT_DIR))\n", + " print(f\"Output directory has been zipped to {zip_filename}\")\n", + "\n", + "\n", + "except subprocess.CalledProcessError as e:\n", + " print(f\"Error occurred: {e}\")\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9QfjP3q-Q04f" + }, + "outputs": [], + "source": [ + "#@title 4. Plassembler Long (long-reads only)\n", + "\n", + "#@markdown This will probably take a while (best to put it on overnight or over lunch) as the colab environment has limited resources.\n", + "\n", + "#@markdown First, upload your long-reads as a single input .fastq or .fastq.gz file\n", + "\n", + "#@markdown Click on the folder icon to the left and use file upload button.\n", + "\n", + "#@markdown Once it is uploaded, write the file name in the LONG_FASTQ field on the right.\n", + "\n", + "#@markdown Then provide a directory for plassembler's output using PLASSEMBLER_OUT_DIR.\n", + "#@markdown The default is 'plassembler_output'.\n", + "\n", + "#@markdown Then provide an estimated chromosome size (as an integer) name using CHROMOSOME.\n", + "#@markdown The default is 1000000.\n", + "\n", + "#@markdown You can also provide a min_length for QC filtering the long read data with MIN_LENGTH.\n", + "#@markdown If you provide nothing it will default to 1000.\n", + "\n", + "#@markdown You can also provide a min_quality for QC filtering the long read data with MIN_QUALITY.\n", + "#@markdown If you provide nothing it will default to 9.\n", + "\n", + "#@markdown You can click SKIP_QC to turn off qc (filtlong).\n", + "#@markdown By default it is False.\n", + "\n", + "#@markdown You can click RAW to pass --nano-raw for Flye. Designed for Guppy fast\n", + "#@markdown configuration reads. By default, Flye will assume\n", + "#@markdown SUP or HAC reads and use --nano-hq.\n", + "\n", + "#@markdown If you have pacbio reads, please change PACBIO_MODEL\n", + "#@markdown from 'none' to one of 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw'. Use pacbio-raw for\n", + "#@markdown PacBio regular CLR reads (<20 percent error),\n", + "#@markdown pacbio-corr for PacBio reads that were corrected with other methods (<3 percent error) or pacbio-\n", + "#@markdown hifi for PacBio HiFi reads (<1 percent error).\n", + "\n", + "#@markdown You can click FORCE to overwrite the output directory.\n", + "#@markdown This may be useful if your earlier pharokka run has crashed for whatever reason.\n", + "\n", + "#@markdown The results of `plassembler long` will be in the folder icon on the left hand panel.\n", + "#@markdown Additionally, it will be zipped so you can download the whole directory.\n", + "\n", + "#@markdown The file to download is PLASSEMBLER_OUT_DIR.zip, where PLASSEMBLER_OUT_DIR is what you provided\n", + "\n", + "#@markdown If you do not see the output directory,\n", + "#@markdown refresh the window by either clicking the folder with the refresh icon below \"Files\"\n", + "#@markdown or double click and select \"Refresh\".\n", + "\n", + "\n", + "%%bash\n", + "source activate plassemblerENV\n", + "\n", + "python\n", + "import os\n", + "import sys\n", + "import subprocess\n", + "import zipfile\n", + "LONG_FASTQ = '' #@param {type:\"string\"}\n", + "THREADS = \"2\"\n", + "\n", + "if os.path.exists(LONG_FASTQ):\n", + " print(f\"Input file {LONG_FASTQ} exists\")\n", + "else:\n", + " print(f\"Error: File {LONG_FASTQ} does not exist\")\n", + " print(f\"Please check the spelling and that you have uploaded it correctly\")\n", + " sys.exit(1)\n", + "\n", + "\n", + "PLASSEMBLER_OUT_DIR = 'plassembler_output' #@param {type:\"string\"}\n", + "CHROMOSOME = 1000000 #@param {type:\"integer\"}\n", + "MIN_LENGTH = 1000 #@param {type:\"integer\"}\n", + "MIN_QUALITY = 9 #@param {type:\"integer\"}\n", + "SKIP_QC = False #@param {type:\"boolean\"}\n", + "RAW = False #@param {type:\"boolean\"}\n", + "PACBIO_MODEL = 'none' #@param {type:\"string\"}\n", + "allowed_gene_predictors = ['none', 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw']\n", + "# Check if the input parameter is valid\n", + "if PACBIO_MODEL.lower() not in allowed_gene_predictors:\n", + " raise ValueError(\"Invalid PACBIO_MODEL. Please choose from: 'none', 'pacbio-hifi', 'pacbio-corr', 'pacbio-raw'.\")\n", + "\n", + "FORCE = False #@param {type:\"boolean\"}\n", + "\n", + "# Construct the command\n", + "command = f\"plassembler long -d plassembler_db -c {CHROMOSOME} -l {LONG_FASTQ} -o {PLASSEMBLER_OUT_DIR} -t {THREADS} --min_length {MIN_LENGTH} --min_quality {MIN_QUALITY}\"\n", + "\n", + "if SKIP_QC is True:\n", + " command = f\"{command} --skip_qc\"\n", + "\n", + "if RAW is True:\n", + " command = f\"{command} -r\"\n", + "\n", + "if FORCE is True:\n", + " command = f\"{command} -f\"\n", + "\n", + "if PACBIO_MODEL != 'none':\n", + " command = f\"{command} --pacbio_model {PACBIO_MODEL}\"\n", + "\n", + "# Execute the command\n", + "try:\n", + " print(\"Running plassembler long\")\n", + " subprocess.run(command, shell=True, check=True)\n", + " print(\"plassembler long completed successfully.\")\n", + " print(f\"Your output is in {PLASSEMBLER_OUT_DIR}.\")\n", + " print(f\"Zipping the output directory so you can download it all in one go.\")\n", + "\n", + " zip_filename = f\"{PLASSEMBLER_OUT_DIR}.zip\"\n", + "\n", + " # Zip the contents of the output directory\n", + " with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:\n", + " for root, dirs, files in os.walk(PLASSEMBLER_OUT_DIR):\n", + " for file in files:\n", + " zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), PLASSEMBLER_OUT_DIR))\n", + " print(f\"Output directory has been zipped to {zip_filename}\")\n", + "\n", + "\n", + "except subprocess.CalledProcessError as e:\n", + " print(f\"Error occurred: {e}\")\n", + "\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file