Merge pull request #42 from gbouras13/dev

v1.5.0 update DB with PLSDB upgrade
gbouras13 · Nov 21, 2023 · 117bde9 · 117bde9
2 parents c8cb954 + d6974d2
commit 117bde9
Show file tree

Hide file tree

Showing 14 changed files with 199 additions and 149 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,11 @@
 # History
 
+1.5.0 (2023-11-21)
+------------------
+
+* **If you upgrade to v1.5.0, you will need to update the database using `plassembler download`** 
+* Plassembler v1.5.0 incorporates a new database thanks to the recent PLSDB release [2023_11_03_v2](https://ccb-microbe.cs.uni-saarland.de/plsdb/). Thanks @[biobrad](https://github.com/biobrad) for the heads up.
+
 1.4.1 (2023-10-30)
 ------------------
 

diff --git a/README.md b/README.md
@@ -24,6 +24,22 @@ Additionally, I would recommend reading the following guides to bacterial genome
 *  [Perfect Bacterial Assembly Tutorial](https://github.com/rrwick/Perfect-bacterial-genome-tutorial)
 *  [Perfect bacterial assembly Paper](https://doi.org/10.1371/journal.pcbi.1010905)
 
+## Quick Start
+
+The easiest way to install `plassembler` is via conda:
+
+`conda install -c bioconda plassembler`
+
+Followed by database download and installation:
+
+`plassembler download -d <database directory>`
+
+And finally run `plassembler`:
+
+`plassembler run -d <database directory> -l <long read fastq> -o <output dir> -1 <short read R1 fastq> -2 <short read R2 fastq> -c <estimated chromosome length>`
+
+Please read the [Installation](#installation) section for more details, especially if you are an inexperienced command line user.
+
 ## Manuscript
 
 `plassembler` has been recently published in *Bioinformatics*:
@@ -32,13 +48,19 @@ George Bouras, Anna E. Sheppard, Vijini Mallawaarachchi, Sarah Vreugde, Plassemb
 
 If you use `plassembler`, please see the full [Citations](#citations) section for a list of all programs `plassembler` uses under the hood, in order to fully recognise the creators of these tools for their work.
 
+## Documentation
+
+The full documentation for Plassembler can be found [here](https://plassembler.readthedocs.io/en/latest).
+
 ## Table of Contents 
 
 - [plassembler](#plassembler)
   - [Automated Bacterial Plasmid Assembly Program](#automated-bacterial-plasmid-assembly-program)
+  - [Quick Start](#quick-start)
   - [Manuscript](#manuscript)
+  - [Documentation](#documentation)
   - [Table of Contents](#table-of-contents)
-  - [Quick Start](#quick-start)
+  - [`plassembler` v1.5.0 Update New Database (21 November 2023)](#plassembler-v150-update-new-database-21-november-2023)
   - [`plassembler` v1.3.0 Updates (24 October 2023)](#plassembler-v130-updates-24-october-2023)
   - [Why Does Plassembler Exist?](#why-does-plassembler-exist)
   - [Why Not Just Use Unicycler?](#why-not-just-use-unicycler)
@@ -58,21 +80,11 @@ If you use `plassembler`, please see the full [Citations](#citations) section fo
   - [Bugs and Suggestions](#bugs-and-suggestions)
   - [Citations](#citations)
 
-## Quick Start
-
-The easiest way to install `plassembler` is via conda:
-
-`conda install -c bioconda plassembler`
-
-Followed by database download and installation:
-
-`plassembler download -d <databse directory>`
-
-And finally run `plassembler`:
+## `plassembler` v1.5.0 Update New Database (21 November 2023)
 
-`plassembler run -d <database directory> -l <long read fastq> -o <output dir> -1 < short read R1 fastq> -2 < short read R2 fastq>  -c <estimated chromosome length>`
+* **If you upgrade to v1.5.0, you will need to update the database using `plassembler download`** 
+* Plassembler v1.5.0 incorporates a new expanded database thanks to the recent PLSDB release [2023_11_03_v2](https://ccb-microbe.cs.uni-saarland.de/plsdb/). Thanks @[biobrad](https://github.com/biobrad) for the heads up.
 
-Please read the [Installation](#installation) section for more details, especially if you are an inexperienced command line user.
 
 ## `plassembler` v1.3.0 Updates (24 October 2023)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plassembler"
-version = "1.4.1" # change VERSION too
+version = "1.5.0" # change VERSION too
 description = "Quickly and accurately assemble plasmids in hybrid sequenced bacterial isolates"
 authors = ["George Bouras <[email protected]>"]
 license = "MIT"

diff --git a/src/plassembler/__init__.py b/src/plassembler/__init__.py
@@ -1496,9 +1496,7 @@ def long(
                 corrected_fastqs: Path = Path(outdir) / "corrected_plasmid_long.fastq"
                 corrected_fasta_to_fastq(canu_reads, corrected_fastqs)
             except:
-                logger.warning(
-                    "canu correct failed to correct any reads. Advancing with uncorrected reads"
-                )
+                logger.warning("Advancing with uncorrected reads")
                 corrected_fastqs = entropy_filtered_fastq
 
             # remove canu directory

diff --git a/src/plassembler/utils/VERSION b/src/plassembler/utils/VERSION
@@ -1 +1 @@
-1.4.1
+1.5.0
diff --git a/src/plassembler/utils/db.py b/src/plassembler/utils/db.py
@@ -23,12 +23,14 @@ def check_db_installation(db_dir: Path, install_flag: bool):
     """
     # Mash files
 
-    mash_db_names = ["plsdb.msh", "plsdb.tsv"]
+    mash_db_names = ["plsdb_2023_11_03_v2.msh", "plsdb_2023_11_03_v2.tsv"]
 
     f1: Path = db_dir / f"{mash_db_names[0]}"
     f2: Path = db_dir / f"{mash_db_names[1]}"
 
     if f1.exists() and f2.exists():
+        logger.info(f"PLSDB Database mash sketch at {f1} exists.")
+        logger.info(f"PLSDB Database tsv metadata file at {f2} exists.")
         logger.info(f"PLSDB Database at {db_dir} has already been downloaded")
     else:
         for file_name in mash_db_names:
@@ -48,10 +50,10 @@ def check_db_installation(db_dir: Path, install_flag: bool):
 
 def get_database_zenodo(db_dir: Path):
     logger.info("Downloading Plassembler Database.")
-    tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
+    tarball = "201123_plassembler_v1.5.0_databases.tar.gz"
     tar_path = Path(f"{db_dir}/{tarball}")
-    db_url = "https://zenodo.org/record/7499200/files/plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
-    requiredmd5 = "f5144045e6e5d0d5a6b7f78d0c08840d"
+    db_url = "https://zenodo.org/record/10158040/files/201123_plassembler_v1.5.0_databases.tar.gz"
+    requiredmd5 = "3a24bacc05bb857dc044fc6662b58db7"
 
     # remove the directory
     if os.path.exists(db_dir):
@@ -114,9 +116,7 @@ def untar(tarball_path: Path, output_path: Path):
             tar_file.extractall(path=str(output_path))
 
         # get untarred directory
-        untarpath = os.path.join(
-            output_path, "plsdb_110222_plassembler_v0.1.4_databases"
-        )
+        untarpath = os.path.join(output_path, "201123_plassembler_v1.5.0_databases")
 
         # Get a list of all files in the source directory
         files_to_move = [

diff --git a/src/plassembler/utils/external_tools.py b/src/plassembler/utils/external_tools.py
@@ -31,7 +31,10 @@ def __init__(
         logdir.mkdir(parents=True, exist_ok=True)
         command_hash = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
         tool_name = Path(tool).name
-        logfile_prefix: Path = logdir / f"{tool_name}_{command_hash}"
+        # to make sure no spaces or -
+        tool_name_with_underscores = tool_name.replace(" ", "_")
+        tool_name_with_underscores = tool_name_with_underscores.replace("-", "_")
+        logfile_prefix: Path = logdir / f"{tool_name_with_underscores}_{command_hash}"
         self.out_log = f"{logfile_prefix}.out"
         self.err_log = f"{logfile_prefix}.err"
         self.outfile = outfile
@@ -115,13 +118,21 @@ def run_tool(
                     "Dnaapler failed to reorient any putative plasmids to begin with repA."
                 )
                 logger.warning("Continuing with the un-reoriented contigs.")
-            elif tool.tool_str == "canu":  # for dnaapler errors
+            elif tool.tool_str == "canu":  # for canu errors
                 logger.warning(
-                    "canu failed to assemble anything from the unmapped reads."
+                    "Canu failed to assemble anything from the unmapped reads."
                 )
                 logger.warning(
                     f"If you think your sample should still have plasmids, please check stdout log file: {tool.out_log} and stderr log file: {tool.err_log}"
                 )
+            elif tool.tool_str == "canu -correct":  # for canu errors
+                logger.warning("Canu failed to correct any reads.")
+                logger.warning(
+                    "This probably means there is low depth, don't be too concerned."
+                )
+                logger.warning(
+                    f"If you are concerned, check stdout log file: {tool.out_log} and stderr log file: {tool.err_log}."
+                )
             else:
                 logger.warning(
                     f"Error calling {tool.command_as_str} (return code {error.returncode})"