Runtime errors and flake
gbouras13 committed Sep 11, 2023
1 parent 4244eb9 commit c31af52
Showing 7 changed files with 22 additions and 216 deletions.
6 changes: 3 additions & 3 deletions src/plassembler/__init__.py
@@ -406,7 +406,7 @@ def run(
logger.info(
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info(f"Copying files.")
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
@@ -1150,7 +1150,7 @@ def long(
logger.info(
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info(f"Copying files.")
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
@@ -1238,7 +1238,7 @@ def long(
canu_nano_or_pacbio,
total_flye_plasmid_length,
)
except:
except Exception:
logger.warning(
"canu failed to assemble anything from the unmapped reads. This likely means you have 0 plasmids in this sample."
)
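The two flake8 findings addressed in this file are F541 (an f-string with no placeholders, so the f prefix does nothing and is dropped) and E722 (a bare except:). A minimal sketch of why the handler is narrowed to except Exception: — the try_assembly wrapper below is hypothetical, not plassembler's code:

import subprocess


def try_assembly(cmd: list[str]) -> bool:
    # Hypothetical wrapper illustrating E722; not plassembler's control flow.
    try:
        subprocess.run(cmd, check=True)
        return True
    except Exception:  # handles CalledProcessError, FileNotFoundError, ...
        return False   # ...while KeyboardInterrupt and SystemExit still propagate

# A bare `except:` is equivalent to `except BaseException:`, so it would also
# swallow Ctrl-C (KeyboardInterrupt), which is why flake8 flags it.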
2 changes: 1 addition & 1 deletion src/plassembler/utils/db.py
@@ -49,7 +49,7 @@ def check_db_installation(db_dir: Path, install_flag: bool):
def get_database_zenodo(db_dir: Path):
logger.info("Downloading Plassembler Database.")
tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
tar_path = Path(f"{db_dir}/plsdb_110222_plassembler_v0.1.4_databases.tar.gz")
tar_path = Path(f"{db_dir}/{tarball}")
db_url = "https://zenodo.org/record/7499200/files/plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
requiredmd5 = "f5144045e6e5d0d5a6b7f78d0c08840d"

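The db.py edit simply reuses the tarball variable when building tar_path, so the archive filename is spelled out once. A minimal sketch of the pattern, with a hypothetical db_dir:

from pathlib import Path

db_dir = Path("plassembler_db")  # hypothetical database directory
tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
tar_path = Path(f"{db_dir}/{tarball}")  # the filename now lives in a single variable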
10 changes: 3 additions & 7 deletions src/plassembler/utils/plass_class.py
@@ -208,7 +208,6 @@ def identify_chromosome_process_flye(self, chromosome_len):
:return chromosome_flag: bool whether chromosome assembles
"""
outdir = self.outdir
long_only = self.long_only
info_file = os.path.join(outdir, "assembly_info.txt")
col_list = [
"seq_name",
@@ -304,7 +303,6 @@ def identify_chromosome_process_flye_long(self, chromosome_len):
:return chromosome_flag: bool whether chromosome assembles
"""
outdir = self.outdir
long_only = self.long_only
info_file = os.path.join(outdir, "assembly_info.txt")
col_list = [
"seq_name",
@@ -763,14 +761,12 @@ def finalise_contigs_long(self, prefix):
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
id = dna_record.id
l = len(dna_record.seq)
length = len(dna_record.seq)
copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
if "circular" in dna_record.description: # circular contigs from canu
desc = (
f"len={l} plasmid_copy_number_long={copy_number}x circular=True"
)
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
else:
desc = f"len={l} plasmid_copy_number_long={copy_number}x"
desc = f"len={length} plasmid_copy_number_long={copy_number}x"
i += 1
record = SeqRecord(dna_record.seq, id=id, description=desc)
SeqIO.write(record, dna_fa, "fasta")
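The plass_class.py edits delete locals that were assigned but never read (long_only) and rename the ambiguous single-letter l to length in the FASTA description strings — both typical flake8 findings (F841 and E741). A small sketch, assuming Biopython and made-up values, of the record header those description strings produce:

import sys

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

length = 4000        # made-up contig length
copy_number = 2.04   # made-up long-read copy number
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
record = SeqRecord(Seq("A" * length), id="1", description=desc)
SeqIO.write(record, sys.stdout, "fasta")
# header line written: >1 len=4000 plasmid_copy_number_long=2.04x circular=True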
189 changes: 0 additions & 189 deletions src/plassembler/utils/run_canu.py
@@ -1,11 +1,8 @@
import math
from collections import Counter
from itertools import product
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from loguru import logger

@@ -171,189 +168,3 @@ def trim_contigs(canu_filtered_fasta, outdir):
SeqIO.write(trimmed_records, output_handle, "fasta")

return output_filename


def make_blastdb(canu_output_dir, plasmid_fasta, logdir):
db: Path = Path(canu_output_dir) / "db"

makeblastdb = ExternalTool(
tool="makeblastdb",
input=f"-in {plasmid_fasta}",
output=f"-out {db}",
params="-dbtype nucl ",
logdir=logdir,
outfile="",
)

ExternalTool.run_tool(makeblastdb, to_stdout=False)


"""
some of this adapted from dnaapler
https://github.com/gbouras13/dnaapler
"""


def run_blast(canu_output_dir, plasmid_fasta, threads, logdir):
blast_output: Path = Path(canu_output_dir) / "blast_output.txt"
db: Path = Path(canu_output_dir) / "db"

blast = ExternalTool(
tool="blastn",
input=f"-query {plasmid_fasta}",
output=f"-out {blast_output}",
params=f'-db {db} -evalue 1e-05 -num_threads {threads} -outfmt " 6 qseqid qlen sseqid slen length qstart qend sstart send pident nident gaps mismatch evalue bitscore qseq sseq "',
logdir=logdir,
outfile="",
)
ExternalTool.run_tool(blast, to_stdout=False)


def process_blast_output(canu_output_dir, combined_plasmid_file, outdir):
"""Processes
:param input: input file
:param blast_file: blast output file
:param out_file: output file
:return: output_filename: Path of output dedeup Fasta
"""

blast_output_file: Path = Path(canu_output_dir) / "blast_output.txt"

# define colnames
col_list = [
"qseqid",
"qlen",
"sseqid",
"slen",
"length",
"qstart",
"qend",
"sstart",
"send",
"pident",
"nident",
"gaps",
"mismatch",
"evalue",
"bitscore",
"qseq",
"sseq",
]

# read in the dataframe from BLAST
try:
blast_df = pd.read_csv(
blast_output_file, delimiter="\t", index_col=False, names=col_list
)
except Exception:
logger.error("There was an issue with parsing the BLAST output file.")

# if the BLAST input is empty
if isinstance(blast_df, pd.DataFrame) and blast_df.empty:
logger.error("There were 0 BLAST hits. This must be a BLAST error.")

# keep only the columns where the same contig is blasted against each other

# Filter rows where qseqid is equal to sseqid - the ones fo BLASTing against themselves
blast_df = blast_df[blast_df["qseqid"] == blast_df["sseqid"]]

# read in the canu FASTA and save as a dictionary

fasta_dict = {}
i = 1
for record in SeqIO.parse(combined_plasmid_file, "fasta"):
fasta_dict[record.id] = {
"count": i,
"sequence": str(record.seq),
"dupe": False, # stores the dupe status
"start": 1, # stores the start
"end": len(record.seq), # stores the end
}
i += 1

for contig in fasta_dict.keys():
tmp_df = blast_df[blast_df["qseqid"] == contig]
# Sort by 'length' column in descending order
tmp_df_sorted = tmp_df.sort_values(by="length", ascending=False)
# get rid of the 100% match row
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["qlen"] != tmp_df_sorted["length"]]
# more than 99% identical
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["pident"] > 99]
# starts need to be < 100
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["qstart"] < 100]
num_rows = tmp_df_sorted.shape[0]
if num_rows == 0: # where there is no dupe at all
fasta_dict[contig]["dupe"] = False
# exit
else:
# Get the first row
first_row = tmp_df_sorted.iloc[0]
# ensure the match is good
if (
first_row["length"] < 500
): # less than 500bp repeat in the top hit - probably Insertion Seq not a real plasmid dupe - otherwise probably legit
fasta_dict[contig]["dupe"] = False
else:
try:
# the repeat will be in the longest hit with qstart < 100 (usually 1 or very close to it)
# heuristic i need to check i guess
# just take until the next repeat element
# this has been filtered for prior
best_row = tmp_df_sorted.iloc[0]
fasta_dict[contig]["dupe"] = True
fasta_dict[contig]["start"] = best_row["qstart"]
fasta_dict[contig]["end"] = best_row["sstart"]

# if the query end is larger than the sstart - there is an overlap
# take 1 as the start and then the sstart as the end
# otherwise check for concatenation (within 1000bp))
# otherwise exit just the whole plasmid
# if best_row["qend"] > best_row["sstart"]:
# fasta_dict[contig]["dupe"] = True
# fasta_dict[contig]["start"] = best_row["qstart"]
# fasta_dict[contig]["end"] = best_row["sstart"]
# #elif (best_row["qend"] + 1000) > best_row[
# #"sstart"
# #]: # the longest match is likely to be a duplication
# else:
# fasta_dict[contig]["dupe"] = True
# fasta_dict[contig]["start"] = best_row["qstart"]
# fasta_dict[contig]["end"] = best_row["sstart"]
# else:
# fasta_dict[contig]["dupe"] = False
except Exception:
logger.error("Flye not found. Please reinstall Plassembler.")

# Create a list of SeqRecord objects
records = []
for entry_id, entry_data in fasta_dict.items():
subsequence = entry_data["sequence"][
entry_data["start"] - 1 : entry_data["end"]
]
count = str(entry_data["count"])
l = entry_data["end"]

record = SeqRecord(
seq=Seq(subsequence), id=count, description=f"{count} len={l}"
)
records.append(record)

# Write the records to a FASTA file
output_filename: Path = Path(outdir) / "combined_plasmids_dedup.fasta"
with open(output_filename, "w") as output_handle:
SeqIO.write(records, output_handle, "fasta")

return output_filename


# then BLAST output
# # then figure out for overlaps
# parse all blast hits as a dictionary
# need more than 1 hit (itself)
# if the blast hit is more than 50% and less than 90 % of contig length, take as duplicate
# elif the blast hit is more than 2000bp (lower could be e.g. IS element) and far away (more than 50% length away)
# then assume partial duplication too
# get all dupe regions this way
2 changes: 1 addition & 1 deletion src/plassembler/utils/run_dnaapler.py
@@ -32,6 +32,6 @@ def run_dnaapler(threads, plasmid_fasta, logdir, outdir):
)
return plasmids_for_sketching
except Exception:
logger.warning(f"Dnaapler failed to reorient any plasmids.")
logger.warning("Dnaapler failed to reorient any plasmids.")
plasmids_for_sketching = plasmid_fasta
return plasmids_for_sketching
28 changes: 14 additions & 14 deletions tests/test_end_to_end.py
@@ -72,7 +72,7 @@ def test_citation():


# test running end to end
#### for cases 1,2,3
# for cases 1,2,3
# uncomment for mac running to check
# 70kbp, 44kbp and 9kbp plasmid reads are from
# the 70kbp is a fake chromosome
@@ -91,7 +91,7 @@ def test_plassembler_case_1(self):
remove_directory(outdir)

def test_plassembler_case_2(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 2 no chromosome assembled"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
@@ -114,11 +114,11 @@ def test_plassembler_case_3(self):
remove_directory(outdir)

def test_plassembler_case_4(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/aabaumanii_reads_R2.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
chromosome = 100000
outdir: Path = f"{end_to_end}/test_out"
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f"
@@ -128,7 +128,7 @@ def test_plassembler_case_4(self):
# skipqc

def test_plassembler_skipqc(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
@@ -141,8 +141,8 @@ def test_plassembler_skipqc(self):

# flye_dir
def test_plassembler_flye_dir(self):
with self.assertRaises(ValueError):
"""test plassembler run case 4. With flye directory."""
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. With flye directory. Should fail out (Saves time)."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
@@ -157,7 +157,7 @@ def test_plassembler_flye_dir(self):
long
"""

def test_plassembler_long():
def test_plassembler_long(self):
"""test plassembler long"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
chromosome = 50000
@@ -166,8 +166,8 @@ def test_plassembler_long():
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_long_no_chrom():
with self.assertRaises(ValueError):
def test_plassembler_long_no_chrom(self):
with self.assertRaises(RuntimeError):
"""test plassembler long - no chromosome recovered"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
chromosome = 500000
@@ -176,8 +176,8 @@ def test_plassembler_long_no_chrom():
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_long_no_plasmids():
"""test plassembler long - no plasmids"""
def test_plassembler_long_no_plasmids(self):
"""test plassembler long - no plasmids recovered at all"""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
chromosome = 50000
outdir: Path = f"{end_to_end}/test_out"
@@ -186,10 +186,10 @@ def test_plassembler_long_no_plasmids():
remove_directory(outdir)

"""
assembled
assembled
"""

def test_plassembler_assembled():
def test_plassembler_assembled(self):
"""test plassembler assembled"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
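In the test suite, the long-mode and assembled-mode tests gain the self parameter they were missing, the expected exception in assertRaises changes from ValueError to RuntimeError, and a doubled letter in one fixture filename (aabaumanii → abaumanii) is corrected. A standalone illustration (not the plassembler tests) of why the missing self matters inside a unittest.TestCase:

import unittest


class Demo(unittest.TestCase):
    def test_ok(self):  # bound method: has access to self.assertRaises
        with self.assertRaises(RuntimeError):
            raise RuntimeError("boom")

    # def test_broken():   # without `self`, the runner's call fails with
    #     ...              # TypeError: test_broken() takes 0 positional arguments


if __name__ == "__main__":
    unittest.main()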
1 change: 0 additions & 1 deletion tests/test_external_commands.py
@@ -96,7 +96,6 @@ class test_sam_to_fastq(unittest.TestCase):
# sam to bam
def test_extract_long_fastqs_slow_keep_fastqs(self):
expected_return = True
threads = 1
samfile: Path = Path(f"{map_dir}/long_read.sam")
# not in the dir to prevent overwriting
plasmidfastq: Path = Path(f"{map_dir}/sam_to_bam/plasmid_long.fastq")
