Runtime errors and flake
gbouras13 committed Sep 11, 2023
1 parent 4244eb9 commit c31af52
Showing 7 changed files with 22 additions and 216 deletions.
6 changes: 3 additions & 3 deletions src/plassembler/__init__.py
@@ -406,7 +406,7 @@ def run(
logger.info(
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info(f"Copying files.")
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
@@ -1150,7 +1150,7 @@ def long(
logger.info(
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info(f"Copying files.")
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
@@ -1238,7 +1238,7 @@ def long(
canu_nano_or_pacbio,
total_flye_plasmid_length,
)
except:
except Exception:
logger.warning(
"canu failed to assemble anything from the unmapped reads. This likely means you have 0 plasmids in this sample."
)
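The two flake8 findings addressed in this file are F541 (an f-string with no placeholders, so the f prefix does nothing and is dropped) and E722 (a bare except:). A minimal sketch of why the handler is narrowed to except Exception: — the try_assembly wrapper below is hypothetical, not plassembler's code:

import subprocess


def try_assembly(cmd: list[str]) -> bool:
    # Hypothetical wrapper illustrating E722; not plassembler's control flow.
    try:
        subprocess.run(cmd, check=True)
        return True
    except Exception:  # handles CalledProcessError, FileNotFoundError, ...
        return False   # ...while KeyboardInterrupt and SystemExit still propagate

# A bare `except:` is equivalent to `except BaseException:`, so it would also
# swallow Ctrl-C (KeyboardInterrupt), which is why flake8 flags it.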
2 changes: 1 addition & 1 deletion src/plassembler/utils/db.py
@@ -49,7 +49,7 @@ def check_db_installation(db_dir: Path, install_flag: bool):
def get_database_zenodo(db_dir: Path):
logger.info("Downloading Plassembler Database.")
tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
tar_path = Path(f"{db_dir}/plsdb_110222_plassembler_v0.1.4_databases.tar.gz")
tar_path = Path(f"{db_dir}/{tarball}")
db_url = "https://zenodo.org/record/7499200/files/plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
requiredmd5 = "f5144045e6e5d0d5a6b7f78d0c08840d"

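The db.py edit simply reuses the tarball variable when building tar_path, so the archive filename is spelled out once. A minimal sketch of the pattern, with a hypothetical db_dir:

from pathlib import Path

db_dir = Path("plassembler_db")  # hypothetical database directory
tarball = "plsdb_110222_plassembler_v0.1.4_databases.tar.gz"
tar_path = Path(f"{db_dir}/{tarball}")  # the filename now lives in a single variable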
10 changes: 3 additions & 7 deletions src/plassembler/utils/plass_class.py
@@ -208,7 +208,6 @@ def identify_chromosome_process_flye(self, chromosome_len):
:return chromosome_flag: bool whether chromosome assembles
"""
outdir = self.outdir
long_only = self.long_only
info_file = os.path.join(outdir, "assembly_info.txt")
col_list = [
"seq_name",
@@ -304,7 +303,6 @@ def identify_chromosome_process_flye_long(self, chromosome_len):
:return chromosome_flag: bool whether chromosome assembles
"""
outdir = self.outdir
long_only = self.long_only
info_file = os.path.join(outdir, "assembly_info.txt")
col_list = [
"seq_name",
@@ -763,14 +761,12 @@ def finalise_contigs_long(self, prefix):
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
id = dna_record.id
l = len(dna_record.seq)
length = len(dna_record.seq)
copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
if "circular" in dna_record.description: # circular contigs from canu
desc = (
f"len={l} plasmid_copy_number_long={copy_number}x circular=True"
)
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
else:
desc = f"len={l} plasmid_copy_number_long={copy_number}x"
desc = f"len={length} plasmid_copy_number_long={copy_number}x"
i += 1
record = SeqRecord(dna_record.seq, id=id, description=desc)
SeqIO.write(record, dna_fa, "fasta")
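The plass_class.py edits delete locals that were assigned but never read (long_only) and rename the ambiguous single-letter l to length in the FASTA description strings — both typical flake8 findings (F841 and E741). A small sketch, assuming Biopython and made-up values, of the record header those description strings produce:

import sys

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

length = 4000        # made-up contig length
copy_number = 2.04   # made-up long-read copy number
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
record = SeqRecord(Seq("A" * length), id="1", description=desc)
SeqIO.write(record, sys.stdout, "fasta")
# header line written: >1 len=4000 plasmid_copy_number_long=2.04x circular=True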
189 changes: 0 additions & 189 deletions src/plassembler/utils/run_canu.py
@@ -1,11 +1,8 @@
import math
from collections import Counter
from itertools import product
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from loguru import logger

@@ -171,189 +168,3 @@ def trim_contigs(canu_filtered_fasta, outdir):
SeqIO.write(trimmed_records, output_handle, "fasta")

return output_filename


def make_blastdb(canu_output_dir, plasmid_fasta, logdir):
db: Path = Path(canu_output_dir) / "db"

makeblastdb = ExternalTool(
tool="makeblastdb",
input=f"-in {plasmid_fasta}",
output=f"-out {db}",
params="-dbtype nucl ",
logdir=logdir,
outfile="",
)

ExternalTool.run_tool(makeblastdb, to_stdout=False)


"""
some of this adapted from dnaapler
https://github.com/gbouras13/dnaapler
"""


def run_blast(canu_output_dir, plasmid_fasta, threads, logdir):
blast_output: Path = Path(canu_output_dir) / "blast_output.txt"
db: Path = Path(canu_output_dir) / "db"

blast = ExternalTool(
tool="blastn",
input=f"-query {plasmid_fasta}",
output=f"-out {blast_output}",
params=f'-db {db} -evalue 1e-05 -num_threads {threads} -outfmt " 6 qseqid qlen sseqid slen length qstart qend sstart send pident nident gaps mismatch evalue bitscore qseq sseq "',
logdir=logdir,
outfile="",
)
ExternalTool.run_tool(blast, to_stdout=False)


def process_blast_output(canu_output_dir, combined_plasmid_file, outdir):
"""Processes
:param input: input file
:param blast_file: blast output file
:param out_file: output file
:return: output_filename: Path of output dedeup Fasta
"""

blast_output_file: Path = Path(canu_output_dir) / "blast_output.txt"

# define colnames
col_list = [
"qseqid",
"qlen",
"sseqid",
"slen",
"length",
"qstart",
"qend",
"sstart",
"send",
"pident",
"nident",
"gaps",
"mismatch",
"evalue",
"bitscore",
"qseq",
"sseq",
]

# read in the dataframe from BLAST
try:
blast_df = pd.read_csv(
blast_output_file, delimiter="\t", index_col=False, names=col_list
)
except Exception:
logger.error("There was an issue with parsing the BLAST output file.")

# if the BLAST input is empty
if isinstance(blast_df, pd.DataFrame) and blast_df.empty:
logger.error("There were 0 BLAST hits. This must be a BLAST error.")

# keep only the columns where the same contig is blasted against each other

# Filter rows where qseqid is equal to sseqid - the ones fo BLASTing against themselves
blast_df = blast_df[blast_df["qseqid"] == blast_df["sseqid"]]

# read in the canu FASTA and save as a dictionary

fasta_dict = {}
i = 1
for record in SeqIO.parse(combined_plasmid_file, "fasta"):
fasta_dict[record.id] = {
"count": i,
"sequence": str(record.seq),
"dupe": False, # stores the dupe status
"start": 1, # stores the start
"end": len(record.seq), # stores the end
}
i += 1

for contig in fasta_dict.keys():
tmp_df = blast_df[blast_df["qseqid"] == contig]
# Sort by 'length' column in descending order
tmp_df_sorted = tmp_df.sort_values(by="length", ascending=False)
# get rid of the 100% match row
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["qlen"] != tmp_df_sorted["length"]]
# more than 99% identical
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["pident"] > 99]
# starts need to be < 100
tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["qstart"] < 100]
num_rows = tmp_df_sorted.shape[0]
if num_rows == 0: # where there is no dupe at all
fasta_dict[contig]["dupe"] = False
# exit
else:
# Get the first row
first_row = tmp_df_sorted.iloc[0]
# ensure the match is good
if (
first_row["length"] < 500
): # less than 500bp repeat in the top hit - probably Insertion Seq not a real plasmid dupe - otherwise probably legit
fasta_dict[contig]["dupe"] = False
else:
try:
# the repeat will be in the longest hit with qstart < 100 (usually 1 or very close to it)
# heuristic i need to check i guess
# just take until the next repeat element
# this has been filtered for prior
best_row = tmp_df_sorted.iloc[0]
fasta_dict[contig]["dupe"] = True
fasta_dict[contig]["start"] = best_row["qstart"]
fasta_dict[contig]["end"] = best_row["sstart"]

# if the query end is larger than the sstart - there is an overlap
# take 1 as the start and then the sstart as the end
# otherwise check for concatenation (within 1000bp))
# otherwise exit just the whole plasmid
# if best_row["qend"] > best_row["sstart"]:
# fasta_dict[contig]["dupe"] = True
# fasta_dict[contig]["start"] = best_row["qstart"]
# fasta_dict[contig]["end"] = best_row["sstart"]
# #elif (best_row["qend"] + 1000) > best_row[
# #"sstart"
# #]: # the longest match is likely to be a duplication
# else:
# fasta_dict[contig]["dupe"] = True
# fasta_dict[contig]["start"] = best_row["qstart"]
# fasta_dict[contig]["end"] = best_row["sstart"]
# else:
# fasta_dict[contig]["dupe"] = False
except Exception:
logger.error("Flye not found. Please reinstall Plassembler.")

# Create a list of SeqRecord objects
records = []
for entry_id, entry_data in fasta_dict.items():
subsequence = entry_data["sequence"][
entry_data["start"] - 1 : entry_data["end"]
]
count = str(entry_data["count"])
l = entry_data["end"]

record = SeqRecord(
seq=Seq(subsequence), id=count, description=f"{count} len={l}"
)
records.append(record)

# Write the records to a FASTA file
output_filename: Path = Path(outdir) / "combined_plasmids_dedup.fasta"
with open(output_filename, "w") as output_handle:
SeqIO.write(records, output_handle, "fasta")

return output_filename


# then BLAST output
# # then figure out for overlaps
# parse all blast hits as a dictionary
# need more than 1 hit (itself)
# if the blast hit is more than 50% and less than 90 % of contig length, take as duplicate
# elif the blast hit is more than 2000bp (lower could be e.g. IS element) and far away (more than 50% length away)
# then assume partial duplication too
# get all dupe regions this way
2 changes: 1 addition & 1 deletion src/plassembler/utils/run_dnaapler.py
@@ -32,6 +32,6 @@ def run_dnaapler(threads, plasmid_fasta, logdir, outdir):
)
return plasmids_for_sketching
except Exception:
logger.warning(f"Dnaapler failed to reorient any plasmids.")
logger.warning("Dnaapler failed to reorient any plasmids.")
plasmids_for_sketching = plasmid_fasta
return plasmids_for_sketching
28 changes: 14 additions & 14 deletions tests/test_end_to_end.py
@@ -72,7 +72,7 @@ def test_citation():


# test running end to end
#### for cases 1,2,3
# for cases 1,2,3
# uncomment for mac running to check
# 70kbp, 44kbp and 9kbp plasmid reads are from
# the 70kbp is a fake chromosome
@@ -91,7 +91,7 @@ def test_plassembler_case_1(self):
remove_directory(outdir)

def test_plassembler_case_2(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 2 no chromosome assembled"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
@@ -114,11 +114,11 @@ def test_plassembler_case_3(self):
remove_directory(outdir)

def test_plassembler_case_4(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/aabaumanii_reads_R2.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
chromosome = 100000
outdir: Path = f"{end_to_end}/test_out"
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f"
@@ -128,7 +128,7 @@ def test_plassembler_case_4(self):
# skipqc

def test_plassembler_skipqc(self):
with self.assertRaises(ValueError):
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
@@ -141,8 +141,8 @@ def test_plassembler_skipqc(self):

# flye_dir
def test_plassembler_flye_dir(self):
with self.assertRaises(ValueError):
"""test plassembler run case 4. With flye directory."""
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. With flye directory. Should fail out (Saves time)."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
@@ -157,7 +157,7 @@ def test_plassembler_flye_dir(self):
long
"""

def test_plassembler_long():
def test_plassembler_long(self):
"""test plassembler long"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
chromosome = 50000
@@ -166,8 +166,8 @@ def test_plassembler_long():
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_long_no_chrom():
with self.assertRaises(ValueError):
def test_plassembler_long_no_chrom(self):
with self.assertRaises(RuntimeError):
"""test plassembler long - no chromosome recovered"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
chromosome = 500000
@@ -176,8 +176,8 @@ def test_plassembler_long_no_chrom():
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_long_no_plasmids():
"""test plassembler long - no plasmids"""
def test_plassembler_long_no_plasmids(self):
"""test plassembler long - no plasmids recovered at all"""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
chromosome = 50000
outdir: Path = f"{end_to_end}/test_out"
@@ -186,10 +186,10 @@ def test_plassembler_long_no_plasmids():
remove_directory(outdir)

"""
assembled
assembled
"""

def test_plassembler_assembled():
def test_plassembler_assembled(self):
"""test plassembler assembled"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
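In the test suite, the long-mode and assembled-mode tests gain the self parameter they were missing, the expected exception in assertRaises changes from ValueError to RuntimeError, and a doubled letter in one fixture filename (aabaumanii → abaumanii) is corrected. A standalone illustration (not the plassembler tests) of why the missing self matters inside a unittest.TestCase:

import unittest


class Demo(unittest.TestCase):
    def test_ok(self):  # bound method: has access to self.assertRaises
        with self.assertRaises(RuntimeError):
            raise RuntimeError("boom")

    # def test_broken():   # without `self`, the runner's call fails with
    #     ...              # TypeError: test_broken() takes 0 positional arguments


if __name__ == "__main__":
    unittest.main()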
1 change: 0 additions & 1 deletion tests/test_external_commands.py
@@ -96,7 +96,6 @@ class test_sam_to_fastq(unittest.TestCase):
# sam to bam
def test_extract_long_fastqs_slow_keep_fastqs(self):
expected_return = True
threads = 1
samfile: Path = Path(f"{map_dir}/long_read.sam")
# not in the dir to prevent overwriting
plasmidfastq: Path = Path(f"{map_dir}/sam_to_bam/plasmid_long.fastq")
