Skip to content

Commit

Permalink
fix tests and filter low entropy fastqs
Browse files Browse the repository at this point in the history
  • Loading branch information
gbouras13 committed Oct 22, 2023
1 parent 2500250 commit f88571c
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 44 deletions.
70 changes: 44 additions & 26 deletions src/plassembler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
run_canu,
run_canu_correct,
trim_contigs,
filter_entropy_fastqs
)
from plassembler.utils.run_dnaapler import run_dnaapler
from plassembler.utils.run_mash import mash_sketch, run_mash
Expand Down Expand Up @@ -440,29 +441,37 @@ def run(
skip_assembly = validate_flye_assembly_info(flye_assembly, flye_info)

if skip_assembly is False:
if use_raven is True:
logger.info("Running Flye.")
run_flye(outdir, threads, raw_flag, pacbio_model, logdir)
else:
if flye_directory != "nothing":
logger.info(
"You have specified --use_raven. Using Raven for long read assembly."
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
os.path.join(outdir, "assembly_info.txt"),
)
shutil.copy2(
os.path.join(flye_directory, "assembly.fasta"),
os.path.join(outdir, "assembly.fasta"),
)
logger.info("Running Raven.")
run_raven(outdir, threads, logdir)
else:
logger.info("Running Flye.")
run_flye(outdir, threads, raw_flag, pacbio_model, logdir)
else:
logger.info(
f"You have specified a {flye_directory} with an existing flye assembly."
)
logger.info("Copying files.")
# copies the files to the outdir
shutil.copy2(
os.path.join(flye_directory, "assembly_info.txt"),
os.path.join(outdir, "assembly_info.txt"),
)
shutil.copy2(
os.path.join(flye_directory, "assembly.fasta"),
os.path.join(outdir, "assembly.fasta"),
)
logger.info(
f"You have specified a {flye_assembly} and {flye_info} from an existing flye assembly."
)
shutil.copy2(
flye_assembly,
os.path.join(outdir, "assembly.fasta"),
)
shutil.copy2(
flye_info,
os.path.join(outdir, "assembly_info.txt"),
)


# instantiate the class with some of the commands
plass = Plass()
plass.outdir = outdir
Expand Down Expand Up @@ -1321,10 +1330,7 @@ def long(
corrected_error_rate = 0.045
else:
canu_nano_or_pacbio = "nanopore"
if raw_flag is True: # ont raw
corrected_error_rate = 0.12
else:
corrected_error_rate = 0.03
corrected_error_rate = 0.12
else: # if none by default
try:
float(corrected_error_rate) > 0
Expand Down Expand Up @@ -1388,11 +1394,23 @@ def long(

canu_output_dir: Path = Path(outdir) / "canu"

logger.info(
"Removing junk low entropy reads."
)

# filter entropy of fastq (to remove rubbish repeats) so they don't survive the correction step
entropy_filtered_fastq = Path(outdir) / "plasmid_long_entropy_filtered.fastq"
filter_entropy_fastqs(plasmidfastqs, entropy_filtered_fastq)

logger.info(
"Correcting reads with canu prior to running Unicycler."
)

try:
run_canu_correct(
threads,
logdir,
plasmidfastqs,
entropy_filtered_fastq,
canu_output_dir,
canu_nano_or_pacbio,
total_flye_plasmid_length,
Expand All @@ -1410,7 +1428,7 @@ def long(
logger.warning(
"canu correct failed to correct any reads. Advancing with uncorrected reads"
)
corrected_fastqs = plasmidfastqs
corrected_fastqs = entropy_filtered_fastq

unicycler_dir: Path = Path(outdir) / "unicycler_output"
run_unicycler_long(threads, logdir, corrected_fastqs, unicycler_dir)
Expand Down
22 changes: 22 additions & 0 deletions src/plassembler/utils/run_canu.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,28 @@ def filter_entropy(canu_fasta, outdir):

return output_filename

def filter_entropy_fastqs(
    fastq: Path, output_filename: Path, *, min_entropy: float = 5
) -> None:
    """Copy the sufficiently high entropy reads of a FASTQ file to a new FASTQ file.

    Each record's sequence is scored with ``shannon_entropy_5mers``; records
    scoring at or below ``min_entropy`` are dropped, all others are written
    to ``output_filename`` in their original order.

    :param fastq: Input FASTQ file containing long read sequences.
    :param output_filename: Path of the filtered FASTQ file to write
        (overwritten if it already exists).
    :param min_entropy: Exclusive lower bound on the 5-mer entropy a record
        must have to be kept. Defaults to 5.
    :return: None. The filtered reads are written to ``output_filename``.
    """

    # Lazily yield only the records to keep, so the whole read set is never
    # held in memory at once (SeqIO.write accepts any iterable of records).
    high_entropy_records = (
        record
        for record in SeqIO.parse(fastq, "fastq")
        if shannon_entropy_5mers(str(record.seq)) > min_entropy
    )

    # Write the kept records to the new FASTQ file as they are parsed.
    with open(output_filename, "w") as output_handle:
        SeqIO.write(high_entropy_records, output_handle, "fastq")




"""
trim contigs
Expand Down
21 changes: 10 additions & 11 deletions tests/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,16 @@ def test_plassembler_flye_assembly_info(self):

# flye_dir
def test_plassembler_flye_dir(self):
with self.assertRaises(RuntimeError):
"""test plassembler run case 4. With flye directory. Should error out (Saves time)."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
flye_dir: Path = f"{end_to_end}/abaumanii_plasmid_flye_output"
chromosome = 1000000
outdir: Path = f"{end_to_end}/test_out"
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 --flye_info {flye_info} -f"
exec_command(cmd)
remove_directory(outdir)
"""test plassembler run case 4. With flye directory."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz"
s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz"
flye_dir: Path = f"{end_to_end}/abaumanii_plasmid_flye_output"
chromosome = 1000000
outdir: Path = f"{end_to_end}/test_out"
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 --flye_dir {flye_dir} -f"
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_flye_assembly_info_missing(self):
with self.assertRaises(RuntimeError):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_plass_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def test_combine_tsv(self):
plasmid_fasta = Path(f"{assembly_depth_dir}/plasmids.fasta")
assembly.get_depth(logdir, pacbio_model, threads)
assembly.process_mash_tsv(plassembler_db_dir, plasmid_fasta)
assembly.combine_depth_mash_tsvs(prefix)
assembly.combine_depth_mash_tsvs(prefix, False)
remove_file(Path(f"{assembly_depth_dir}/combined_long.sam"))
remove_file(Path(f"{assembly_depth_dir}/combined_short.bam"))
remove_file(Path(f"{assembly_depth_dir}/combined_sorted_long.bam"))
Expand Down
14 changes: 8 additions & 6 deletions tests/test_plassembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,18 @@ def test_fasta_input(self):
# assembled multifasta chrom
def test_validate_fastas_assembled_mode_multiFASTA(self):
with self.assertRaises(SystemExit):
input_plasmids = os.path.join(val_data, "test.fasta")
input_chromosome = os.path.join(val_data, "test_multi.fasta")
validate_fastas_assembled_mode(input_chromosome, input_plasmids)
input_plasmids: Path = Path(val_data) / "test.fasta"
input_chromosome: Path = Path(val_data) / "test_multi.fasta"
no_copy_numbers = False
validate_fastas_assembled_mode(input_chromosome, input_plasmids, no_copy_numbers)

# assembled single chrom works fine
def test_validate_fastas_assembled_mode_single(self):
input_plasmids = os.path.join(val_data, "test.fasta")
input_chromosome = os.path.join(val_data, "test.fasta")
input_chromosome: Path = Path(val_data) / "test.fasta"
input_plasmids: Path = Path(val_data) / "test.fasta"
no_copy_numbers = False
tmp = 1
validate_fastas_assembled_mode(input_chromosome, input_plasmids)
validate_fastas_assembled_mode(input_chromosome, input_plasmids, no_copy_numbers)
self.assertEqual(tmp, 1)

# no gzip
Expand Down

0 comments on commit f88571c

Please sign in to comment.