diff --git a/src/plassembler/__init__.py b/src/plassembler/__init__.py index 45d6a96..101dc32 100644 --- a/src/plassembler/__init__.py +++ b/src/plassembler/__init__.py @@ -38,6 +38,7 @@ run_canu, run_canu_correct, trim_contigs, + filter_entropy_fastqs ) from plassembler.utils.run_dnaapler import run_dnaapler from plassembler.utils.run_mash import mash_sketch, run_mash @@ -440,29 +441,37 @@ def run( skip_assembly = validate_flye_assembly_info(flye_assembly, flye_info) if skip_assembly is False: - if use_raven is True: + logger.info("Running Flye.") + run_flye(outdir, threads, raw_flag, pacbio_model, logdir) + else: + if flye_directory != "nothing": logger.info( - "You have specified --use_raven. Using Raven for long read assembly." + f"You have specified a {flye_directory} with an existing flye assembly." + ) + logger.info("Copying files.") + # copies the files to the outdir + shutil.copy2( + os.path.join(flye_directory, "assembly_info.txt"), + os.path.join(outdir, "assembly_info.txt"), + ) + shutil.copy2( + os.path.join(flye_directory, "assembly.fasta"), + os.path.join(outdir, "assembly.fasta"), ) - logger.info("Running Raven.") - run_raven(outdir, threads, logdir) else: - logger.info("Running Flye.") - run_flye(outdir, threads, raw_flag, pacbio_model, logdir) - else: - logger.info( - f"You have specified a {flye_directory} with an existing flye assembly." - ) - logger.info("Copying files.") - # copies the files to the outdir - shutil.copy2( - os.path.join(flye_directory, "assembly_info.txt"), - os.path.join(outdir, "assembly_info.txt"), - ) - shutil.copy2( - os.path.join(flye_directory, "assembly.fasta"), - os.path.join(outdir, "assembly.fasta"), - ) + logger.info( + f"You have specified a {flye_assembly} and {flye_info} from an existing flye assembly." 
+ ) + shutil.copy2( + flye_assembly, + os.path.join(outdir, "assembly.fasta"), + ) + shutil.copy2( + flye_info, + os.path.join(outdir, "assembly_info.txt"), + ) + + # instanatiate the class with some of the commands plass = Plass() plass.outdir = outdir @@ -1321,10 +1330,7 @@ def long( corrected_error_rate = 0.045 else: canu_nano_or_pacbio = "nanopore" - if raw_flag is True: # ont raw - corrected_error_rate = 0.12 - else: - corrected_error_rate = 0.03 + corrected_error_rate = 0.12 else: # if none by default try: float(corrected_error_rate) > 0 @@ -1388,11 +1394,23 @@ def long( canu_output_dir: Path = Path(outdir) / "canu" + logger.info( + "Removing junk low entropy reads." + ) + + # filter entropy of fastq (to remove rubbish repeats) so they don't survive the correction step + entropy_filtered_fastq = Path(outdir) / "plasmid_long_entropy_filtered.fastq" + filter_entropy_fastqs(plasmidfastqs, entropy_filtered_fastq) + + logger.info( + "Correcting reads with canu prior to running Unicycler." + ) + try: run_canu_correct( threads, logdir, - plasmidfastqs, + entropy_filtered_fastq, canu_output_dir, canu_nano_or_pacbio, total_flye_plasmid_length, @@ -1410,7 +1428,7 @@ def long( logger.warning( "canu correct failed to correct any reads. Advancing with uncorrected reads" ) - corrected_fastqs = plasmidfastqs + corrected_fastqs = entropy_filtered_fastq unicycler_dir: Path = Path(outdir) / "unicycler_output" run_unicycler_long(threads, logdir, corrected_fastqs, unicycler_dir) diff --git a/src/plassembler/utils/run_canu.py b/src/plassembler/utils/run_canu.py index 523fe9d..33d25b5 100644 --- a/src/plassembler/utils/run_canu.py +++ b/src/plassembler/utils/run_canu.py @@ -151,6 +151,28 @@ def filter_entropy(canu_fasta, outdir): return output_filename +def filter_entropy_fastqs(fastq: Path, output_filename: Path) -> None: + """Filter FASTQ records based on entropy and write the filtered records to a new FASTQ file. 
+ + :param fastq: Input FASTQ file containing long read sequences. + :param output_filename: Path of the filtered FASTQ file to write. + :return: None. The filtered records are written to output_filename. + """ + + filtered_records = [] + + for record in SeqIO.parse(fastq, "fastq"): + entropy = shannon_entropy_5mers(str(record.seq)) + if entropy > 5: + # Keep reads above the entropy threshold (low entropy reads are rejected) - adjust the threshold as needed. + filtered_records.append(record) + + # Write the filtered records to a new FASTQ file. + with open(output_filename, "w") as output_handle: + SeqIO.write(filtered_records, output_handle, "fastq") + + + """ trim contigs diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index 4071888..48e98ee 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -153,17 +153,16 @@ def test_plassembler_flye_assembly_info(self): # flye_dir def test_plassembler_flye_dir(self): - with self.assertRaises(RuntimeError): - """test plassembler run case 4. With flye directory. Should error out (Saves time).""" - longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz" - s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz" - s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz" - flye_dir: Path = f"{end_to_end}/abaumanii_plasmid_flye_output" - chromosome = 1000000 - outdir: Path = f"{end_to_end}/test_out" - cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 --flye_info {flye_info} -f" - exec_command(cmd) - remove_directory(outdir) + """test plassembler run case 4. 
With flye directory.""" + longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz" + s1: Path = f"{end_to_end}/abaumanii_reads_R1.fastq.gz" + s2: Path = f"{end_to_end}/abaumanii_reads_R2.fastq.gz" + flye_dir: Path = f"{end_to_end}/abaumanii_plasmid_flye_output" + chromosome = 1000000 + outdir: Path = f"{end_to_end}/test_out" + cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 --flye_dir {flye_dir} -f" + exec_command(cmd) + remove_directory(outdir) def test_plassembler_flye_assembly_info_missing(self): with self.assertRaises(RuntimeError): diff --git a/tests/test_plass_class.py b/tests/test_plass_class.py index 3b6bab5..6430b69 100644 --- a/tests/test_plass_class.py +++ b/tests/test_plass_class.py @@ -155,7 +155,7 @@ def test_combine_tsv(self): plasmid_fasta = Path(f"{assembly_depth_dir}/plasmids.fasta") assembly.get_depth(logdir, pacbio_model, threads) assembly.process_mash_tsv(plassembler_db_dir, plasmid_fasta) - assembly.combine_depth_mash_tsvs(prefix) + assembly.combine_depth_mash_tsvs(prefix, False) remove_file(Path(f"{assembly_depth_dir}/combined_long.sam")) remove_file(Path(f"{assembly_depth_dir}/combined_short.bam")) remove_file(Path(f"{assembly_depth_dir}/combined_sorted_long.bam")) diff --git a/tests/test_plassembler.py b/tests/test_plassembler.py index f832376..efdaa0e 100644 --- a/tests/test_plassembler.py +++ b/tests/test_plassembler.py @@ -125,16 +125,18 @@ def test_fasta_input(self): # assembled multifasta chrom def test_validate_fastas_assembled_mode_multiFASTA(self): with self.assertRaises(SystemExit): - input_plasmids = os.path.join(val_data, "test.fasta") - input_chromosome = os.path.join(val_data, "test_multi.fasta") - validate_fastas_assembled_mode(input_chromosome, input_plasmids) + input_plasmids: Path = Path(val_data) / "test.fasta" + input_chromosome: Path = Path(val_data) / "test_multi.fasta" + no_copy_numbers = False + validate_fastas_assembled_mode(input_chromosome, 
input_plasmids, no_copy_numbers) # assembled single chrom works fine def test_validate_fastas_assembled_mode_single(self): - input_plasmids = os.path.join(val_data, "test.fasta") - input_chromosome = os.path.join(val_data, "test.fasta") + input_chromosome: Path = Path(val_data) / "test.fasta" + input_plasmids: Path = Path(val_data) / "test.fasta" + no_copy_numbers = False tmp = 1 - validate_fastas_assembled_mode(input_chromosome, input_plasmids) + validate_fastas_assembled_mode(input_chromosome, input_plasmids, no_copy_numbers) self.assertEqual(tmp, 1) # no gzip