Skip to content

Commit

Permalink
add canu for long only
Browse files Browse the repository at this point in the history
  • Loading branch information
gbouras13 committed Sep 5, 2023
1 parent ca685ac commit 34540bc
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 205 deletions.
4 changes: 3 additions & 1 deletion build/environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ dependencies:
- chopper >=0.5.0
- mash >=2.2
- raven-assembler >=1.8
- samtools >= 0.15.0
- samtools >=0.15.0
- canu >=2.2
- blast >=2.10
- just
- poetry
- python >=3.8,<3.10
Expand Down
192 changes: 80 additions & 112 deletions src/plassembler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ def begin_plassembler(outdir, force):
# remove outdir on force
if force is True:
if os.path.isdir(outdir) is True:
# shutil.rmtree(outdir)
print("l")
shutil.rmtree(outdir)
else:
logger.info(
f"--force was specified even though the directory {outdir} does not already exist. Continuing "
Expand Down Expand Up @@ -1090,27 +1089,27 @@ def long(
if pacbio_model != "nothing":
pacbio_model = validate_pacbio_model(pacbio_model)

# if skip_qc is False:
# # filtering long readfastq
# logger.info("Filtering long reads with chopper")
# chopper( # due to the stdin side of this, just implement the class maually in py
# longreads, outdir, min_length, min_quality, long_zipped, threads, logdir
# )

# else: # copy the input to the outdir
# shutil.copy2(
# longreads,
# Path(f"{outdir}/chopper_long_reads.fastq.gz"),
# )

# # Raven for long only or '--use_raven'
# if use_raven is True:
# logger.info(f"--use_raven is {use_raven}. Using Raven for long read assembly.")
# logger.info("Running Raven.")
# run_raven(outdir, threads, logdir)
# else:
# logger.info("Running Flye.")
# run_flye(outdir, threads, raw_flag, pacbio_model, logdir)
if skip_qc is False:
# filter the long-read FASTQ
logger.info("Filtering long reads with chopper")
chopper( # due to the stdin side of this, just implement the class manually in py
longreads, outdir, min_length, min_quality, long_zipped, threads, logdir
)

else: # copy the input to the outdir
shutil.copy2(
longreads,
Path(f"{outdir}/chopper_long_reads.fastq.gz"),
)

# Raven for long only or '--use_raven'
if use_raven is True:
logger.info(f"--use_raven is {use_raven}. Using Raven for long read assembly.")
logger.info("Running Raven.")
run_raven(outdir, threads, logdir)
else:
logger.info("Running Flye.")
run_flye(outdir, threads, raw_flag, pacbio_model, logdir)

# instantiate the class with some of the commands
plass = Plass()
Expand Down Expand Up @@ -1148,105 +1147,74 @@ def long(
logger.error(message)

else:
####################################################################
# Only 1 contig
####################################################################

if plass.contig_count == 1:
# chromosome identified but no plasmids - just finish
# end plassembler
move_and_copy_files(
outdir,
prefix,
False, # unicycler success
False, # keep fastqs
True, # assembled mode
False, # long only
use_raven,
)
remove_intermediate_files(
outdir,
keep_chromosome,
False, # assembled mode
True, # long only
use_raven,
)
logger.error("Chromosome identified but no plasmids.")

####################################################################
# Multiple Contigs
####################################################################

elif plass.contig_count > 1:
# no_plasmids_flag = False as obviously "plasmids"
plass.no_plasmids_flag = False

logger.info("Mapping long reads.")
input_long_reads: Path = Path(outdir) / "chopper_long_reads.fastq.gz"
fasta: Path = Path(outdir) / "flye_renamed.fasta"
samfile: Path = Path(outdir) / "long_read.sam"
minimap_long_reads(
input_long_reads, fasta, samfile, threads, pacbio_model, logdir
)

# for long, custom function is quick enough
logger.info("Processing Sam/Bam Files and extracting Fastqs.")
samfile: Path = Path(outdir) / "long_read.sam"
plasmidfastqs: Path = Path(outdir) / "plasmid_long.fastq"
extract_long_fastqs_fast(samfile, plasmidfastqs, threads)
# no_plasmids_flag = False as obviously "plasmids"
plass.no_plasmids_flag = False

# canu
logger.info("Running canu.")
if pacbio_model != "":
canu_nano_or_pacbio = "pacbio"
else:
canu_nano_or_pacbio = "nanopore"
canu_output_dir: Path = Path(outdir) / "canu"
run_canu(
threads, logdir, plasmidfastqs, canu_output_dir, canu_nano_or_pacbio
)
make_blastdb(canu_output_dir, logdir)
run_blast(canu_output_dir, threads, logdir)
process_blast_output(canu_output_dir, outdir)
logger.info("Mapping long reads.")
input_long_reads: Path = Path(outdir) / "chopper_long_reads.fastq.gz"
fasta: Path = Path(outdir) / "flye_renamed.fasta"
samfile: Path = Path(outdir) / "long_read.sam"
minimap_long_reads(
input_long_reads, fasta, samfile, threads, pacbio_model, logdir
)

plass.get_depth_long(logdir, pacbio_model, threads)
# for long, custom function is quick enough
logger.info("Processing Sam/Bam Files and extracting Fastqs.")
samfile: Path = Path(outdir) / "long_read.sam"
plasmidfastqs: Path = Path(outdir) / "plasmid_long.fastq"
extract_long_fastqs_fast(samfile, plasmidfastqs, threads)

# run mash
logger.info("Calculating mash distances to PLSDB.")
# canu
logger.info("Running canu.")
if pacbio_model != "":
canu_nano_or_pacbio = "pacbio"
else:
canu_nano_or_pacbio = "nanopore"
canu_output_dir: Path = Path(outdir) / "canu"
run_canu(
threads, logdir, plasmidfastqs, canu_output_dir, canu_nano_or_pacbio
)
make_blastdb(canu_output_dir, logdir)
run_blast(canu_output_dir, threads, logdir)
process_blast_output(canu_output_dir, outdir)

# mash sketches the plasmids
mash_sketch(outdir, os.path.join(outdir, "plasmids_initial.fasta"), logdir)
# depth
plass.get_depth_long(logdir, pacbio_model, threads)

# runs mash
run_mash(outdir, database, logdir)
# run mash
logger.info("Calculating mash distances to PLSDB.")
# mash sketches the plasmids
mash_sketch(outdir, os.path.join(outdir, "plasmids_canu.fasta"), logdir)
# runs mash
run_mash(outdir, database, logdir)

# processes output
plass.process_mash_tsv(database)
# processes output
plass.process_mash_tsv(database)

# combine depth and mash tsvs
plass.combine_depth_mash_tsvs(prefix)
# combine depth and mash tsvs
plass.combine_depth_mash_tsvs(prefix)

# rename contigs and update copy number with plsdb
plass.finalise_contigs_long(prefix)
# rename contigs and update copy number with plsdb
plass.finalise_contigs_long(prefix)

# cleanup files
move_and_copy_files(
outdir,
prefix,
False, # unicycler success
False, # keep fastqs
False, # assembled mode
True, # long only
use_raven,
)
# cleanup files
move_and_copy_files(
outdir,
prefix,
False, # unicycler success
False, # keep fastqs
False, # assembled mode
True, # long only
use_raven,
)

# remove_intermediate_files(
# outdir,
# keep_chromosome,
# False, # assembled mode
# True, # long only
# use_raven,
# )
remove_intermediate_files(
outdir,
keep_chromosome,
False, # assembled mode
True, # long only
use_raven,
)

# end plassembler
end_plassembler(start_time)
Expand Down
7 changes: 4 additions & 3 deletions src/plassembler/utils/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ def remove_intermediate_files(
remove_file(os.path.join(out_dir, "chopper_long_reads.fastq.gz"))
remove_file(os.path.join(out_dir, "multimap_plasmid_chromosome_long.fastq"))

# multimer
remove_file(os.path.join(out_dir, "mapping.paf"))

# long
remove_file(os.path.join(out_dir, "plasmids_canu.fasta"))
remove_file(os.path.join(out_dir, "plasmid_long.fastq"))

# chromosome
if keep_chromosome is False:
remove_file(os.path.join(out_dir, "chromosome.fasta"))
Expand Down
89 changes: 57 additions & 32 deletions src/plassembler/utils/input_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,7 @@ def check_dependencies():
logger.error("Flye not found. Please reinstall Plassembler.")

message = (
"Flye version found is v"
+ str(flye_major_version)
+ "."
+ str(flye_minor_version)
+ "."
+ flye_minorest_version
+ "."
f"Flye version found is v{flye_major_version}.{flye_minor_version}.{flye_minorest_version}."
)
logger.info(message)

Expand Down Expand Up @@ -164,25 +158,24 @@ def check_dependencies():

# unicycler

process = sp.Popen(["unicycler", "--version"], stdout=sp.PIPE, stderr=sp.STDOUT)
unicycler_out, _ = process.communicate()
unicycler_out = unicycler_out.decode()
logger.info(unicycler_out)
unicycler_version = unicycler_out.split(" ")[1]
# get rid of the "v"
logger.info(unicycler_version)

unicycler_version = unicycler_version[1:]
logger.info(unicycler_version)

unicycler_major_version = unicycler_version.split(".")[0]
unicycler_minor_version = int(unicycler_version.split(".")[1])
unicycler_minorest_version = int(unicycler_version.split(".")[2])
# except Exception:
# message = "Unicycler not found. Please reinstall Plassembler, see instructions at https://github.com/gbouras13/plassembler."
# logger.error(message)

message = f"Unicycler version found is v{unicycler_major_version}.{unicycler_minor_version}.{unicycler_minorest_version}."
try:
process = sp.Popen(["unicycler", "--version"], stdout=sp.PIPE, stderr=sp.STDOUT)
unicycler_out, _ = process.communicate()
unicycler_out = unicycler_out.decode()
unicycler_version = unicycler_out.split(" ")[1]
# get rid of the "v"
unicycler_version = unicycler_version[1:]

unicycler_major_version = int(unicycler_version.split(".")[0])
unicycler_minor_version = int(unicycler_version.split(".")[1])
unicycler_minorest_version = int(unicycler_version.split(".")[2])
except Exception:
message = "Unicycler not found. Please re-install Unicycler, see instructions at https://github.com/gbouras13/plassembler."
logger.error(message)

message = (
f"Unicycler version found is v{unicycler_major_version}.{unicycler_minor_version}.{unicycler_minorest_version}."
)
logger.info(message)

if unicycler_minor_version < 4:
Expand All @@ -205,7 +198,7 @@ def check_dependencies():
spades_out = spades_out.decode()
spades_version = spades_out.split(" ")[3]
spades_version = spades_version.split("\n")[0]
message = "SPAdes " + str(spades_version) + " found."
message = f"SPAdes {spades_version} found."
logger.info(message)
except Exception:
logger.error("SPAdes not found.")
Expand All @@ -218,7 +211,7 @@ def check_dependencies():
samtools_version = samtools_out.split("\n")[0].split(" ")[
1
] # get first line, and then second component of line
message = "Samtools v" + str(samtools_version) + " found."
message = f"Samtools v{samtools_version} found."
logger.info(message)
except Exception:
logger.error("Samtools not found.")
Expand All @@ -229,7 +222,7 @@ def check_dependencies():
minimap2_out, _ = process.communicate()
minimap2_version = minimap2_out.decode()
minimap2_version = minimap2_version.split("\n")[0]
message = "minimap2 v" + str(minimap2_version) + " found."
message = f"minimap2 v{minimap2_version} found."
logger.info(message)
except Exception:
logger.error("minimap2 not found.")
Expand All @@ -240,7 +233,7 @@ def check_dependencies():
_, fastp_out = process.communicate()
fastp_version = fastp_out.decode()
fastp_version = fastp_version.split("\n")[0].split(" ")[1]
message = "fastp v" + str(fastp_version) + " found."
message = f"fastp v{fastp_version} found."
logger.info(message)
except Exception:
logger.error("fastp not found.")
Expand All @@ -251,7 +244,7 @@ def check_dependencies():
chopper_out, _ = process.communicate()
chopper_version = chopper_out.decode()
chopper_version = chopper_version.split("\n")[0].split(" ")[1]
message = "chopper v" + str(chopper_version) + " found."
message = f"chopper v{chopper_version} found."
logger.info(message)
except Exception:
logger.error("chopper not found.")
Expand All @@ -266,11 +259,43 @@ def check_dependencies():
if "version" in line:
version_line.append(line)
mash_version = version_line[0].split(" ")[2]
message = "mash v" + str(mash_version) + " found."
message = f"mash v{mash_version} found."
logger.info(message)
except Exception:
logger.error("mash not found")


# canu
try:
process = sp.Popen(["canu", "--version"], stdout=sp.PIPE, stderr=sp.PIPE)
canu_out, _ = process.communicate()
canu_out = canu_out.decode()
canu_out = canu_out.split("\n")[0].split(" ")[1]
message = f"canu v{canu_out} found."
logger.info(message)
except Exception:
logger.error("canu not found")

# blast

try:
process = sp.Popen(["blastn", "-version"], stdout=sp.PIPE, stderr=sp.STDOUT)
blast_out, _ = process.communicate()
blast_out = blast_out.decode().strip()
blast_out = blast_out.split("\n")[0]
blast_version = blast_out.split(" ")[1]
blast_version = blast_version.strip("+")
blast_major_version = int(blast_version.split(".")[0])
blast_minor_version = int(blast_version.split(".")[1])
blast_minorest_version = int(blast_version.split(".")[2])
message = (
f"BLAST version found is v{blast_major_version}.{blast_minor_version}.{blast_minorest_version}."
)
logger.info(message)
except Exception:
message = "BLAST not found."
logger.error(message)

# all dependencies found
logger.info("All dependencies found.")

Expand Down
Loading

0 comments on commit 34540bc

Please sign in to comment.