Commit d3696cd

add some canu long only format

gbouras13 committed Sep 5, 2023
1 parent d57aa00 commit d3696cd

Showing 3 changed files with 79 additions and 86 deletions.
18 changes: 12 additions & 6 deletions src/plassembler/__init__.py
@@ -25,6 +25,12 @@
 # import classes
 from plassembler.utils.plass_class import Assembly, Plass
 from plassembler.utils.qc import chopper, copy_sr_fastq_file, fastp
+from plassembler.utils.run_canu import (
+    make_blastdb,
+    process_blast_output,
+    run_blast,
+    run_canu,
+)
 from plassembler.utils.run_mash import mash_sketch, run_mash
 from plassembler.utils.run_unicycler import run_unicycler
 from plassembler.utils.sam_to_fastq import (
@@ -33,8 +39,6 @@
 )
 from plassembler.utils.test_incompatibility import incompatbility
 from plassembler.utils.util import get_version, print_citation
-from plassembler.utils.run_canu import run_canu, make_blastdb, run_blast, process_blast_output
-

 log_fmt = (
     "[<green>{time:YYYY-MM-DD HH:mm:ss}</green>] <level>{level: <8}</level> | "
@@ -56,8 +60,8 @@ def begin_plassembler(outdir, force):
     # remove outdir on force
     if force is True:
         if os.path.isdir(outdir) is True:
-            #shutil.rmtree(outdir)
-            print('l')
+            # shutil.rmtree(outdir)
+            print("l")
         else:
             logger.info(
                 f"--force was specified even though the directory {outdir} does not already exist. Continuing "
@@ -922,6 +926,7 @@ def download(ctx, database, force, **kwargs):
    long only
    """

+
 def long_options(func):
     """Run command line args
     Define common command line args here, and include them with the @common_options decorator below.
@@ -1197,12 +1202,13 @@ def long(
         else:
             canu_nano_or_pacbio = "nanopore"
         canu_output_dir: Path = Path(outdir) / "canu"
-        run_canu(threads, logdir, plasmidfastqs, canu_output_dir, canu_nano_or_pacbio)
+        run_canu(
+            threads, logdir, plasmidfastqs, canu_output_dir, canu_nano_or_pacbio
+        )
         make_blastdb(canu_output_dir, logdir)
         run_blast(canu_output_dir, threads, logdir)
         process_blast_output(canu_output_dir, outdir)
-

         plass.get_depth_long(logdir, pacbio_model, threads)

         # run mash
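Taken together, these calls are the new canu-based long-only path: assemble the candidate plasmid reads with canu, BLAST the canu contigs against themselves, and trim duplicated ends into outdir/plasmids.fasta. A minimal sketch of the same orchestration outside the CLI plumbing (the outdir, logdir, and FASTQ paths are hypothetical stand-ins, not values from this diff):

    from pathlib import Path

    from plassembler.utils.run_canu import (
        make_blastdb,
        process_blast_output,
        run_blast,
        run_canu,
    )

    outdir = Path("plassembler_out")  # hypothetical output directory
    logdir = outdir / "logs"  # hypothetical log directory
    plasmidfastqs = outdir / "plasmid_long.fastq"  # hypothetical plasmid-read FASTQ

    canu_output_dir = outdir / "canu"
    # "nanopore" or "pacbio", mirroring canu's -nanopore/-pacbio flags
    run_canu(8, logdir, plasmidfastqs, canu_output_dir, "nanopore")
    make_blastdb(canu_output_dir, logdir)  # makeblastdb over canu.contigs.fasta
    run_blast(canu_output_dir, 8, logdir)  # blastn of the contigs against themselves
    process_blast_output(canu_output_dir, outdir)  # writes outdir/plasmids.fasta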
4 changes: 2 additions & 2 deletions src/plassembler/utils/plass_class.py
@@ -423,7 +423,7 @@ def get_depth_long(self, logdir, pacbio_model, threads):
         outdir = self.outdir

         input_long_reads: Path = Path(outdir) / "chopper_long_reads.fastq.gz"
-        fasta: Path = Path(outdir) / "flye_renamed.fasta"
+        fasta: Path = Path(outdir) / "plasmids.fasta"
         sam_file: Path = Path(outdir) / "combined_long.sam"
         sorted_bam: Path = Path(outdir) / "combined_sorted_long.bam"

@@ -436,7 +436,7 @@

         # get contig lengths

-        fasta: Path = Path(outdir) / "flye_renamed.fasta"
+        fasta: Path = Path(outdir) / "plasmids.fasta"
         contig_lengths = get_contig_lengths(fasta)

         # depths
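Both renames point get_depth_long at the deduplicated plasmids.fasta that process_blast_output now writes, instead of the Flye-era flye_renamed.fasta; the method body is otherwise untouched. For orientation, the map-sort-depth idea behind the sam_file/sorted_bam variables looks roughly like this (a sketch assuming minimap2 and samtools, which the surrounding variable names suggest, not plassembler's actual code):

    import subprocess
    from pathlib import Path

    outdir = Path("plassembler_out")  # hypothetical
    fasta = outdir / "plasmids.fasta"
    reads = outdir / "chopper_long_reads.fastq.gz"
    sam_file = outdir / "combined_long.sam"
    sorted_bam = outdir / "combined_sorted_long.bam"

    # map the QC'd long reads back to the recovered plasmids
    # ("map-ont" for Nanopore; a PacBio preset would come from pacbio_model)
    subprocess.run(
        ["minimap2", "-ax", "map-ont", str(fasta), str(reads), "-o", str(sam_file)],
        check=True,
    )
    # sort to BAM, then compute per-base depth for each contig
    subprocess.run(["samtools", "sort", str(sam_file), "-o", str(sorted_bam)], check=True)
    depth_table = subprocess.run(
        ["samtools", "depth", str(sorted_bam)], capture_output=True, text=True, check=True
    ).stdout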
143 changes: 65 additions & 78 deletions src/plassembler/utils/run_canu.py
@@ -1,48 +1,50 @@
-from plassembler.utils.external_tools import ExternalTool
 from pathlib import Path

 import pandas as pd
-from loguru import logger
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+from loguru import logger
+
+from plassembler.utils.external_tools import ExternalTool


-def run_canu(
-    threads, logdir, longreads, canu_output_dir, canu_nano_or_pacbio
-):
+def run_canu(threads, logdir, longreads, canu_output_dir, canu_nano_or_pacbio):
     """runs canu
     :param long: long read fastq
     :param canu_output_dir: canu Output Directory
     :param threads: threads
     :param logdir: logdir
     :return:
     """
     # canu -p C308_canu -d C308_canu genomeSize=0.01m maxInputCoverage=200 maxThreads=8 -nanopore plasmid_long.fastq
     canu = ExternalTool(
         tool="canu",
         input="",
         output="",
         params=f" -p canu -d {canu_output_dir} genomeSize=0.01m maxInputCoverage=250 maxThreads={threads} -{canu_nano_or_pacbio} {longreads}",
         logdir=logdir,
-        outfile=""
+        outfile="",
     )

     ExternalTool.run_tool(canu, to_stdout=False)


-def make_blastdb(
-    canu_output_dir, logdir):
-
-    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
-    db: Path = Path(canu_output_dir) / "db"
-
-    makeblastdb = ExternalTool(
+def make_blastdb(canu_output_dir, logdir):
+    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
+    db: Path = Path(canu_output_dir) / "db"
+
+    makeblastdb = ExternalTool(
         tool="makeblastdb",
         input=f"-in {canu_fasta}",
         output=f"-out {db}",
         params="-dbtype nucl ",
         logdir=logdir,
-        outfile="")
+        outfile="",
+    )

     ExternalTool.run_tool(makeblastdb, to_stdout=False)


 """
 some of this adapted from dnaapler
@@ -51,41 +53,38 @@ def make_blastdb(
 """

-def run_blast(
-    canu_output_dir, threads, logdir):
-
-    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
-    blast_output: Path = Path(canu_output_dir) / "blast_output.txt"
-    db: Path = Path(canu_output_dir) / "db"
-
-    blast = ExternalTool(
+def run_blast(canu_output_dir, threads, logdir):
+    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
+    blast_output: Path = Path(canu_output_dir) / "blast_output.txt"
+    db: Path = Path(canu_output_dir) / "db"
+
+    blast = ExternalTool(
         tool="blastn",
         input=f"-query {canu_fasta}",
         output=f"-out {blast_output}",
         params=f'-db {db} -evalue 1e-05 -num_threads {threads} -outfmt " 6 qseqid qlen sseqid slen length qstart qend sstart send pident nident gaps mismatch evalue bitscore qseq sseq "',
         logdir=logdir,
-        outfile=""
+        outfile="",
     )
     ExternalTool.run_tool(blast, to_stdout=False)


-def parse_blast_output(
-    canu_output_dir, threads, logdir):
-
-    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
-    blast_output_file: Path = Path(canu_output_dir) / "blast_output.txt"
-    db: Path = Path(canu_output_dir) / "db"
+def parse_blast_output(canu_output_dir, threads, logdir):
+    canu_fasta: Path = Path(canu_output_dir) / "canu.contigs.fasta"
+    blast_output_file: Path = Path(canu_output_dir) / "blast_output.txt"
+    db: Path = Path(canu_output_dir) / "db"

-    blast = ExternalTool(
+    blast = ExternalTool(
         tool="blastn",
         input=f"-query {input}",
         output=f"-out {blast_output_file}",
         params=f'-db {db} -evalue 1e-05 -num_threads {threads} -outfmt " 6 qseqid qlen sseqid slen length qstart qend sstart send pident nident gaps mismatch evalue bitscore qseq sseq "',
         logdir=logdir,
-        outfile=""
+        outfile="",
     )


 def process_blast_output(canu_output_dir, outdir):
     """Processes
@@ -129,14 +128,12 @@ def process_blast_output(canu_output_dir, outdir):

     # if the BLAST input is empty
     if isinstance(blast_df, pd.DataFrame) and blast_df.empty:
-        logger.error(
-            "There were 0 BLAST hits. This must be a BLAST error."
-        )
+        logger.error("There were 0 BLAST hits. This must be a BLAST error.")

     # keep only the columns where the same contig is blasted against each other

     # Filter rows where qseqid is equal to sseqid - the ones from BLASTing against themselves
-    blast_df = blast_df[blast_df['qseqid'] == blast_df['sseqid']]
+    blast_df = blast_df[blast_df["qseqid"] == blast_df["sseqid"]]

     # read in the canu FASTA and save as a dictionary

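The blast_df filtered here is read from blast_output.txt, whose columns follow the custom -outfmt 6 string assembled in run_blast, in that exact order. The reading step itself sits outside this hunk; a sketch of how such a file is typically loaded (the column list mirrors the outfmt string, but the read_csv call is an assumption, not code from this diff):

    import pandas as pd

    # same order as the '-outfmt " 6 ... "' string in run_blast
    col_list = [
        "qseqid", "qlen", "sseqid", "slen", "length", "qstart", "qend",
        "sstart", "send", "pident", "nident", "gaps", "mismatch",
        "evalue", "bitscore", "qseq", "sseq",
    ]
    blast_df = pd.read_csv("blast_output.txt", sep="\t", header=None, names=col_list)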
@@ -146,90 +143,80 @@
         fasta_dict[record.id] = {
             "count": i,
             "sequence": str(record.seq),
-            "dupe": False, # stores the dupe status
-            "start": 1, # stores the start
-            "end": len(record.seq) # stores the end
+            "dupe": False,  # stores the dupe status
+            "start": 1,  # stores the start
+            "end": len(record.seq),  # stores the end
         }
-        i +=1
+        i += 1

     for contig in fasta_dict.keys():
-
-        tmp_df = blast_df[blast_df['qseqid'] == contig]
+        tmp_df = blast_df[blast_df["qseqid"] == contig]
         # Sort by 'length' column in descending order
-        tmp_df_sorted = tmp_df.sort_values(by='length', ascending=False)
+        tmp_df_sorted = tmp_df.sort_values(by="length", ascending=False)
         # get rid of the 100% match row
-        tmp_df_sorted = tmp_df_sorted[tmp_df_sorted['qlen'] != tmp_df_sorted['length']]
+        tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["qlen"] != tmp_df_sorted["length"]]
         # more than 99% identical
-        tmp_df_sorted = tmp_df_sorted[tmp_df_sorted['pident'] > 99]
+        tmp_df_sorted = tmp_df_sorted[tmp_df_sorted["pident"] > 99]
         num_rows = tmp_df_sorted.shape[0]
-        if num_rows == 0: # where there is no dupe at all
+        if num_rows == 0:  # where there is no dupe at all
             fasta_dict[contig]["dupe"] = 1
             # exit
         else:
             # Get the first row
             first_row = tmp_df_sorted.iloc[0]
             # ensure the match is good
-            if first_row["length"] < 500: # less than 500bp repeat in the top hit - probably IS not a real plasmid dupe
-                #exit
-                print('exit')
+            if (
+                first_row["length"] < 500
+            ):  # less than 500bp repeat in the top hit - probably IS not a real plasmid dupe
+                # exit
+                print("exit")
             else:
                 # the repeat will be in the longest hit with qstart = 1
-                best_row = tmp_df_sorted[tmp_df_sorted['qstart'] == 1].iloc[0]
+                best_row = tmp_df_sorted[tmp_df_sorted["qstart"] == 1].iloc[0]
                 # if the query end is larger than the sstart - there is an overlap
                 # take 1 as the start and then the sstart as the end
                 # otherwise check for concatenation (within 50))
                 # otherwise exit just the whole plasmid
-                if best_row['qend'] > best_row['sstart']:
+                if best_row["qend"] > best_row["sstart"]:
                     fasta_dict[contig]["dupe"] = True
                     fasta_dict[contig]["start"] = 1
-                    fasta_dict[contig]["end"] = best_row['sstart']
-                elif (best_row['qend']+ 100) > best_row['sstart']: # likely to be pure duplication if within 100bp
+                    fasta_dict[contig]["end"] = best_row["sstart"]
+                elif (best_row["qend"] + 100) > best_row[
+                    "sstart"
+                ]:  # likely to be pure duplication if within 100bp
                     fasta_dict[contig]["dupe"] = True
                     fasta_dict[contig]["start"] = 1
-                    fasta_dict[contig]["end"] = best_row['qend']
+                    fasta_dict[contig]["end"] = best_row["qend"]
                 else:
-                    print('no duplication detected')
-
-
+                    print("no duplication detected")

     print(fasta_dict)

     # Create a list of SeqRecord objects
     records = []
     for entry_id, entry_data in fasta_dict.items():
-
-        subsequence = entry_data["sequence"][entry_data["start"] - 1:entry_data["end"]]
+        subsequence = entry_data["sequence"][
+            entry_data["start"] - 1 : entry_data["end"]
+        ]
         count = str(entry_data["count"])
         l = entry_data["end"]

         record = SeqRecord(
-            seq=Seq(subsequence),
-            id=count,
-            description=f"{count} len={l}"
+            seq=Seq(subsequence), id=count, description=f"{count} len={l}"
         )
         records.append(record)

     # Write the records to a FASTA file
     output_filename: Path = Path(outdir) / "plasmids.fasta"
     with open(output_filename, "w") as output_handle:
         SeqIO.write(records, output_handle, "fasta")
-
-
-
-
-
-
-
-
-

 # then BLAST output
 # # then figure out for overlaps
 # parse all blast hits as a dictionary
 # need more than 1 hit (itself)
 # if the blast hit is more than 50% and less than 90 % of contig length, take as duplicate
 # elif the blast hit is more than 2000bp (lower could be e.g. IS element) and far away (more than 50% length away)
 # then assume partial duplication too
 # get all dupe regions this way
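The heuristic in process_blast_output is easiest to see on toy coordinates. canu often assembles a circular plasmid as roughly two concatenated copies, so the longest non-self BLAST hit with qstart == 1 pairs the first copy with the second; trimming the contig at that boundary recovers a single copy. A standalone rerun of the same arithmetic (toy numbers, not plassembler code):

    # toy self-hit for a 12,000 bp canu contig that is a 6,000 bp plasmid
    # assembled twice over: the longest non-self hit with qstart == 1
    # pairs bases 1-6,000 with bases 6,001-12,000
    best_row = {"qlen": 12_000, "qstart": 1, "qend": 6_000, "sstart": 6_001}

    start, end = 1, best_row["qlen"]  # default: keep the whole contig
    if best_row["qend"] > best_row["sstart"]:
        # the two copies overlap: keep everything before the second copy starts
        end = best_row["sstart"]
    elif best_row["qend"] + 100 > best_row["sstart"]:
        # copies abut (within 100 bp): pure duplication, keep one copy
        end = best_row["qend"]

    print(start, end)  # 1 6000 -> the deduplicated 6,000 bp plasmid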
