Skip to content

Commit

Permalink
sr and lr copy number calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
gbouras13 committed Mar 1, 2024
1 parent 63aa682 commit 3900b35
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 42 deletions.
86 changes: 56 additions & 30 deletions src/plassembler/utils/plass_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,8 +707,12 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):

# get chroms and plasmids

combined_chrom_df = combined_depth_mash_df[combined_depth_mash_df["contig"].str.contains("chromosome")]
combined_plasmid_df = combined_depth_mash_df[~combined_depth_mash_df["contig"].str.contains("chromosome")]
combined_chrom_df = combined_depth_mash_df[
combined_depth_mash_df["contig"].str.contains("chromosome")
]
combined_plasmid_df = combined_depth_mash_df[
~combined_depth_mash_df["contig"].str.contains("chromosome")
]

# get all plasmid contig ids and then filter
all_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
Expand Down Expand Up @@ -742,7 +746,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
if contig_id not in kept_plasmid_contig_ids
]


# logging
# needs to be at least 1 filtered out id if the filtering did anything
logger.info(f"Filtering contigs below depth filter: {depth_filter}.")
if "mean_depth_short" in combined_plasmid_df.columns:
Expand All @@ -761,22 +765,16 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
logger.info(
f"{len(filtered_out_contig_ids)} plasmids were filtered as they were below the depth filter."
)

# Updating 'contig_id' names starting from 1
num_rows = len(combined_plasmid_df)
new_column_names = list(range(1, num_rows + 1))
combined_plasmid_df['contig'] = new_column_names
# Reset index after renaming
combined_plasmid_df.reset_index(drop=True, inplace=True)
else:
logger.info(f"No plasmids were filtered due to low depth.")


# concat back
# there is 1+ plasmid
if len(kept_plasmid_contig_ids) > 0 :
combined_depth_mash_df = pd.concat([combined_chrom_df, combined_plasmid_df], axis=0)
else: # only chroms
# concat dfs back
# there is 1+ plasmid
if len(kept_plasmid_contig_ids) > 0:
combined_depth_mash_df = pd.concat(
[combined_chrom_df, combined_plasmid_df], axis=0
)
else: # only chroms
combined_depth_mash_df = combined_chrom_df

combined_depth_mash_df.to_csv(
Expand All @@ -787,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):

def finalise_contigs(self, prefix):
"""
Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
Filters contigs plassembler run
"""
outdir = self.outdir

Expand All @@ -799,23 +797,42 @@ def finalise_contigs(self, prefix):
].reset_index(drop=True)
# get contigs only
plasmid_fasta = os.path.join(outdir, "unicycler_output", "assembly.fasta")
i = 0
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
split_desc = dna_record.description.split(" ")
contig_id = str(split_desc[0])
# only keep the contigs that passed the depth threshold
if str(split_desc[0]) not in filtered_out_contig_ids:
if contig_id not in filtered_out_contig_ids:
sr_copy_number_list = combined_depth_mash_df.loc[
combined_depth_mash_df["contig"] == contig_id,
"plasmid_copy_number_short",
].values
lr_copy_number_list = combined_depth_mash_df.loc[
combined_depth_mash_df["contig"] == contig_id,
"plasmid_copy_number_long",
].values
if len(sr_copy_number_list) > 0:
sr_copy_number = sr_copy_number_list[0]
else:
logger.error("Plassembler failed")

if len(lr_copy_number_list) > 0:
lr_copy_number = lr_copy_number_list[0]
else:
logger.error("Plassembler failed")

# will be ordered so new_contig_count will be the index of the df
# but 1 index for the output
if "circular" in dna_record.description: # circular contigs
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x circular=true"
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x circular=true"
else: # non circular contigs
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x"
i += 1
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x"
record = SeqRecord(dna_record.seq, id=id_updated, description="")
SeqIO.write(record, dna_fa, "fasta")

def finalise_contigs_long(self, prefix):
"""
Renames the contigs of assembly with new ones
Filters contigs plassembler long
"""
outdir = self.outdir

Expand All @@ -827,21 +844,30 @@ def finalise_contigs_long(self, prefix):
].reset_index(drop=True)
# get contigs only
plasmid_fasta = os.path.join(outdir, "plasmids.fasta")
i = 0
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
# only keep the contigs that passed the depth threshold
if str(dna_record.id) not in filtered_out_contig_ids:
contig_id = str(dna_record.id)
if contig_id not in filtered_out_contig_ids:
length = len(dna_record.seq)
copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
lr_copy_number_list = combined_depth_mash_df.loc[
combined_depth_mash_df["contig"] == contig_id,
"plasmid_copy_number_long",
].values
if len(lr_copy_number_list) > 0:
lr_copy_number = lr_copy_number_list[0]
else:
logger.error("Plassembler failed")

if (
"circular" in dna_record.description
): # circular contigs from canu
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
desc = f"len={length} plasmid_copy_number_long={lr_copy_number}x circular=True"
else:
desc = f"len={length} plasmid_copy_number_long={copy_number}x"
i += 1
record = SeqRecord(dna_record.seq, id=str(i), description=desc)
desc = (
f"len={length} plasmid_copy_number_long={lr_copy_number}x"
)
record = SeqRecord(dna_record.seq, id=contig_id, description=desc)
SeqIO.write(record, dna_fa, "fasta")


Expand Down
Binary file modified tests/test_data/end_to_end/input_R1.fastq.gz
Binary file not shown.
Binary file modified tests/test_data/end_to_end/input_R2.fastq.gz
Binary file not shown.
Binary file added tests/test_data/end_to_end/input_half.fastq.gz
Binary file not shown.
42 changes: 30 additions & 12 deletions tests/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,6 @@ def test_plassembler_case_3(self):
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_case_3(self):
"""test plassembler run - chromosome and plasmids assembled with Flye"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
s2: Path = f"{end_to_end}/input_R2.fastq.gz"
chromosome = 50000
outdir: Path = f"{end_to_end}/test_out"
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f"
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_case_4(self):
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
Expand All @@ -159,7 +148,7 @@ def test_plassembler_multiple_chromosomes_no_plasmids(self):

# copy number

def test_plassembler_case_depth_filter(self):
def test_plassembler_case_depth_filter_all(self):
"""test plassembler run depth_filter 1.2 - will have no plasmids left"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
Expand All @@ -170,6 +159,17 @@ def test_plassembler_case_depth_filter(self):
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_case_depth_filter_some(self):
    """Run `plassembler run` with --depth_filter 0.6 using input_half.fastq.gz as long reads.

    Per the original test description, only 1 plasmid is expected to remain
    after filtering. The test itself makes no assertion on the output; it
    runs the command and then removes the output directory.
    """
    # inputs and output location all live under the end_to_end test data dir
    out_path: Path = f"{end_to_end}/test_out"
    long_fq: Path = f"{end_to_end}/input_half.fastq.gz"
    r1_fq: Path = f"{end_to_end}/input_R1.fastq.gz"
    r2_fq: Path = f"{end_to_end}/input_R2.fastq.gz"
    chrom_len = 50000
    exec_command(
        f"plassembler run -l {long_fq} -c {chrom_len} -1 {r1_fq} -2 {r2_fq} "
        f"-d {plassembler_db_dir} -o {out_path} -t 8 -f --depth_filter 0.6"
    )
    # clean up so later tests can reuse the same output directory
    remove_directory(out_path)

def test_plassembler_case_extra_unicycler_spades_opts(self):
"""test plassembler with extra unicycler and spades opts"""
longreads: Path = f"{end_to_end}/input_fastq.gz"
Expand Down Expand Up @@ -263,6 +263,24 @@ def test_plassembler_long(self):
exec_command(cmd)
remove_directory(outdir)

def test_plassembler_depth_all(self):
    """Run `plassembler long` with --depth_filter 10, with all plasmids filtered.

    The test makes no assertion on the output; it runs the command and then
    removes the output directory.
    """
    out_path: Path = f"{end_to_end}/test_out"
    long_fq: Path = f"{end_to_end}/input_fastq.gz"
    chrom_len = 50000
    exec_command(
        f"plassembler long -l {long_fq} -c {chrom_len} -d {plassembler_db_dir} "
        f"-o {out_path} --depth_filter 10 -t 8 -f"
    )
    # clean up so later tests can reuse the same output directory
    remove_directory(out_path)

def test_plassembler_depth_some(self):
    """Run `plassembler long` with --depth_filter 2, with some plasmids filtered.

    The test makes no assertion on the output; it runs the command and then
    removes the output directory.
    """
    out_path: Path = f"{end_to_end}/test_out"
    # NOTE(review): confirm input_depth_filter.fastq.gz exists in the
    # end_to_end test data directory
    long_fq: Path = f"{end_to_end}/input_depth_filter.fastq.gz"
    chrom_len = 50000
    exec_command(
        f"plassembler long -l {long_fq} -c {chrom_len} -d {plassembler_db_dir} "
        f"-o {out_path} --depth_filter 2 -t 8 -f"
    )
    # clean up so later tests can reuse the same output directory
    remove_directory(out_path)

# def test_plassembler_long_canu(self):
# """test plassembler long canu"""
# longreads: Path = f"{end_to_end}/input_fastq.gz"
Expand Down

0 comments on commit 3900b35

Please sign in to comment.