Skip to content

Commit 3900b35

Browse files
committed
sr and lr copy number calculations
1 parent 63aa682 commit 3900b35

File tree

5 files changed

+86
-42
lines changed

5 files changed

+86
-42
lines changed

src/plassembler/utils/plass_class.py

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -707,8 +707,12 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
707707

708708
# get chroms and plasmids
709709

710-
combined_chrom_df = combined_depth_mash_df[combined_depth_mash_df["contig"].str.contains("chromosome")]
711-
combined_plasmid_df = combined_depth_mash_df[~combined_depth_mash_df["contig"].str.contains("chromosome")]
710+
combined_chrom_df = combined_depth_mash_df[
711+
combined_depth_mash_df["contig"].str.contains("chromosome")
712+
]
713+
combined_plasmid_df = combined_depth_mash_df[
714+
~combined_depth_mash_df["contig"].str.contains("chromosome")
715+
]
712716

713717
# get all plasmid contig ids and then filter
714718
all_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
@@ -742,7 +746,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
742746
if contig_id not in kept_plasmid_contig_ids
743747
]
744748

745-
749+
# logging
746750
# needs to be at least 1 filtered out id if the filtering did anything
747751
logger.info(f"Filtering contigs below depth filter: {depth_filter}.")
748752
if "mean_depth_short" in combined_plasmid_df.columns:
@@ -761,22 +765,16 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
761765
logger.info(
762766
f"{len(filtered_out_contig_ids)} plasmids were filtered as they were below the depth filter."
763767
)
764-
765-
# Updating 'contig_id' names starting from 1
766-
num_rows = len(combined_plasmid_df)
767-
new_column_names = list(range(1, num_rows + 1))
768-
combined_plasmid_df['contig'] = new_column_names
769-
# Reset index after renaming
770-
combined_plasmid_df.reset_index(drop=True, inplace=True)
771768
else:
772769
logger.info(f"No plasmids were filtered due to low depth.")
773770

774-
775-
# concat back
776-
# there is 1+ plasmid
777-
if len(kept_plasmid_contig_ids) > 0 :
778-
combined_depth_mash_df = pd.concat([combined_chrom_df, combined_plasmid_df], axis=0)
779-
else: # only chroms
771+
# concat dfs back
772+
# there is 1+ plasmid
773+
if len(kept_plasmid_contig_ids) > 0:
774+
combined_depth_mash_df = pd.concat(
775+
[combined_chrom_df, combined_plasmid_df], axis=0
776+
)
777+
else: # only chroms
780778
combined_depth_mash_df = combined_chrom_df
781779

782780
combined_depth_mash_df.to_csv(
@@ -787,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
787785

788786
def finalise_contigs(self, prefix):
789787
"""
790-
Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
788+
Filters the contigs from a plassembler run
791789
"""
792790
outdir = self.outdir
793791

@@ -799,23 +797,42 @@ def finalise_contigs(self, prefix):
799797
].reset_index(drop=True)
800798
# get contigs only
801799
plasmid_fasta = os.path.join(outdir, "unicycler_output", "assembly.fasta")
802-
i = 0
803800
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
804801
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
805802
split_desc = dna_record.description.split(" ")
803+
contig_id = str(split_desc[0])
806804
# only keep the contigs that passed the depth threshold
807-
if str(split_desc[0]) not in filtered_out_contig_ids:
805+
if contig_id not in filtered_out_contig_ids:
806+
sr_copy_number_list = combined_depth_mash_df.loc[
807+
combined_depth_mash_df["contig"] == contig_id,
808+
"plasmid_copy_number_short",
809+
].values
810+
lr_copy_number_list = combined_depth_mash_df.loc[
811+
combined_depth_mash_df["contig"] == contig_id,
812+
"plasmid_copy_number_long",
813+
].values
814+
if len(sr_copy_number_list) > 0:
815+
sr_copy_number = sr_copy_number_list[0]
816+
else:
817+
logger.error("Plassembler failed")
818+
819+
if len(lr_copy_number_list) > 0:
820+
lr_copy_number = lr_copy_number_list[0]
821+
else:
822+
logger.error("Plassembler failed")
823+
824+
# will be ordered so new_contig_count will be the index of the df
825+
# but 1 index for the output
808826
if "circular" in dna_record.description: # circular contigs
809-
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x circular=true"
827+
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x circular=true"
810828
else: # non circular contigs
811-
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x"
812-
i += 1
829+
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x"
813830
record = SeqRecord(dna_record.seq, id=id_updated, description="")
814831
SeqIO.write(record, dna_fa, "fasta")
815832

816833
def finalise_contigs_long(self, prefix):
817834
"""
818-
Renames the contigs of assembly with new ones
835+
Filters the contigs from a plassembler long run
819836
"""
820837
outdir = self.outdir
821838

@@ -827,21 +844,30 @@ def finalise_contigs_long(self, prefix):
827844
].reset_index(drop=True)
828845
# get contigs only
829846
plasmid_fasta = os.path.join(outdir, "plasmids.fasta")
830-
i = 0
831847
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
832848
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
833849
# only keep the contigs that passed the depth threshold
834-
if str(dna_record.id) not in filtered_out_contig_ids:
850+
contig_id = str(dna_record.id)
851+
if contig_id not in filtered_out_contig_ids:
835852
length = len(dna_record.seq)
836-
copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
853+
lr_copy_number_list = combined_depth_mash_df.loc[
854+
combined_depth_mash_df["contig"] == contig_id,
855+
"plasmid_copy_number_long",
856+
].values
857+
if len(lr_copy_number_list) > 0:
858+
lr_copy_number = lr_copy_number_list[0]
859+
else:
860+
logger.error("Plassembler failed")
861+
837862
if (
838863
"circular" in dna_record.description
839864
): # circular contigs from canu
840-
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
865+
desc = f"len={length} plasmid_copy_number_long={lr_copy_number}x circular=True"
841866
else:
842-
desc = f"len={length} plasmid_copy_number_long={copy_number}x"
843-
i += 1
844-
record = SeqRecord(dna_record.seq, id=str(i), description=desc)
867+
desc = (
868+
f"len={length} plasmid_copy_number_long={lr_copy_number}x"
869+
)
870+
record = SeqRecord(dna_record.seq, id=contig_id, description=desc)
845871
SeqIO.write(record, dna_fa, "fasta")
846872

847873

4.7 KB
Binary file not shown.
6.5 KB
Binary file not shown.
3.24 MB
Binary file not shown.

tests/test_end_to_end.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,17 +124,6 @@ def test_plassembler_case_3(self):
124124
exec_command(cmd)
125125
remove_directory(outdir)
126126

127-
def test_plassembler_case_3(self):
128-
"""test plassembler run - chromosome and plasmids assembled with Flye"""
129-
longreads: Path = f"{end_to_end}/input_fastq.gz"
130-
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
131-
s2: Path = f"{end_to_end}/input_R2.fastq.gz"
132-
chromosome = 50000
133-
outdir: Path = f"{end_to_end}/test_out"
134-
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f"
135-
exec_command(cmd)
136-
remove_directory(outdir)
137-
138127
def test_plassembler_case_4(self):
139128
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
140129
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
@@ -159,7 +148,7 @@ def test_plassembler_multiple_chromosomes_no_plasmids(self):
159148

160149
# copy number
161150

162-
def test_plassembler_case_depth_filter(self):
151+
def test_plassembler_case_depth_filter_all(self):
163152
"""test plassembler run depth_filter 1.2 - will have no plasmids left"""
164153
longreads: Path = f"{end_to_end}/input_fastq.gz"
165154
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
@@ -170,6 +159,17 @@ def test_plassembler_case_depth_filter(self):
170159
exec_command(cmd)
171160
remove_directory(outdir)
172161

162+
def test_plassembler_case_depth_filter_some(self):
163+
"""test plassembler run depth_filter 0.6 with input_half lr - will have only 1 plasmid"""
164+
longreads: Path = f"{end_to_end}/input_half.fastq.gz"
165+
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
166+
s2: Path = f"{end_to_end}/input_R2.fastq.gz"
167+
chromosome = 50000
168+
outdir: Path = f"{end_to_end}/test_out"
169+
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f --depth_filter 0.6"
170+
exec_command(cmd)
171+
remove_directory(outdir)
172+
173173
def test_plassembler_case_extra_unicycler_spades_opts(self):
174174
"""test plassembler with extra unicycler and spades opts"""
175175
longreads: Path = f"{end_to_end}/input_fastq.gz"
@@ -263,6 +263,24 @@ def test_plassembler_long(self):
263263
exec_command(cmd)
264264
remove_directory(outdir)
265265

266+
def test_plassembler_depth_all(self):
267+
"""test plassembler long depth filter with all plasmids filtered"""
268+
longreads: Path = f"{end_to_end}/input_fastq.gz"
269+
chromosome = 50000
270+
outdir: Path = f"{end_to_end}/test_out"
271+
cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 10 -t 8 -f"
272+
exec_command(cmd)
273+
remove_directory(outdir)
274+
275+
def test_plassembler_depth_some(self):
276+
"""test plassembler long depth filter with some plasmids filtered"""
277+
longreads: Path = f"{end_to_end}/input_depth_filter.fastq.gz"
278+
chromosome = 50000
279+
outdir: Path = f"{end_to_end}/test_out"
280+
cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 2 -t 8 -f"
281+
exec_command(cmd)
282+
remove_directory(outdir)
283+
266284
# def test_plassembler_long_canu(self):
267285
# """test plassembler long canu"""
268286
# longreads: Path = f"{end_to_end}/input_fastq.gz"

0 commit comments

Comments
 (0)