@@ -707,8 +707,12 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
707
707
708
708
# get chroms and plasmids
709
709
710
- combined_chrom_df = combined_depth_mash_df [combined_depth_mash_df ["contig" ].str .contains ("chromosome" )]
711
- combined_plasmid_df = combined_depth_mash_df [~ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )]
710
+ combined_chrom_df = combined_depth_mash_df [
711
+ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )
712
+ ]
713
+ combined_plasmid_df = combined_depth_mash_df [
714
+ ~ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )
715
+ ]
712
716
713
717
# get all plasmid contig ids and then filter
714
718
all_plasmid_contig_ids = combined_plasmid_df ["contig" ].astype (str ).tolist ()
@@ -742,7 +746,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
742
746
if contig_id not in kept_plasmid_contig_ids
743
747
]
744
748
745
-
749
+ # logging
746
750
# needs to be at least 1 filtered out id if the filtering did anything
747
751
logger .info (f"Filtering contigs below depth filter: { depth_filter } ." )
748
752
if "mean_depth_short" in combined_plasmid_df .columns :
@@ -761,22 +765,16 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
761
765
logger .info (
762
766
f"{ len (filtered_out_contig_ids )} plasmids were filtered as they were below the depth filter."
763
767
)
764
-
765
- # Updating 'contig_id' names starting from 1
766
- num_rows = len (combined_plasmid_df )
767
- new_column_names = list (range (1 , num_rows + 1 ))
768
- combined_plasmid_df ['contig' ] = new_column_names
769
- # Reset index after renaming
770
- combined_plasmid_df .reset_index (drop = True , inplace = True )
771
768
else :
772
769
logger .info (f"No plasmids were filtered due to low depth." )
773
770
774
-
775
- # concat back
776
- # there is 1+ plasmid
777
- if len (kept_plasmid_contig_ids ) > 0 :
778
- combined_depth_mash_df = pd .concat ([combined_chrom_df , combined_plasmid_df ], axis = 0 )
779
- else : # only chroms
771
+ # concat dfs back
772
+ # there is 1+ plasmid
773
+ if len (kept_plasmid_contig_ids ) > 0 :
774
+ combined_depth_mash_df = pd .concat (
775
+ [combined_chrom_df , combined_plasmid_df ], axis = 0
776
+ )
777
+ else : # only chroms
780
778
combined_depth_mash_df = combined_chrom_df
781
779
782
780
combined_depth_mash_df .to_csv (
@@ -787,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
787
785
788
786
def finalise_contigs (self , prefix ):
789
787
"""
790
- Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
788
+ Filters contigs plassembler run
791
789
"""
792
790
outdir = self .outdir
793
791
@@ -799,23 +797,42 @@ def finalise_contigs(self, prefix):
799
797
].reset_index (drop = True )
800
798
# get contigs only
801
799
plasmid_fasta = os .path .join (outdir , "unicycler_output" , "assembly.fasta" )
802
- i = 0
803
800
with open (os .path .join (outdir , prefix + "_plasmids.fasta" ), "w" ) as dna_fa :
804
801
for dna_record in SeqIO .parse (plasmid_fasta , "fasta" ):
805
802
split_desc = dna_record .description .split (" " )
803
+ contig_id = str (split_desc [0 ])
806
804
# only keep the contigs that passed the depth threshold
807
- if str (split_desc [0 ]) not in filtered_out_contig_ids :
805
+ if contig_id not in filtered_out_contig_ids :
806
+ sr_copy_number_list = combined_depth_mash_df .loc [
807
+ combined_depth_mash_df ["contig" ] == contig_id ,
808
+ "plasmid_copy_number_short" ,
809
+ ].values
810
+ lr_copy_number_list = combined_depth_mash_df .loc [
811
+ combined_depth_mash_df ["contig" ] == contig_id ,
812
+ "plasmid_copy_number_long" ,
813
+ ].values
814
+ if len (sr_copy_number_list ) > 0 :
815
+ sr_copy_number = sr_copy_number_list [0 ]
816
+ else :
817
+ logger .error ("Plassembler failed" )
818
+
819
+ if len (lr_copy_number_list ) > 0 :
820
+ lr_copy_number = lr_copy_number_list [0 ]
821
+ else :
822
+ logger .error ("Plassembler failed" )
823
+
824
+ # will be ordered so new_contig_count will be the index of the df
825
+ # but 1 index for the output
808
826
if "circular" in dna_record .description : # circular contigs
809
- id_updated = f"{ i } { split_desc [1 ]} plasmid_copy_number_short={ combined_depth_mash_df . plasmid_copy_number_short [ i ] } x plasmid_copy_number_long={ combined_depth_mash_df . plasmid_copy_number_long [ i ] } x circular=true"
827
+ id_updated = f"{ contig_id } { split_desc [1 ]} plasmid_copy_number_short={ sr_copy_number } x plasmid_copy_number_long={ lr_copy_number } x circular=true"
810
828
else : # non circular contigs
811
- id_updated = f"{ i } { split_desc [1 ]} plasmid_copy_number_short={ combined_depth_mash_df .plasmid_copy_number_short [i ]} x plasmid_copy_number_long={ combined_depth_mash_df .plasmid_copy_number_long [i ]} x"
812
- i += 1
829
+ id_updated = f"{ contig_id } { split_desc [1 ]} plasmid_copy_number_short={ sr_copy_number } x plasmid_copy_number_long={ lr_copy_number } x"
813
830
record = SeqRecord (dna_record .seq , id = id_updated , description = "" )
814
831
SeqIO .write (record , dna_fa , "fasta" )
815
832
816
833
def finalise_contigs_long (self , prefix ):
817
834
"""
818
- Renames the contigs of assembly with new ones
835
+ Filters contigs plassembler long
819
836
"""
820
837
outdir = self .outdir
821
838
@@ -827,21 +844,30 @@ def finalise_contigs_long(self, prefix):
827
844
].reset_index (drop = True )
828
845
# get contigs only
829
846
plasmid_fasta = os .path .join (outdir , "plasmids.fasta" )
830
- i = 0
831
847
with open (os .path .join (outdir , prefix + "_plasmids.fasta" ), "w" ) as dna_fa :
832
848
for dna_record in SeqIO .parse (plasmid_fasta , "fasta" ):
833
849
# only keep the contigs that passed the depth threshold
834
- if str (dna_record .id ) not in filtered_out_contig_ids :
850
+ contig_id = str (dna_record .id )
851
+ if contig_id not in filtered_out_contig_ids :
835
852
length = len (dna_record .seq )
836
- copy_number = combined_depth_mash_df .plasmid_copy_number_long [i ]
853
+ lr_copy_number_list = combined_depth_mash_df .loc [
854
+ combined_depth_mash_df ["contig" ] == contig_id ,
855
+ "plasmid_copy_number_long" ,
856
+ ].values
857
+ if len (lr_copy_number_list ) > 0 :
858
+ lr_copy_number = lr_copy_number_list [0 ]
859
+ else :
860
+ logger .error ("Plassembler failed" )
861
+
837
862
if (
838
863
"circular" in dna_record .description
839
864
): # circular contigs from canu
840
- desc = f"len={ length } plasmid_copy_number_long={ copy_number } x circular=True"
865
+ desc = f"len={ length } plasmid_copy_number_long={ lr_copy_number } x circular=True"
841
866
else :
842
- desc = f"len={ length } plasmid_copy_number_long={ copy_number } x"
843
- i += 1
844
- record = SeqRecord (dna_record .seq , id = str (i ), description = desc )
867
+ desc = (
868
+ f"len={ length } plasmid_copy_number_long={ lr_copy_number } x"
869
+ )
870
+ record = SeqRecord (dna_record .seq , id = contig_id , description = desc )
845
871
SeqIO .write (record , dna_fa , "fasta" )
846
872
847
873
0 commit comments