Skip to content

Commit

Permalink
disable EM until further troubleshooting; updating HTML output
Browse files Browse the repository at this point in the history
  • Loading branch information
olivertam committed Nov 25, 2024
1 parent 51bf49c commit 778b86e
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 46 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
*.bak
*~

# Debug
debug_environment.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
36 changes: 21 additions & 15 deletions TEsmall/abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def em_dic_to_df(em_dict, srna_class, file_root):
te_table = pd.concat([te_name_df, df], sort=True, axis=1)
te_table.columns = ['ftype', 'fid', file_root]
te_table.iloc[:, 2] = te_table.iloc[:, 2].round(10)
#te_table.to_csv('/Users/koneill/Desktop/fix_tesmall/df_{0}_{1}.txt'.format(file_root, srna_class), sep=' ', mode='a', index=False)
te_table.to_csv('df_{0}_{1}.txt'.format(file_root, srna_class), sep=' ', mode='a', index=False)
return te_table
else:
empty_df = pd.DataFrame(columns=['ftype', 'fid', file_root]) # in case you get an empty dictionary from the EM, there are no reads in that class
Expand All @@ -45,25 +45,28 @@ def em_dic_to_df(em_dict, srna_class, file_root):

def calc_abundance(em_in):
dfs = [] # holds dfs from each sample to merge for final count table
for anno, cca_anno, em_weight_df, mirna_dic, te_s_dic, te_as_dic in em_in:
# for anno, cca_anno, em_weight_df, mirna_dic, te_s_dic, te_as_dic in em_in:
# for anno, cca_anno, em_weight_df, te_s_dic, te_as_dic in em_in:
for anno, cca_anno in em_in:
logging.info("Calculating feature abundances...")
root = splitext(anno)[0]
# create data frames from annotation files (1/n) and EM weighted dictionaries
df = pd.read_csv(anno, sep="\t", usecols=["rid", "ftype", "fid"]) # get necessary columns
df = df[(df["ftype"] != "anti_TE") & (df["ftype"] != "sense_TE") & (df["ftype"] != "miRNA")] # filter out EM treated categories
df = df[(df["ftype"] != "anti_TE") & (df["ftype"] != "sense_TE")] # filter out EM treated categories
rweight = 1 / df.groupby("rid").fid.nunique() # calculate 1/n weight per read
ftable = df.groupby(["ftype", "fid"]).rid.unique() # group on sRNA id in class
count = ftable.apply(lambda l: round(sum([rweight[s] for s in l]))) # hierarchical df with ftype and fid (fid contains fid and weight tab delim)
temp_dic = {root: count} # makes new key with sample name
count = pd.DataFrame(temp_dic).reset_index() # resets weight column name to sample root name
te_s_table = em_dic_to_df(te_s_dic, 'sense_TE', root) # get matching EM dfs
te_as_table = em_dic_to_df(te_as_dic, 'anti_TE', root)
mir_table = em_dic_to_df(mirna_dic, 'miRNA', root)
# te_s_table = em_dic_to_df(te_s_dic, 'sense_TE', root) # get matching EM dfs
# te_as_table = em_dic_to_df(te_as_dic, 'anti_TE', root)
# mir_table = em_dic_to_df(mirna_dic, 'miRNA', root)

em_count = pd.concat([te_s_table, te_as_table, mir_table], sort=True) # put EM dfs together
#em_count.to_csv('/Users/koneill/Desktop/fix_tesmall/em_merge_df_{0}.txt'.format(root), sep=' ', mode='a')
em_count = em_count[["ftype", "fid", root]] # order columns properly
#em_count.to_csv('/Users/koneill/Desktop/fix_tesmall/em_merge_df_named_{0}.txt'.format(root), sep=' ', mode='a')
# em_count = pd.concat([te_s_table, te_as_table, mir_table], sort=True) # put EM dfs together
# em_count = pd.concat([te_s_table, te_as_table], sort=True) # put EM dfs together
# em_count.to_csv('em_merge_df_{0}.txt'.format(root), sep=' ', mode='a')
# em_count = em_count[["ftype", "fid", root]] # order columns properly
# em_count.to_csv('em_merge_df_named_{0}.txt'.format(root), sep=' ', mode='a')

cca_df = pd.read_csv(cca_anno, sep="\t", usecols=["rid", "ftype", "fid"]) # do everything from before on cca annotation
cca_rweight = 1 / cca_df.groupby("rid").fid.nunique()
Expand All @@ -72,26 +75,29 @@ def calc_abundance(em_in):
temp_cca_dic = {root: cca_count} # fix weight column
cca_count = pd.DataFrame(temp_cca_dic).reset_index()
# put main (1/n) anno counts, EM weighted counts, and (1/n) 3' trf counts together
count_out = pd.concat([count, em_count, cca_count], sort=True, ignore_index=True)
# count_out = pd.concat([count, em_count, cca_count], sort=True, ignore_index=True)
count_out = pd.concat([count, cca_count], sort=True, ignore_index=True)
count_out = count_out.loc[count_out[root] > 0.1, :] # filter species with 0 counts
count_out = count_out[['fid', 'ftype', root]]
count_out = count_out.groupby(['fid','ftype'], as_index = False)[root].sum()
count_out = count_out[['fid', 'ftype', root]]
#count_out.to_csv("test_sample_df_final_{0}.txt".format(root), sep="\t", na_rep=0, float_format="%.0f", index=False)
count_out.to_csv("test_sample_df_final_{0}.txt".format(root), sep="\t", na_rep=0, float_format="%.0f", index=False)
dfs.append(count_out)

# make a bedgraph file too
coor = pd.read_csv(anno, sep="\t", usecols=["rid", "ftype", "fid", 'rchr', 'rstart', 'rend']) # get the right columns from anno file
cca_coor = pd.read_csv(cca_anno, sep="\t", usecols=["rid", "ftype", "fid", 'rchr', 'rstart', 'rend'])
bed = coor.merge(rweight.to_frame(), on='rid') # get their already calculated associated weights
bed.columns = ['rid', 'rchr', 'rstart', 'rend', 'ftype', 'fid', 'weight'] # name columns
bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE") & (bed["ftype"] != "miRNA")] # filter out those that arent TEs or miRNAs since these weights are from EM
em_bed = pd.merge(coor, em_weight_df, left_on=['ftype', 'fid', 'rid'], right_on=['ftype', 'fid', 'rid']) # merge in EM weights
# bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE") & (bed["ftype"] != "miRNA")] # filter out those that arent TEs or miRNAs since these weights are from EM
# bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE")] # filter out those that arent TEs since these weights are from EM
# em_bed = pd.merge(coor, em_weight_df, left_on=['ftype', 'fid', 'rid'], right_on=['ftype', 'fid', 'rid']) # merge in EM weights
cca_bed = cca_coor.merge(cca_rweight.to_frame(), on='rid') # get 3' trf weights from parallel annotation counting
cca_bed.columns = ['rid', 'rchr', 'rstart', 'rend', 'ftype', 'fid', 'weight'] # rename columns

# merge bed dataframes to one big one
for_bed_graph = pd.concat([bed, em_bed, cca_bed])
# for_bed_graph = pd.concat([bed, em_bed, cca_bed])
for_bed_graph = pd.concat([bed, cca_bed])
for_bed_graph = for_bed_graph[['rchr', 'rstart', 'rend', 'weight']] # pick the columns an actual .bed file uses
for_bed_graph = for_bed_graph.sort_values(by=['rchr', 'rstart']) # sort 'em for your viewer
for_bed_graph = for_bed_graph.groupby(['rchr', 'rstart', 'rend'], as_index=False)['weight'].sum() # sum within bins of read length denoted by annotation start stop
Expand Down
49 changes: 26 additions & 23 deletions TEsmall/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def merge_weight_dict(EM_output, em_index):
return count_dic


def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_read_ref, mir_weights, mir_read_ref, array_count):
def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_read_ref, array_count):
idx = array_count - 1
result_array = np.empty((array_count, 4), dtype='object')
x = 'sense_TE'
Expand All @@ -218,13 +218,13 @@ def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_rea
result_array[idx] = [y, k, i, read_weight]
idx -= 1
# result_array = np.append(result_array, [q], axis=0)
for k, v in list(mir_read_ref.items()):
read_num = len(v)
weight_feature = mir_weights[k]
read_weight = weight_feature / read_num
for i in mir_read_ref[k]:
result_array[idx] = [z, k, i, read_weight]
idx -= 1
# for k, v in list(mir_read_ref.items()):
# read_num = len(v)
# weight_feature = mir_weights[k]
# read_weight = weight_feature / read_num
# for i in mir_read_ref[k]:
# result_array[idx] = [z, k, i, read_weight]
# idx -= 1
# result_array = np.append(result_array, [q], axis=0)
weight_df = pd.DataFrame(result_array, columns=['ftype', 'fid', 'rid', 'weight'])
return weight_df
Expand All @@ -233,19 +233,22 @@ def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_rea
def handle_annotation_EM(trf_free_multi, annot_dir, order):
anno_fi, te_s_read_list, te_s_index, te_s_len, te_as_read_list, te_as_index, te_as_len, mir_read_list, mir_index, mir_len, te_s_ref_dic, te_as_ref_dic, mir_ref_dic, array_count = annotate_reads(trf_free_multi, annot_dir, order)
sys.stderr.write("Annotation done. Fixing counts for EM\n")
mir_uniquecs, mir_multics, mir_multi_list = getcountsEM(mir_read_list, mir_index)
te_as_uniquecs, te_as_multics, te_as_multi_list = getcountsEM(te_as_read_list, te_as_index)
te_s_uniquecs, te_s_multics, te_s_multi_list = getcountsEM(te_s_read_list, te_s_index)
# mir_uniquecs, mir_multics, mir_multi_list = getcountsEM(mir_read_list, mir_index)
# te_as_uniquecs, te_as_multics, te_as_multi_list = getcountsEM(te_as_read_list, te_as_index)
# te_s_uniquecs, te_s_multics, te_s_multi_list = getcountsEM(te_s_read_list, te_s_index)
# make function to not hardcode average read length to 20
EM_te_as_multi = EMestimate(te_as_len, te_as_multi_list, te_as_index, te_as_uniquecs, te_as_multics, 100, 20)
EM_te_s_multi = EMestimate(te_s_len, te_s_multi_list, te_s_index, te_s_uniquecs, te_s_multics, 100, 20)
EM_mir_multi = EMestimate(mir_len, mir_multi_list, mir_index, mir_uniquecs, mir_multics, 100, 20)
EM_te_as = list(map(operator.add, te_as_uniquecs, EM_te_as_multi))
EM_te_s = list(map(operator.add, te_s_uniquecs, EM_te_s_multi))
EM_mir = list(map(operator.add, mir_uniquecs, EM_mir_multi))
sys.stderr.write("EM done. Matching weights to features\n")
mir_weights = merge_weight_dict(EM_mir, mir_index)
te_s_weights = merge_weight_dict(EM_te_s, te_s_index)
te_as_weights = merge_weight_dict(EM_te_as, te_as_index)
em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, mir_weights, mir_ref_dic, array_count)
return anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights
# EM_te_as_multi = EMestimate(te_as_len, te_as_multi_list, te_as_index, te_as_uniquecs, te_as_multics, 100, 20)
# EM_te_s_multi = EMestimate(te_s_len, te_s_multi_list, te_s_index, te_s_uniquecs, te_s_multics, 100, 20)
# EM_mir_multi = EMestimate(mir_len, mir_multi_list, mir_index, mir_uniquecs, mir_multics, 100, 20)
# EM_te_as = list(map(operator.add, te_as_uniquecs, EM_te_as_multi))
# EM_te_s = list(map(operator.add, te_s_uniquecs, EM_te_s_multi))
# EM_mir = list(map(operator.add, mir_uniquecs, EM_mir_multi))
# sys.stderr.write("EM done. Matching weights to features\n")
# mir_weights = merge_weight_dict(EM_mir, mir_index)
# te_s_weights = merge_weight_dict(EM_te_s, te_s_index)
# te_as_weights = merge_weight_dict(EM_te_as, te_as_index)
# em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, mir_weights, mir_ref_dic, array_count)
# em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, array_count)
# return anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights
# return anno_fi, em_weight_df_for_bed, te_s_weights, te_as_weights
return anno_fi
16 changes: 10 additions & 6 deletions TEsmall/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,18 @@ def main():
readinfo = get_read_info(multi_bam)
cca_anno, residual_bam = handle_cca(multi_bam, tbtidx, annot_dir)
ccafiles.append(cca_anno)
anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
# anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
# anno_fi, em_weight_df_for_bed, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
anno_fi = handle_annotation_EM(residual_bam, annot_dir, args.order)
annofiles.append(anno_fi)
em_weights_df.append(em_weight_df_for_bed)
te_s.append(te_s_weights)
te_as.append(te_as_weights)
miRNA.append(mir_weights)
# em_weights_df.append(em_weight_df_for_bed)
# te_s.append(te_s_weights)
# te_as.append(te_as_weights)
# miRNA.append(mir_weights)

em_outs = list(zip(annofiles, ccafiles, em_weights_df, miRNA, te_s, te_as))
# em_outs = list(zip(annofiles, ccafiles, em_weights_df, miRNA, te_s, te_as))
# em_outs = list(zip(annofiles, ccafiles, em_weights_df, te_s, te_as))
em_outs = list(zip(annofiles, ccafiles))
calc_composition(annofiles, ccafiles)
gen_summary(args.label, args.order, args.maxaln)
calc_abundance(em_outs)
Expand Down
10 changes: 8 additions & 2 deletions TEsmall/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,16 @@ def get_stat(prefix, maxaln):
stat["Proportion"].append(un_reads/rm_reads)

df = pd.read_csv("{0}.anno".format(prefix), sep="\t", usecols=["rid"])
anno_reads = len(df.rid.unique())
stat["Statistics"].append("Aligned reads annotated")
stat["Number of reads"].append(anno_reads)
stat["Proportion"].append(anno_reads/rm_reads)
df = df.append(pd.read_csv("{0}.3trf.TE.mapper.anno".format(prefix), sep="\t", usecols=["rid"]), ignore_index=True)
df = df.append(pd.read_csv("{0}.3trf.struc.mapper.anno".format(prefix), sep="\t", usecols=["rid"]), ignore_index=True)
anno_reads = len(df.rid.unique())
stat["Statistics"].append("Annotated reads of aligned reads")
stat["Statistics"].append("Annotated reads (including 3' tRF)")
stat["Number of reads"].append(anno_reads)
stat["Proportion"].append(anno_reads/up_reads)
stat["Proportion"].append(anno_reads/rm_reads)

return stat

Expand Down

0 comments on commit 778b86e

Please sign in to comment.