Skip to content

Commit

Permalink
disable EM until further troubleshooting; updating HTML output
Browse files Browse the repository at this point in the history
  • Loading branch information
olivertam committed Nov 25, 2024
1 parent 51bf49c commit 778b86e
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 46 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
*.bak
*~

# Debug
debug_environment.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
36 changes: 21 additions & 15 deletions TEsmall/abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def em_dic_to_df(em_dict, srna_class, file_root):
te_table = pd.concat([te_name_df, df], sort=True, axis=1)
te_table.columns = ['ftype', 'fid', file_root]
te_table.iloc[:, 2] = te_table.iloc[:, 2].round(10)
#te_table.to_csv('/Users/koneill/Desktop/fix_tesmall/df_{0}_{1}.txt'.format(file_root, srna_class), sep=' ', mode='a', index=False)
te_table.to_csv('df_{0}_{1}.txt'.format(file_root, srna_class), sep=' ', mode='a', index=False)
return te_table
else:
empty_df = pd.DataFrame(columns=['ftype', 'fid', file_root]) # in case you get an empty dictionary from the EM, there are no reads in that class
Expand All @@ -45,25 +45,28 @@ def em_dic_to_df(em_dict, srna_class, file_root):

def calc_abundance(em_in):
dfs = [] # holds dfs from each sample to merge for final count table
for anno, cca_anno, em_weight_df, mirna_dic, te_s_dic, te_as_dic in em_in:
# for anno, cca_anno, em_weight_df, mirna_dic, te_s_dic, te_as_dic in em_in:
# for anno, cca_anno, em_weight_df, te_s_dic, te_as_dic in em_in:
for anno, cca_anno in em_in:
logging.info("Calculating feature abundances...")
root = splitext(anno)[0]
# create data frames from annotation files (1/n) and EM weighted dictionaries
df = pd.read_csv(anno, sep="\t", usecols=["rid", "ftype", "fid"]) # get necessary columns
df = df[(df["ftype"] != "anti_TE") & (df["ftype"] != "sense_TE") & (df["ftype"] != "miRNA")] # filter out EM treated categories
df = df[(df["ftype"] != "anti_TE") & (df["ftype"] != "sense_TE")] # filter out EM treated categories
rweight = 1 / df.groupby("rid").fid.nunique() # calculate 1/n weight per read
ftable = df.groupby(["ftype", "fid"]).rid.unique() # group on sRNA id in class
count = ftable.apply(lambda l: round(sum([rweight[s] for s in l]))) # hierarchical df with ftype and fid (fid contains fid and weight tab delim)
temp_dic = {root: count} # makes new key with sample name
count = pd.DataFrame(temp_dic).reset_index() # resets weight column name to sample root name
te_s_table = em_dic_to_df(te_s_dic, 'sense_TE', root) # get matching EM dfs
te_as_table = em_dic_to_df(te_as_dic, 'anti_TE', root)
mir_table = em_dic_to_df(mirna_dic, 'miRNA', root)
# te_s_table = em_dic_to_df(te_s_dic, 'sense_TE', root) # get matching EM dfs
# te_as_table = em_dic_to_df(te_as_dic, 'anti_TE', root)
# mir_table = em_dic_to_df(mirna_dic, 'miRNA', root)

em_count = pd.concat([te_s_table, te_as_table, mir_table], sort=True) # put EM dfs together
#em_count.to_csv('/Users/koneill/Desktop/fix_tesmall/em_merge_df_{0}.txt'.format(root), sep=' ', mode='a')
em_count = em_count[["ftype", "fid", root]] # order columns properly
#em_count.to_csv('/Users/koneill/Desktop/fix_tesmall/em_merge_df_named_{0}.txt'.format(root), sep=' ', mode='a')
# em_count = pd.concat([te_s_table, te_as_table, mir_table], sort=True) # put EM dfs together
# em_count = pd.concat([te_s_table, te_as_table], sort=True) # put EM dfs together
# em_count.to_csv('em_merge_df_{0}.txt'.format(root), sep=' ', mode='a')
# em_count = em_count[["ftype", "fid", root]] # order columns properly
# em_count.to_csv('em_merge_df_named_{0}.txt'.format(root), sep=' ', mode='a')

cca_df = pd.read_csv(cca_anno, sep="\t", usecols=["rid", "ftype", "fid"]) # do everything from before on cca annotation
cca_rweight = 1 / cca_df.groupby("rid").fid.nunique()
Expand All @@ -72,26 +75,29 @@ def calc_abundance(em_in):
temp_cca_dic = {root: cca_count} # fix weight column
cca_count = pd.DataFrame(temp_cca_dic).reset_index()
# put main (1/n) anno counts, EM weighted counts, and (1/n) 3' trf counts together
count_out = pd.concat([count, em_count, cca_count], sort=True, ignore_index=True)
# count_out = pd.concat([count, em_count, cca_count], sort=True, ignore_index=True)
count_out = pd.concat([count, cca_count], sort=True, ignore_index=True)
count_out = count_out.loc[count_out[root] > 0.1, :] # filter species with 0 counts
count_out = count_out[['fid', 'ftype', root]]
count_out = count_out.groupby(['fid','ftype'], as_index = False)[root].sum()
count_out = count_out[['fid', 'ftype', root]]
#count_out.to_csv("test_sample_df_final_{0}.txt".format(root), sep="\t", na_rep=0, float_format="%.0f", index=False)
count_out.to_csv("test_sample_df_final_{0}.txt".format(root), sep="\t", na_rep=0, float_format="%.0f", index=False)
dfs.append(count_out)

# make a bedgraph file too
coor = pd.read_csv(anno, sep="\t", usecols=["rid", "ftype", "fid", 'rchr', 'rstart', 'rend']) # get the right columns from anno file
cca_coor = pd.read_csv(cca_anno, sep="\t", usecols=["rid", "ftype", "fid", 'rchr', 'rstart', 'rend'])
bed = coor.merge(rweight.to_frame(), on='rid') # get their already calculated associated weights
bed.columns = ['rid', 'rchr', 'rstart', 'rend', 'ftype', 'fid', 'weight'] # name columns
bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE") & (bed["ftype"] != "miRNA")] # filter out those that arent TEs or miRNAs since these weights are from EM
em_bed = pd.merge(coor, em_weight_df, left_on=['ftype', 'fid', 'rid'], right_on=['ftype', 'fid', 'rid']) # merge in EM weights
# bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE") & (bed["ftype"] != "miRNA")] # filter out those that arent TEs or miRNAs since these weights are from EM
# bed = bed[(bed["ftype"] != "anti_TE") & (bed["ftype"] != "sense_TE")] # filter out those that arent TEs since these weights are from EM
# em_bed = pd.merge(coor, em_weight_df, left_on=['ftype', 'fid', 'rid'], right_on=['ftype', 'fid', 'rid']) # merge in EM weights
cca_bed = cca_coor.merge(cca_rweight.to_frame(), on='rid') # get 3' trf weights from parallel annotation counting
cca_bed.columns = ['rid', 'rchr', 'rstart', 'rend', 'ftype', 'fid', 'weight'] # rename columns

# merge bed dataframes to one big one
for_bed_graph = pd.concat([bed, em_bed, cca_bed])
# for_bed_graph = pd.concat([bed, em_bed, cca_bed])
for_bed_graph = pd.concat([bed, cca_bed])
for_bed_graph = for_bed_graph[['rchr', 'rstart', 'rend', 'weight']] # pick the columns an actual .bed file uses
for_bed_graph = for_bed_graph.sort_values(by=['rchr', 'rstart']) # sort 'em for your viewer
for_bed_graph = for_bed_graph.groupby(['rchr', 'rstart', 'rend'], as_index=False)['weight'].sum() # sum within bins of read length denoted by annotation start stop
Expand Down
49 changes: 26 additions & 23 deletions TEsmall/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def merge_weight_dict(EM_output, em_index):
return count_dic


def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_read_ref, mir_weights, mir_read_ref, array_count):
def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_read_ref, array_count):
idx = array_count - 1
result_array = np.empty((array_count, 4), dtype='object')
x = 'sense_TE'
Expand All @@ -218,13 +218,13 @@ def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_rea
result_array[idx] = [y, k, i, read_weight]
idx -= 1
# result_array = np.append(result_array, [q], axis=0)
for k, v in list(mir_read_ref.items()):
read_num = len(v)
weight_feature = mir_weights[k]
read_weight = weight_feature / read_num
for i in mir_read_ref[k]:
result_array[idx] = [z, k, i, read_weight]
idx -= 1
# for k, v in list(mir_read_ref.items()):
# read_num = len(v)
# weight_feature = mir_weights[k]
# read_weight = weight_feature / read_num
# for i in mir_read_ref[k]:
# result_array[idx] = [z, k, i, read_weight]
# idx -= 1
# result_array = np.append(result_array, [q], axis=0)
weight_df = pd.DataFrame(result_array, columns=['ftype', 'fid', 'rid', 'weight'])
return weight_df
Expand All @@ -233,19 +233,22 @@ def make_list_for_bed_grph(te_s_weights, te_s_read_ref, te_as_weights, te_as_rea
def handle_annotation_EM(trf_free_multi, annot_dir, order):
anno_fi, te_s_read_list, te_s_index, te_s_len, te_as_read_list, te_as_index, te_as_len, mir_read_list, mir_index, mir_len, te_s_ref_dic, te_as_ref_dic, mir_ref_dic, array_count = annotate_reads(trf_free_multi, annot_dir, order)
sys.stderr.write("Annotation done. Fixing counts for EM\n")
mir_uniquecs, mir_multics, mir_multi_list = getcountsEM(mir_read_list, mir_index)
te_as_uniquecs, te_as_multics, te_as_multi_list = getcountsEM(te_as_read_list, te_as_index)
te_s_uniquecs, te_s_multics, te_s_multi_list = getcountsEM(te_s_read_list, te_s_index)
# mir_uniquecs, mir_multics, mir_multi_list = getcountsEM(mir_read_list, mir_index)
# te_as_uniquecs, te_as_multics, te_as_multi_list = getcountsEM(te_as_read_list, te_as_index)
# te_s_uniquecs, te_s_multics, te_s_multi_list = getcountsEM(te_s_read_list, te_s_index)
# make function to not hardcode average read length to 20
EM_te_as_multi = EMestimate(te_as_len, te_as_multi_list, te_as_index, te_as_uniquecs, te_as_multics, 100, 20)
EM_te_s_multi = EMestimate(te_s_len, te_s_multi_list, te_s_index, te_s_uniquecs, te_s_multics, 100, 20)
EM_mir_multi = EMestimate(mir_len, mir_multi_list, mir_index, mir_uniquecs, mir_multics, 100, 20)
EM_te_as = list(map(operator.add, te_as_uniquecs, EM_te_as_multi))
EM_te_s = list(map(operator.add, te_s_uniquecs, EM_te_s_multi))
EM_mir = list(map(operator.add, mir_uniquecs, EM_mir_multi))
sys.stderr.write("EM done. Matching weights to features\n")
mir_weights = merge_weight_dict(EM_mir, mir_index)
te_s_weights = merge_weight_dict(EM_te_s, te_s_index)
te_as_weights = merge_weight_dict(EM_te_as, te_as_index)
em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, mir_weights, mir_ref_dic, array_count)
return anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights
# EM_te_as_multi = EMestimate(te_as_len, te_as_multi_list, te_as_index, te_as_uniquecs, te_as_multics, 100, 20)
# EM_te_s_multi = EMestimate(te_s_len, te_s_multi_list, te_s_index, te_s_uniquecs, te_s_multics, 100, 20)
# EM_mir_multi = EMestimate(mir_len, mir_multi_list, mir_index, mir_uniquecs, mir_multics, 100, 20)
# EM_te_as = list(map(operator.add, te_as_uniquecs, EM_te_as_multi))
# EM_te_s = list(map(operator.add, te_s_uniquecs, EM_te_s_multi))
# EM_mir = list(map(operator.add, mir_uniquecs, EM_mir_multi))
# sys.stderr.write("EM done. Matching weights to features\n")
# mir_weights = merge_weight_dict(EM_mir, mir_index)
# te_s_weights = merge_weight_dict(EM_te_s, te_s_index)
# te_as_weights = merge_weight_dict(EM_te_as, te_as_index)
# em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, mir_weights, mir_ref_dic, array_count)
# em_weight_df_for_bed = make_list_for_bed_grph(te_s_weights, te_s_ref_dic, te_as_weights, te_as_ref_dic, array_count)
# return anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights
# return anno_fi, em_weight_df_for_bed, te_s_weights, te_as_weights
return anno_fi
16 changes: 10 additions & 6 deletions TEsmall/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,18 @@ def main():
readinfo = get_read_info(multi_bam)
cca_anno, residual_bam = handle_cca(multi_bam, tbtidx, annot_dir)
ccafiles.append(cca_anno)
anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
# anno_fi, em_weight_df_for_bed, mir_weights, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
# anno_fi, em_weight_df_for_bed, te_s_weights, te_as_weights = handle_annotation_EM(residual_bam, annot_dir, args.order)
anno_fi = handle_annotation_EM(residual_bam, annot_dir, args.order)
annofiles.append(anno_fi)
em_weights_df.append(em_weight_df_for_bed)
te_s.append(te_s_weights)
te_as.append(te_as_weights)
miRNA.append(mir_weights)
# em_weights_df.append(em_weight_df_for_bed)
# te_s.append(te_s_weights)
# te_as.append(te_as_weights)
# miRNA.append(mir_weights)

em_outs = list(zip(annofiles, ccafiles, em_weights_df, miRNA, te_s, te_as))
# em_outs = list(zip(annofiles, ccafiles, em_weights_df, miRNA, te_s, te_as))
# em_outs = list(zip(annofiles, ccafiles, em_weights_df, te_s, te_as))
em_outs = list(zip(annofiles, ccafiles))
calc_composition(annofiles, ccafiles)
gen_summary(args.label, args.order, args.maxaln)
calc_abundance(em_outs)
Expand Down
10 changes: 8 additions & 2 deletions TEsmall/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,16 @@ def get_stat(prefix, maxaln):
stat["Proportion"].append(un_reads/rm_reads)

df = pd.read_csv("{0}.anno".format(prefix), sep="\t", usecols=["rid"])
anno_reads = len(df.rid.unique())
stat["Statistics"].append("Aligned reads annotated")
stat["Number of reads"].append(anno_reads)
stat["Proportion"].append(anno_reads/rm_reads)
df = df.append(pd.read_csv("{0}.3trf.TE.mapper.anno".format(prefix), sep="\t", usecols=["rid"]), ignore_index=True)
df = df.append(pd.read_csv("{0}.3trf.struc.mapper.anno".format(prefix), sep="\t", usecols=["rid"]), ignore_index=True)
anno_reads = len(df.rid.unique())
stat["Statistics"].append("Annotated reads of aligned reads")
stat["Statistics"].append("Annotated reads (including 3' tRF)")
stat["Number of reads"].append(anno_reads)
stat["Proportion"].append(anno_reads/up_reads)
stat["Proportion"].append(anno_reads/rm_reads)

return stat

Expand Down

0 comments on commit 778b86e

Please sign in to comment.