diff --git a/tcga_analysis.py b/tcga_analysis.py index c81c3e5..623bace 100644 --- a/tcga_analysis.py +++ b/tcga_analysis.py @@ -51,7 +51,8 @@ def load_TCGA(tcga_maf= "../data/tcga/mc3/mc3.v0.2.8.PUBLIC.maf", tcga_code_tabl def plot_tcga_heat_map(prots_df=None, tcga_df=None, merged_df=None, top=10, title_prot_subset="all proteins", title_postfix='', axis=None, show=True, tf_idf_score=False): - """Returns merged dataframe of prots and tcga maf file on "gene" column""" + """ THIS IS THE OLD HEAT MAP, USE THE OTHER FUNCTION (`normalized_heatmap`). + Returns merged dataframe of prots and tcga maf file on "gene" column""" if isinstance(prots_df, str): prots_df = add_gene_name(pd.read_csv(prots_df)) @@ -103,72 +104,10 @@ def plot_tcga_heat_map(prots_df=None, tcga_df=None, merged_df=None, top=10, titl return merged_df -def plot_combined_heatmap(df_tcga=None): - if not df_tcga: - df_tcga = load_TCGA() - df_tcga['case'] = df_tcga['Tumor_Sample_Barcode'].str[:12] - # df_tcga_uni['uniprot'] = df_tcga_uni['SWISSPROT'].str.split('_').str[0] - # df_tcga_uni['uniprot2'] = df_tcga_uni['TREMBL'].str.split(',').str[0].str.split('_').str[0] - - cases = [True,False] - test_df = pd.read_csv('../downloads/test_prots_gene_names.csv').rename({'gene_name':'gene'}, axis=1) - csvs = { - 'all proteins': "../downloads/all_prots.csv", - 'test proteins with BindingDB': test_df, - 'test proteins': test_df[test_df.db != 'BindingDB'], - } - - _, axes = plt.subplots(len(csvs),len(cases), figsize=(12*len(cases),8*len(csvs))) - - for i, drop_duplicates in enumerate(cases): - df_tcga_uni = df_tcga.drop_duplicates(subset='Tumor_Sample_Barcode') if drop_duplicates else df_tcga - - for j, k in enumerate(csvs.keys()): - merged_df = plot_tcga_heat_map(csvs[k], df_tcga_uni, merged_df=None, - top=20, - title_prot_subset=k, - title_postfix=' (unique cases)' if drop_duplicates else '', - axis=axes[j][i], show=False) - - plt.tight_layout() - -def box_plot(df_tcga, normalize=False, limit=20): - groups = df_tcga.groupby(['Study Abbreviation', 'Tumor_Sample_Barcode']).size().reset_index(name='counts') - # Group by 'Study Abbreviation' to get the size of each group - group_sizes = groups.groupby('Study Abbreviation').groups - - # Convert the dictionary to a DataFrame suitable for seaborn - df = pd.DataFrame([(k, v) for k, lst in group_sizes.items() for v in lst], columns=['Study Abbreviation', 'Group Size']) - df.sort_values(by='Group Size', ascending=False, inplace=True) - - # limiting to top x cancers: - cancer_size = df.groupby('Study Abbreviation').sum().sort_values(by="Group Size", ascending=False).iloc[:limit] - cancer_size.rename({"Group Size": "Total"}, axis=1, inplace=True) - top_x_cancers = cancer_size.index - - df = df[df['Study Abbreviation'].isin(top_x_cancers)] - - if normalize: - # Merge the original dataframe with the aggregated counts dataframe - df = df.merge(cancer_size, on='Study Abbreviation') - - # Normalize the 'Group Size' by dividing by the 'Aggregated Count' - df['Normalized Group Size'] = df['Group Size'] / df['Total'] - - - # Plotting the boxplot with seaborn - plt.figure(figsize=(12, 8)) - sns.boxplot(x='Study Abbreviation', y=f'{"Normalized " if normalize else ""}Group Size', - data=df) - plt.xlabel('Cancer type') - plt.ylabel(f'Mutation counts{" (normalized by cancer size)" if normalize else ""}') - plt.title(f'Boxplot of {"normalized " if normalize else ""}patient mutation count by cancer type') - plt.xticks(rotation=45) - plt.show() - def normalized_heatmap(merged_df, top=100, tfidf=False, normalize=True, normalize_by='avg', biomart_csv='/cluster/home/t122995uhn/projects/downloads/mart_export.tsv', variant=None, axis=None, merged_df_name='ALL TCGA'): + """Normalize_by can take on the following values: [avg, max, None], if "None" then we normalize by the gene counts""" df_gene_len = pd.read_csv(biomart_csv, sep='\t') df_gene_len.dropna(inplace=True) @@ -250,11 +189,74 @@ def normalized_heatmap(merged_df, top=100, tfidf=False, normalize=True, normaliz axis.set_xlabel('Gene Name') plt.colorbar(heatmap) +def plot_combined_heatmap(df_tcga=None): + """Plotting heatmaps filtered for different test sets""" + if not df_tcga: + df_tcga = load_TCGA() + df_tcga['case'] = df_tcga['Tumor_Sample_Barcode'].str[:12] + # df_tcga_uni['uniprot'] = df_tcga_uni['SWISSPROT'].str.split('_').str[0] + # df_tcga_uni['uniprot2'] = df_tcga_uni['TREMBL'].str.split(',').str[0].str.split('_').str[0] + + cases = [True,False] + test_df = pd.read_csv('../downloads/test_prots_gene_names.csv').rename({'gene_name':'gene'}, axis=1) + csvs = { + 'all proteins': "../downloads/all_prots.csv", + 'test proteins with BindingDB': test_df, + 'test proteins': test_df[test_df.db != 'BindingDB'], + } + + _, axes = plt.subplots(len(csvs),len(cases), figsize=(12*len(cases),8*len(csvs))) + + for i, drop_duplicates in enumerate(cases): + df_tcga_uni = df_tcga.drop_duplicates(subset='Tumor_Sample_Barcode') if drop_duplicates else df_tcga + + for j, k in enumerate(csvs.keys()): + merged_df = plot_tcga_heat_map(csvs[k], df_tcga_uni, merged_df=None, + top=20, + title_prot_subset=k, + title_postfix=' (unique cases)' if drop_duplicates else '', + axis=axes[j][i], show=False) + + plt.tight_layout() + +def box_plot(df_tcga, normalize=False, limit=20): + groups = df_tcga.groupby(['Study Abbreviation', 'Tumor_Sample_Barcode']).size().reset_index(name='counts') + # Group by 'Study Abbreviation' to get the size of each group + group_sizes = groups.groupby('Study Abbreviation').groups + + # Convert the dictionary to a DataFrame suitable for seaborn + df = pd.DataFrame([(k, v) for k, lst in group_sizes.items() for v in lst], columns=['Study Abbreviation', 'Group Size']) + df.sort_values(by='Group Size', ascending=False, inplace=True) + + # limiting to top x cancers: + cancer_size = df.groupby('Study Abbreviation').sum().sort_values(by="Group Size", ascending=False).iloc[:limit] + cancer_size.rename({"Group Size": "Total"}, axis=1, inplace=True) + top_x_cancers = cancer_size.index + + df = df[df['Study Abbreviation'].isin(top_x_cancers)] + + if normalize: + # Merge the original dataframe with the aggregated counts dataframe + df = df.merge(cancer_size, on='Study Abbreviation') + + # Normalize the 'Group Size' by dividing by the 'Aggregated Count' + df['Normalized Group Size'] = df['Group Size'] / df['Total'] + + + # Plotting the boxplot with seaborn + plt.figure(figsize=(12, 8)) + sns.boxplot(x='Study Abbreviation', y=f'{"Normalized " if normalize else ""}Group Size', + data=df) + plt.xlabel('Cancer type') + plt.ylabel(f'Mutation counts{" (normalized by cancer size)" if normalize else ""}') + plt.title(f'Boxplot of {"normalized " if normalize else ""}patient mutation count by cancer type') + plt.xticks(rotation=45) + plt.show() + # %% df_tcga = load_TCGA() - # %% variants = [None, 'Missense_Mutation', 'Silent', "3'UTR", "Nonsense_Mutation"] test_df = pd.read_csv('../downloads/test_prots_gene_names.csv').rename({'gene_name':'gene'}, axis=1) @@ -274,6 +276,4 @@ def normalized_heatmap(merged_df, top=100, tfidf=False, normalize=True, normaliz for i, v in enumerate(variants): normalized_heatmap(merged_df, normalize_by='avg', variant=v, axis=axes[i][j], merged_df_name=k) -plt.tight_layout() - -# %% +plt.tight_layout() \ No newline at end of file