Merge pull request #19 from RECETOX/hechth/issue13

Finalize plots for paper
RECETOX · Nov 14, 2023 · 3d64a92 · 3d64a92
2 parents 48606b6 + b574e52
commit 3d64a92
Show file tree

Hide file tree

Showing 31 changed files with 7,687 additions and 1,180 deletions.
diff --git a/analysis/Python_scripts/boxplot.png b/analysis/Python_scripts/boxplot.png
diff --git a/analysis/Python_scripts/boxplot_Benzenoids.png b/analysis/Python_scripts/boxplot_Benzenoids.png
diff --git a/analysis/Python_scripts/boxplot_Lipids and lipid-like molecules.png b/analysis/Python_scripts/boxplot_Lipids and lipid-like molecules.png
diff --git a/analysis/Python_scripts/boxplot_Organic acids and derivatives.png b/analysis/Python_scripts/boxplot_Organic acids and derivatives.png
diff --git a/analysis/Python_scripts/boxplot_Organic nitrogen compounds.png b/analysis/Python_scripts/boxplot_Organic nitrogen compounds.png
diff --git a/analysis/Python_scripts/boxplot_Organic oxygen compounds.png b/analysis/Python_scripts/boxplot_Organic oxygen compounds.png
diff --git a/analysis/Python_scripts/boxplot_Organohalogen compounds.png b/analysis/Python_scripts/boxplot_Organohalogen compounds.png
diff --git a/analysis/Python_scripts/boxplot_Organoheterocyclic compounds.png b/analysis/Python_scripts/boxplot_Organoheterocyclic compounds.png
diff --git a/analysis/Python_scripts/boxplot_Organophosphorus compounds.png b/analysis/Python_scripts/boxplot_Organophosphorus compounds.png
diff --git a/analysis/Python_scripts/boxplot_Organosulfur compounds.png b/analysis/Python_scripts/boxplot_Organosulfur compounds.png
diff --git a/analysis/Python_scripts/boxplot_Phenylpropanoids and polyketides.png b/analysis/Python_scripts/boxplot_Phenylpropanoids and polyketides.png
diff --git a/analysis/Python_scripts/boxplot_superclass.png b/analysis/Python_scripts/boxplot_superclass.png
diff --git a/analysis/Python_scripts/chemical_composition_boxplot.ipynb b/analysis/Python_scripts/chemical_composition_boxplot.ipynb
diff --git a/analysis/Python_scripts/chemical_composition_boxplot.png b/analysis/Python_scripts/chemical_composition_boxplot.png
diff --git a/analysis/Python_scripts/chemical_composition_boxplot2.png b/analysis/Python_scripts/chemical_composition_boxplot2.png
diff --git a/analysis/Python_scripts/chemical_composition_boxplot2_N.png b/analysis/Python_scripts/chemical_composition_boxplot2_N.png
diff --git a/analysis/Python_scripts/chemical_composition_boxplot_N.png b/analysis/Python_scripts/chemical_composition_boxplot_N.png
diff --git a/analysis/Python_scripts/classes_boxplot.ipynb b/analysis/Python_scripts/classes_boxplot.ipynb
diff --git a/analysis/Python_scripts/deprecated/classes_boxplot.ipynb b/analysis/Python_scripts/deprecated/classes_boxplot.ipynb
diff --git a/analysis/Python_scripts/plotting.py b/analysis/Python_scripts/plotting.py
@@ -1,3 +1,4 @@
+import textwrap
 import pandas as pd
 from matplotlib import pyplot as plt
 import seaborn as sns
@@ -83,56 +84,49 @@ def plot_histogram(x, xaxis_title='', title=''):
     # Display the plot
     fig.show()
 
-def create_plot(df, path):
+def create_plot(df, path, grouping_column, xlabel):
     sns.set_style(style='white')
     plt.figure(figsize=(17, 5))
 
     # Set the color palette
     colors = ['yellow', 'deepskyblue']
     sns.set_palette(sns.color_palette(colors))
 
-    ax = sns.boxplot(x="true_names", y="value", hue="Number", 
+    ax = sns.boxplot(x=grouping_column, y="value", hue="Number", 
                     data=df, hue_order=['CosineHungarian_0.01_0.0_1.0_matches',np.nan],
                     medianprops={'color': 'darkgreen', 'linewidth': 4.0},
                     flierprops={'marker': 'o', 'markersize': 10, 'markerfacecolor': 'none'})  # RUN PLOT   
     ax2 = ax.twinx()
 
-    sns.boxplot(ax=ax2,x='true_names', y='value', hue='Number',
+    sns.boxplot(ax=ax2,x=grouping_column, y='value', hue='Number',
                 data=df, hue_order=[np.nan, 'CosineHungarian_0.01_0.0_1.0_scores'], 
                 medianprops={'color': 'b', 'linewidth': 4.0}, 
                 flierprops={'marker': 'o', 'markersize': 10, 'markerfacecolor': 'none'})
 
     ax.legend_.remove()
-    ax.set_ylim([0, 5])  # Set y-axis limits
-    ax.yaxis.set_major_locator(plt.MultipleLocator(1))  # Set major tick marks
-    ax.set_ylabel('Match values')  # Set y-axis label
-    ax.yaxis.label.set_size(20)  # Set font size of y-axis label
-    ax.set_xlabel('Chemical composition', fontsize=20)  # Set x-axis label and font size
+    # ax.set_ylim([0, 5])  # Set y-axis limits
+    # ax.yaxis.set_major_locator(plt.MultipleLocator(1))  # Set major tick marks
+    ax.set_ylabel('Match values', fontsize=20)  # Set y-axis label
+    if xlabel:
+        ax.set_xlabel(xlabel, fontsize=20)  # Set x-axis label and font size
     ax.tick_params(axis='x', labelsize=13)  # Set font size of x-axis tick labels
-    ax.tick_params(axis='y', labelsize=13)  # Set font size of y-axis tick labels
-    ax.yaxis.labelpad = 10
-    ax.xaxis.labelpad = 10
+    # ax.tick_params(axis='y', labelsize=13)  # Set font size of y-axis tick labels
+    # ax.yaxis.labelpad = 10
 
     # Create a count for each x-axis label
-    count_data = df['true_names'].value_counts().reset_index()
-    count_data.columns = ['true_names', 'count']
-    count_data = count_data.sort_values(by=['true_names'])
-    count_data['count'] = count_data['count'] // 2
+    count_data = df[grouping_column].value_counts()
 
-    # Remove the original x-axis labels
-    ax.set_xticklabels([])
-
-    # Add the count labels to the x-axis
-    ax.set_xticks(np.arange(len(count_data)))
-    ax.set_xticklabels(count_data['true_names'] + ' (' + count_data['count'].astype(str) + ')', rotation=45, ha='right')
+    # Append each group's halved count to its x-axis tick label and wrap long labels
+    xlabels = [label.get_text() for label in ax.get_xticklabels()]
+    xlabels = ['\n'.join(textwrap.wrap(label + ' (' + str((count_data.loc[label] //2 )) + ')', width=25)) for label in xlabels]
+    ax.set_xticklabels(xlabels, rotation=45, ha='right')
 
     ax2.legend_.remove()
-    ax2.set_ylim([0, 1])  # Set y-axis limits
-    ax2.yaxis.set_major_locator(plt.MultipleLocator(0.2))  # Set major tick marks
-    ax2.set_ylabel('Score values')  # Set y-axis label
-    ax2.yaxis.label.set_size(20)  # Set font size of y-axis label
-    ax2.tick_params(axis='y', labelsize=12)  # Set font size of y-axis tick labels
-    ax2.yaxis.labelpad = 10
+    # ax2.set_ylim([0, 1])  # Set y-axis limits
+    # ax2.yaxis.set_major_locator(plt.MultipleLocator(0.2))  # Set major tick marks
+    ax2.set_ylabel('Score values', fontsize=20)  # Set y-axis label
+    ax2.tick_params(axis='y', labelsize=13)  # Set font size of y-axis tick labels
+    # ax2.yaxis.labelpad = 10
 
     # Change the legend labels
     handles, labels = ax.get_legend_handles_labels()
@@ -144,3 +138,27 @@ def create_plot(df, path):
     plt.show()
     plt.clf()
     plt.close()
+
+
+def scatterplot_matplotlib(df):
+    """Draw a bubble scatter plot of score (x) against number of matches (y).
+
+    Marker area is scaled from the 'FractionQuery' column and marker color
+    from the 'FractionReference' column, shown as a percentage on a colorbar
+    fixed to the 0-100 range.
+
+    Args:
+        df: Table providing the columns
+            'CosineHungarian_0.01_0.0_1.0_scores',
+            'CosineHungarian_0.01_0.0_1.0_matches', 'FractionQuery' and
+            'FractionReference'. Presumably a pandas DataFrame whose
+            fraction columns lie in 0-1 -- TODO confirm against callers.
+
+    Returns:
+        The figure created at the top of this function.
+    """
+    fig = plt.figure(figsize=(18, 6))
+    scatter = plt.scatter(
+        df['CosineHungarian_0.01_0.0_1.0_scores'],
+        df['CosineHungarian_0.01_0.0_1.0_matches'],
+        s=df['FractionQuery'] * 200,  # Adjust the size scaling factor as needed
+        c=df['FractionReference'] * 100,
+        cmap='viridis',  # change the colorscale as needed
+        alpha=0.5,
+        vmin=0,
+        vmax=100
+    )
+    plt.colorbar(scatter).set_label('Reference Matched %')
+    plt.xlabel('Score')
+    plt.ylabel('Matches')
+
+    # Add a legend for the size
+    # The empty scatter calls plot no data; they only register legend entries.
+    # s=size * 2 equals the `* 200` scaling above when the legend value is a
+    # percentage and FractionQuery is a 0-1 fraction (assumption, see Args).
+    sizes = [1, 50, 100]
+    for size in sizes:
+        plt.scatter([], [], c='c', alpha=0.5, s=size * 2 , label=str(size))
+    plt.legend(scatterpoints=1, title='Query Matched %', labelspacing=1, loc='upper left')
+    return fig
diff --git a/analysis/Python_scripts/scatter_plot.png b/analysis/Python_scripts/scatter_plot.png
diff --git a/analysis/Python_scripts/scatterplot.ipynb b/analysis/Python_scripts/scatterplot.ipynb
diff --git a/analysis/Python_scripts/scatterplot_Benzenoids.png b/analysis/Python_scripts/scatterplot_Benzenoids.png
diff --git a/analysis/Python_scripts/scatterplot_Lipids and lipid-like molecules.png b/analysis/Python_scripts/scatterplot_Lipids and lipid-like molecules.png
diff --git a/analysis/Python_scripts/scatterplot_Organic acids and derivatives.png b/analysis/Python_scripts/scatterplot_Organic acids and derivatives.png
diff --git a/analysis/Python_scripts/scatterplot_Organohalogen compounds.png b/analysis/Python_scripts/scatterplot_Organohalogen compounds.png
diff --git a/analysis/Python_scripts/scatterplot_Organoheterocyclic compounds.png b/analysis/Python_scripts/scatterplot_Organoheterocyclic compounds.png
diff --git a/analysis/Python_scripts/scatterplot_Phenylpropanoids and polyketides.png b/analysis/Python_scripts/scatterplot_Phenylpropanoids and polyketides.png
diff --git a/analysis/Python_scripts/scatterplot_all.png b/analysis/Python_scripts/scatterplot_all.png
diff --git a/analysis/Python_scripts/scatterplot_azoles.png b/analysis/Python_scripts/scatterplot_azoles.png
diff --git a/analysis/Python_scripts/utils.py b/analysis/Python_scripts/utils.py
@@ -55,7 +55,7 @@ def has_organic_atoms(mol):
     """
     # Return True as soon as any atom's symbol is in the list checked below
     for atom in mol.GetAtoms():
-        if atom.GetSymbol() in ['C', 'O', 'N', 'H']:
+        if atom.GetSymbol() in ['C', 'O', 'H']:
             return True
 
     return False
@@ -86,7 +86,7 @@ def append_classes(df, left_on):
         "S": [has_atom(m, 'S') for m in molecules],
         "P": [has_atom(m, 'P') for m in molecules],
         "Si": [has_atom(m, 'Si') for m in molecules],
-        "C,O,N,H": [has_organic_atoms(m) for m in molecules],
+        "C,O,N,H": [has_organic_atoms(m) or has_atom(m, 'N') for m in molecules],
         "N": [has_atom(m, 'N') for m in molecules],
     })
     merged_df = pd.merge(df, class_names, left_on=left_on, right_on='molname')
@@ -151,23 +151,25 @@ def generate_combinations(df, column_name):
 
     return pd.DataFrame(new_rows).reset_index(drop=True)
 
-def preprocess_data(merged_top5_same):
+def preprocess_data(merged_top5_same, cols_to_keep):
+    """Reshape the matches and scores columns into one long-format table.
+
+    Args:
+        merged_top5_same: DataFrame containing 'query', 'reference', every
+            column named in ``cols_to_keep`` and the two
+            'CosineHungarian_0.01_0.0_1.0_matches' / '..._scores' columns.
+        cols_to_keep: List of additional column names carried through as
+            identifier columns (it is concatenated to a list, so it must be
+            a list itself).
+
+    Returns:
+        DataFrame with the identifier columns, a 'Number' column holding the
+        name of the originating column and a 'value' column holding its
+        value; rows containing any NaN are dropped.
+    """
+    key_cols = ['query', 'reference'] + cols_to_keep
     # Select the identifier columns together with the matches column.
-    df1 = merged_top5_same[['query', 'reference', 'true_names', 'CosineHungarian_0.01_0.0_1.0_matches']].copy()
+    df1 = merged_top5_same[key_cols + ['CosineHungarian_0.01_0.0_1.0_matches']].copy()
 
     # Select the identifier columns together with the scores column.
-    df2 = merged_top5_same[['query', 'reference', 'true_names', 'CosineHungarian_0.01_0.0_1.0_scores']].copy()
+    df2 = merged_top5_same[key_cols + ['CosineHungarian_0.01_0.0_1.0_scores']].copy()
 
     # Concatenate df1 and df2 into a single DataFrame.
     df_cat = pd.concat([df1, df2])
 
-    mdf = pd.melt(df_cat, id_vars=['query', 'reference', 'true_names'], var_name=['Number'])      # MELT
+    mdf = pd.melt(df_cat, id_vars=key_cols, var_name='Number')      # MELT
+    # Each concatenated half lacks the other half's value column, so melting
+    # yields NaN filler rows; dropna removes them (and any row that has a
+    # missing identifier value).
+    mdf = mdf.dropna()
+    return mdf
 
-    #cleaning data
+def clean_chemical_composition_data(mdf):
+    """Split combined 'true_names' entries and drop incomplete rows.
+
+    Passes ``mdf`` through ``split_and_add_rows`` on the 'true_names' column
+    with the separator ', C,O,N,H' (helper defined elsewhere in this module;
+    presumably it expands one row into several -- TODO confirm), replaces
+    empty 'true_names' strings with NaN and drops every row containing NaN.
+
+    Args:
+        mdf: DataFrame with a 'true_names' column.
+
+    Returns:
+        The cleaned DataFrame.
+    """
     mdf = split_and_add_rows(mdf, 'true_names', split_by=', C,O,N,H')
     mdf['true_names'] = mdf['true_names'].replace('', np.nan)
-    mdf = mdf.dropna(subset=['value', 'true_names'])
-
+    mdf = mdf.dropna()
     return mdf
 
 def load_spectra_metadata(file_path, metadata_column_name):