From 42d2922887c5bab9ebd565a0e3830b01eecfb945 Mon Sep 17 00:00:00 2001
From: yh202109 <yh202109@gmail.com>
Date: Wed, 3 Jul 2024 14:06:28 -0400
Subject: [PATCH] v0.2.13

---
 mtbp3/statlab/kappa.py | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/mtbp3/statlab/kappa.py b/mtbp3/statlab/kappa.py
index 7538fabe..3f8634dc 100644
--- a/mtbp3/statlab/kappa.py
+++ b/mtbp3/statlab/kappa.py
@@ -245,7 +245,7 @@ def bootstrap_cohen_ci(self, n_iterations=1000, confidence_level=0.95, outfmt='s
         else:
             return [self.cohen_kappa, n_iterations, confidence_level, lower_bound, upper_bound]
 
-    def create_bubble_plot(self, out_path="", title="", axis_label=[], max_size_ratio=100):
+    def create_bubble_plot(self, out_path="", axis_label=[], max_size_ratio=0, hist=False):
         """
         Creates a bubble plot based on the y_count_sq matrix.
 
@@ -268,7 +268,9 @@ def create_bubble_plot(self, out_path="", title="", axis_label=[], max_size_rati
         if self.n_rater == 2 and self.y_count_sq is not None and self.y_count_sq.shape[0] == self.y_count_sq.shape[1] and self.y_count_sq.shape[0] > 0:
             categories = self.y_count_sq.columns
             n_categories = len(categories)
-            r1 = []
+            max_size_ratio = max_size_ratio if max_size_ratio >= 1 else max(1,int(150 / n_categories))
+
+            r1 = [] 
             r2 = []
             sizes = []
             for i1, c1 in enumerate(categories):
@@ -276,21 +278,30 @@ def create_bubble_plot(self, out_path="", title="", axis_label=[], max_size_rati
                     r1.append(c1)
                     r2.append(c2)
                     sizes.append(self.y_count_sq.iloc[i1, i2])
-            data = pd.DataFrame({'r1': r1, 'r2': r2, 'sizes': sizes})
-            sns.scatterplot(data=data, x="r1", y="r2", size="sizes", sizes=(min(sizes), max(sizes)*max_size_ratio), legend=False)
-            for i in range(len(data)):
-                plt.text(data['r1'][i], data['r2'][i], data['sizes'][i], ha='center', va='center')
+            df0 = pd.DataFrame({'r1': r1, 'r2': r2, 'sizes': sizes})
+            if hist:
+                sns.jointplot(
+                    data=df0, x="r1", y="r2", kind="scatter", 
+                    height=5, ratio=3, marginal_ticks=True,
+                    marginal_kws={"weights": sizes, "shrink":.5}, 
+                    joint_kws={"size": sizes, "legend": False, "sizes":(min(sizes), max(sizes)*max_size_ratio)}
+                    ) 
+                # Bubble sizes and marginal weights are passed as the raw `sizes` list (via joint_kws/marginal_kws), not by column name.
+            else:
+                sns.scatterplot(data=df0, x="r1", y="r2", size="sizes", sizes=(min(sizes), max(sizes)*max_size_ratio), legend=False)
+                tmp1 = plt.xlim()
+                tmp1d = ((tmp1[1] - tmp1[0])/n_categories)
+                plt.xlim(tmp1[0] - tmp1d, tmp1[1] + tmp1d)
+                plt.ylim(tmp1[0] - tmp1d, tmp1[1] + tmp1d)
+
+            for i in range(len(df0)):
+                plt.text(df0['r1'][i], df0['r2'][i], df0['sizes'][i], ha='center', va='center')
+
             if not axis_label:
                 axis_label = ['Rater 1', 'Rater 2']
             plt.xlabel(axis_label[0])
             plt.ylabel(axis_label[1])
-            if not title:
-                title = 'Bubble Plot'
-            plt.title(title)
-            tmp1 = plt.xlim()
-            tmp1d = ((tmp1[1] - tmp1[0])/n_categories)
-            plt.xlim(tmp1[0] - tmp1d, tmp1[1] + tmp1d)
-            plt.ylim(tmp1[0] - tmp1d, tmp1[1] + tmp1d)
+
             plt.tight_layout()
             if out_path:
                 try:
@@ -317,5 +328,5 @@ def create_bubble_plot(self, out_path="", title="", axis_label=[], max_size_rati
     print("Number of rating categories: "+str(kappa.n_category))
     print("Number of sample: "+str(kappa.y_count.shape[0]))
 
-    kappa.create_bubble_plot(out_path='statlab_kappa_fig1.svg')
+    kappa.create_bubble_plot(hist=True)