From 2e605c792ff2d67ae64b27225ab05fc6d32f9cc8 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 15 Jun 2024 23:18:23 +0200 Subject: [PATCH 1/2] Rescale values for KDE to avoid underflow --- src/python/gudhi/clustering/tomato.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index d0e9995c2c..7313f6f4cc 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -66,6 +66,7 @@ def __init__( graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points, 'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'. + The values computed for 'DTM' or 'KDE' are not normalized (this does not affect the clustering). metric (str|Callable): metric used when calculating the distance between instances in a feature array. Defaults to Minkowski of parameter p. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to @@ -223,6 +224,8 @@ def fit(self, X, y=None, weights=None): weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": + # First rescale to avoid computing exp(-1000) + weights -= numpy.max(weights) weights = numpy.exp(weights) # TODO: do it at the C++ level and/or in parallel if this is too slow? 
From d85758780a4dfbc14974575660eb6dc924a3d4c9 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 17 Jun 2024 11:28:41 +0200 Subject: [PATCH 2/2] Test for KDE underflow --- src/python/test/test_tomato.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/python/test/test_tomato.py b/src/python/test/test_tomato.py index c571f79929..2a3e60b3b1 100644 --- a/src/python/test/test_tomato.py +++ b/src/python/test/test_tomato.py @@ -63,3 +63,13 @@ def test_tomato_1(): assert t.diagram_.size == 0 assert t.max_weight_per_cc_.size == 1 t.plot_diagram() + + +def test_tomato_kde_underflow(): + # 1D construction with 2 Gaussians, embedded in high dimension + X = np.zeros((200, 1000)) + X[:100, 0] = np.random.default_rng().normal(-2, 1, 100) + X[100:, 0] = np.random.default_rng().normal(2, 1, 100) + # X[:,0].sort() + t = Tomato(density_type="KDE").fit(X) + assert (t.weights_ != 0).all()