From 2e605c792ff2d67ae64b27225ab05fc6d32f9cc8 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 15 Jun 2024 23:18:23 +0200 Subject: [PATCH 1/2] Rescale values for KDE to avoid underflow --- src/python/gudhi/clustering/tomato.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index d0e9995c2c..7313f6f4cc 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -66,6 +66,7 @@ def __init__( graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points, 'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'. + The values computed for 'DTM' or 'KDE' are not normalized (this does not affect the clustering). metric (str|Callable): metric used when calculating the distance between instances in a feature array. Defaults to Minkowski of parameter p. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to @@ -223,6 +224,8 @@ def fit(self, X, y=None, weights=None): weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": + # First rescale to avoid computing exp(-1000) + weights -= numpy.max(weights) weights = numpy.exp(weights) # TODO: do it at the C++ level and/or in parallel if this is too slow? 
From d85758780a4dfbc14974575660eb6dc924a3d4c9 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 17 Jun 2024 11:28:41 +0200 Subject: [PATCH 2/2] Test for KDE underflow --- src/python/test/test_tomato.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/python/test/test_tomato.py b/src/python/test/test_tomato.py index c571f79929..2a3e60b3b1 100644 --- a/src/python/test/test_tomato.py +++ b/src/python/test/test_tomato.py @@ -63,3 +63,13 @@ def test_tomato_1(): assert t.diagram_.size == 0 assert t.max_weight_per_cc_.size == 1 t.plot_diagram() + + +def test_tomato_kde_underflow(): + # 1D construction with 2 Gaussians, embedded in high dimension + X = np.zeros((200, 1000)) + X[:100, 0] = np.random.default_rng().normal(-2, 1, 100) + X[100:, 0] = np.random.default_rng().normal(2, 1, 100) + # X[:,0].sort() + t = Tomato(density_type="KDE").fit(X) + assert (t.weights_ != 0).all()