Tomato: avoid underflow when density_type is 'KDE'.

The log-densities returned by KernelDensity.score_samples are shifted by
their maximum before being exponentiated, so the resulting weights no longer
all collapse to 0 for high-dimensional input. The docstring notes that the
'DTM' and 'KDE' values are not normalized, and a regression test is added.

diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
index 9b296865a5..a14eaf63bc 100644
--- a/src/python/gudhi/clustering/tomato.py
+++ b/src/python/gudhi/clustering/tomato.py
@@ -66,6 +66,7 @@ def __init__(
             graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'.
             density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points,
                 'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'.
+                The values computed for 'DTM' or 'KDE' are not normalized (this does not affect the clustering).
             metric (str|Callable): metric used when calculating the distance between instances in a feature array.
                 Defaults to Minkowski of parameter p.
             kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to
@@ -224,6 +225,8 @@ def fit(self, X, y=None, weights=None):
 
             weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_)
             if self.density_type_ == "KDE":
+                # First rescale to avoid computing exp(-1000)
+                weights -= numpy.max(weights)
                 weights = numpy.exp(weights)
 
         # TODO: do it at the C++ level and/or in parallel if this is too slow?
diff --git a/src/python/test/test_tomato.py b/src/python/test/test_tomato.py
index c571f79929..2a3e60b3b1 100644
--- a/src/python/test/test_tomato.py
+++ b/src/python/test/test_tomato.py
@@ -63,3 +63,13 @@ def test_tomato_1():
     assert t.diagram_.size == 0
     assert t.max_weight_per_cc_.size == 1
     t.plot_diagram()
+
+
+def test_tomato_kde_underflow():
+    # 1D construction with 2 Gaussians, embedded in high dimension
+    rng = np.random.default_rng(0)  # fixed seed keeps the test reproducible
+    X = np.zeros((200, 1000))
+    X[:100, 0] = rng.normal(-2, 1, 100)
+    X[100:, 0] = rng.normal(2, 1, 100)
+    t = Tomato(density_type="KDE").fit(X)
+    assert (t.weights_ != 0).all()