Merge pull request #1420 from helmholtz-analytics/features/1411-Imple…

…ment_sketched_percentile Implement sketched percentile
helmholtz-analytics · Jul 1, 2024 · 064f495 · 064f495
2 parents e927907 + 896f573
commit 064f495
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 9 deletions.
diff --git a/heat/core/statistics.py b/heat/core/statistics.py
@@ -19,6 +19,8 @@
 from . import stride_tricks
 from . import logical
 from . import constants
+from .random import randint
+from warnings import warn
 
 __all__ = [
     "argmax",
@@ -1016,10 +1018,19 @@ def reduce_means_elementwise(output_shape_i: torch.Tensor) -> DNDarray:
 DNDarray.mean.__doc__ = mean.__doc__
 
 
-def median(x: DNDarray, axis: Optional[int] = None, keepdims: bool = False) -> DNDarray:
+def median(
+    x: DNDarray,
+    axis: Optional[int] = None,
+    keepdims: bool = False,
+    sketched: bool = False,
+    sketch_size: Optional[float] = 1.0 / MPI.COMM_WORLD.size,
+) -> DNDarray:
     """
     Compute the median of the data along the specified axis.
     Returns the median of the ``DNDarray`` elements.
+    Per default, the "true" median of the entire data set is computed; however, the argument
+    `sketched` allows to switch to a faster but less accurate version that computes
+    the median only on behalf of a random subset of the data set ("sketch").
 
     Parameters
     ----------
@@ -1032,14 +1043,25 @@ def median(x: DNDarray, axis: Optional[int] = None, keepdims: bool = False) -> D
     keepdims : bool, optional
         If True, the axes which are reduced are left in the result as dimensions with size one.
         With this option, the result can broadcast correctly against the original array ``a``.
+
+    sketched : bool, optional
+        If True, the median is computed on a random subset of the data set ("sketch").
+        This is faster but less accurate.  Default is False. The size of the sketch is controlled by the argument `sketch_size`.
+    sketch_size : float, optional
+        The size of the sketch as a fraction of the data set size. Default is `1./n_proc`  where `n_proc` is the number of MPI processes, e.g. `n_proc =  MPI.COMM_WORLD.size`. Must be in the range (0, 1).
+        Ignored for sketched = False.
     """
-    return percentile(x, q=50, axis=axis, keepdims=keepdims)
+    return percentile(
+        x, q=50, axis=axis, keepdims=keepdims, sketched=sketched, sketch_size=sketch_size
+    )
 
 
-DNDarray.median: Callable[[DNDarray, int, bool], DNDarray] = (
-    lambda x, axis=None, keepdims=False: median(x, axis, keepdims)
+DNDarray.median: Callable[[DNDarray, int, bool, bool, float], DNDarray] = (
+    lambda x, axis=None, keepdims=False, sketched=False, sketch_size=1.0 / MPI.COMM_WORLD.size: median(
+        x, axis, keepdims, sketched=sketched, sketch_size=sketch_size
+    )
 )
-DNDarray.mean.__doc__ = mean.__doc__
+DNDarray.median.__doc__ = median.__doc__
 
 
 def __merge_moments(
@@ -1412,10 +1434,15 @@ def percentile(
     out: Optional[DNDarray] = None,
     interpolation: str = "linear",
     keepdims: bool = False,
+    sketched: bool = False,
+    sketch_size: Optional[float] = 1.0 / MPI.COMM_WORLD.size,
 ) -> DNDarray:
     r"""
     Compute the q-th percentile of the data along the specified axis.
     Returns the q-th percentile(s) of the tensor elements.
+    Per default, the "true" percentile(s) of the entire data set are computed; however, the argument
+    `sketched` allows to switch to a faster but inaccurate version that computes
+    the percentile only on behalf of a random subset of the data set ("sketch").
 
     Parameters
     ----------
@@ -1447,6 +1474,14 @@ def percentile(
     keepdims : bool, optional
         If True, the axes which are reduced are left in the result as dimensions with size one.
         With this option, the result can broadcast correctly against the original array x.
+
+    sketched : bool, optional
+        If False (default), the entire data is used and no sketching is performed.
+        If True, a fraction of the data to use for estimating the percentile. The fraction is determined by `sketch_size`.
+    sketch_size : float, optional
+        The fraction of the data to use for estimating the percentile; needs to be strictly between 0 and 1.
+        The default is 1/size of the MPI communicator, i.e., roughly the portion of the data that is anyway processed on a single process.
+        Ignored for sketched = False.
     """
 
     def _local_percentile(data: torch.Tensor, axis: int, indices: torch.Tensor) -> torch.Tensor:
@@ -1490,13 +1525,65 @@ def _local_percentile(data: torch.Tensor, axis: int, indices: torch.Tensor) -> t
 
         return percentile
 
+    def _create_sketch(
+        a: DNDarray,
+        axis: Union[int, None],
+        sketch_size_relative: Optional[float] = None,
+        sketch_size_absolute: Optional[int] = None,
+    ) -> DNDarray:
+        """
+        Create a sketch of a DNDarray along a specified axis. The sketch is created by sampling the DNDarray along the specified axis.
+
+        Parameters
+        ----------
+        a : DNDarray
+            The DNDarray for which to create a sketch.
+        axis : int
+            The axis along which to create the sketch.
+        sketch_size_relative : optional, float
+            The size of the sketch. Fraction of samples to take, hence between 0 and 1.
+        sketch_size_absolute : optional, int
+            The size of the sketch. Number of samples to take, hence must not exceed the size of the axis along which the sketch is taken.
+        """
+        if (sketch_size_relative is None and sketch_size_absolute is None) or (
+            sketch_size_relative is not None and sketch_size_absolute is not None
+        ):
+            raise ValueError(
+                "Exactly one of sketch_size_relative and sketch_size_absolute must be specified."
+            )
+        if sketch_size_absolute is None:
+            sketch_size = int(sketch_size_relative * a.shape[axis])
+        else:
+            sketch_size = sketch_size_absolute
+
+        # create a random sample of indices
+        indices = manipulations.sort(
+            randint(0, a.shape[axis], sketch_size, device=a.device, dtype=types.int64)
+        )[0]
+        sketch = a.swapaxes(0, axis)
+        sketch = a[indices, ...].resplit_(None)
+        return sketch.swapaxes(0, axis)
+
     # SANITATION
     # sanitize input
     if not isinstance(x, DNDarray):
         raise TypeError(f"expected x to be a DNDarray, but was {type(x)}")
     if isinstance(axis, (list, tuple)):
         raise NotImplementedError("ht.percentile(), tuple axis not implemented yet")
 
+    if sketched:
+        if (
+            not isinstance(sketch_size, float)
+            or sketch_size <= 0
+            or (MPI.COMM_WORLD.size > 1 and sketch_size == 1)
+            or sketch_size > 1
+        ):
+            raise ValueError(
+                f"If sketched=True, sketch_size must be float strictly between 0 and 1, but is {sketch_size}."
+            )
+        else:
+            x = _create_sketch(x, axis, sketch_size_relative=sketch_size)
+
     if axis is None:
         if x.ndim > 1:
             x = x.flatten()

diff --git a/heat/core/tests/test_statistics.py b/heat/core/tests/test_statistics.py
@@ -1244,6 +1244,26 @@ def test_percentile(self):
         with self.assertRaises(ValueError):
             ht.percentile(x_ht, q, out=out_wrong_split)
 
+    def test_percentile_sketched(self):
+        axis, q = 0, 50
+        use_sketch_of_size = 0.1
+        q = 50
+        # check if it works
+        for split in [None, 1, 0]:
+            X = ht.random.rand(10 * ht.MPI_WORLD.size, 2 * ht.MPI_WORLD.size, split=split)
+            p = ht.percentile(X, q, axis=axis, sketched=True, sketch_size=use_sketch_of_size)
+            self.assertTrue(p.shape == (2 * ht.MPI_WORLD.size,))
+        # default sketch size
+        for split in [None, 1, 0]:
+            X = ht.random.rand(10 * ht.MPI_WORLD.size, 2 * ht.MPI_WORLD.size, split=split)
+            p = ht.percentile(X, q, axis=axis, sketched=True)
+            self.assertTrue(p.shape == (2 * ht.MPI_WORLD.size,))
+        # check if it raises correct errors
+        with self.assertRaises(ValueError):
+            ht.percentile(X, q, axis=axis, sketched=True, sketch_size=1.1)
+        with self.assertRaises(ValueError):
+            ht.percentile(X, q, axis=axis, sketched=True, sketch_size=10)
+
     def test_skew(self):
         x = ht.zeros((2, 3, 4))
         with self.assertRaises(ValueError):

diff --git a/heat/preprocessing/preprocessing.py b/heat/preprocessing/preprocessing.py
@@ -448,6 +448,10 @@ class RobustScaler(ht.TransformMixin, ht.BaseEstimator):
     the quantile range (defaults to IQR: Interquartile Range); this routine is similar
     to ``sklearn.preprocessing.RobustScaler``.
 
+    Per default, the "true" median and IQR of the entire data set is computed; however, the argument
+    `sketched` allows to switch to a faster but inaccurate version that computes
+    median and IQR only on behalf of a random subset of the data set ("sketch") of size `sketch_size`.
+
     The underyling data set to be scaled must be stored as a 2D-`DNDarray` of shape (n_datapoints, n_features).
     Each feature is centered and scaled independently.
 
@@ -470,6 +474,14 @@ class RobustScaler(ht.TransformMixin, ht.BaseEstimator):
     unit_variance : not yet supported.
         raises ``NotImplementedError``
 
+    sketched : bool, default=False
+        If `True`, use a sketch of the data set to compute the median and IQR.
+        This is faster but less accurate. The size of the sketch is determined by the argument `sketch_size`.
+
+    sketch_size : float, default=1./ht.MPI_WORLD.size
+        Fraction of the data set to be used for the sketch if `sketched=True`. The default value is 1/N, where N is the number of MPI processes.
+        Ignored if `sketched=False`.
+
     Attributes
     ----------
     center_ : DNDarray of shape (n_features,)
@@ -490,11 +502,15 @@ def __init__(
         quantile_range: Tuple[float, float] = (25.0, 75.0),
         copy: bool = True,
         unit_variance: bool = False,
+        sketched: bool = False,
+        sketch_size: Optional[float] = 1.0 / ht.MPI_WORLD.size,
     ):
         self.with_centering = with_centering
         self.with_scaling = with_scaling
         self.quantile_range = quantile_range
         self.copy = copy
+        self.sketched = sketched
+        self.sketch_size = sketch_size
         if not with_centering and not with_scaling:
             raise ValueError(
                 "Both centering and scaling are disabled, thus RobustScaler could do nothing. At least one of with_scaling or with_centering must be True."
@@ -525,10 +541,22 @@ def fit(self, X: ht.DNDarray) -> Self:
         """
         _is_2D_float_DNDarray(X)
         if self.with_centering:
-            self.center_ = ht.median(X, axis=0)
+            self.center_ = ht.median(
+                X, axis=0, sketched=self.sketched, sketch_size=self.sketch_size
+            )
         if self.with_scaling:
-            self.iqr_ = ht.percentile(X, self.quantile_range[1], axis=0) - ht.percentile(
-                X, self.quantile_range[0], axis=0
+            self.iqr_ = ht.percentile(
+                X,
+                self.quantile_range[1],
+                axis=0,
+                sketched=self.sketched,
+                sketch_size=self.sketch_size,
+            ) - ht.percentile(
+                X,
+                self.quantile_range[0],
+                axis=0,
+                sketched=self.sketched,
+                sketch_size=self.sketch_size,
             )
 
             # if length of iqr is close to zero, do not scale this feature

diff --git a/heat/preprocessing/tests/test_preprocessing.py b/heat/preprocessing/tests/test_preprocessing.py
@@ -238,7 +238,7 @@ def test_MaxAbsScaler(self):
 
 class TestRobustScaler(TestCase):
     def test_RobustScaler(self):
-        for split in [0]:
+        for split in [0, 1]:
             for with_centering in [False, True]:
                 for with_scaling in [False, True]:
                     if not with_centering and not with_scaling:
@@ -348,3 +348,26 @@ def test_RobustScaler(self):
             scaler.fit(ht.zeros((10, 10, 10), dtype=ht.float32))
         with self.assertRaises(TypeError):
             scaler.fit(ht.zeros(10, 10, dtype=ht.int32))
+
+    def test_robust_scaler_sketched(self):
+        for split in [0, 1]:
+            with_centering = True
+            with_scaling = True
+            copy = True
+            X = _generate_test_data_set(
+                MPI.COMM_WORLD.Get_size() * 10,
+                MPI.COMM_WORLD.Get_size() * 4,
+                split=split,
+                dtype=ht.float32,
+            )
+            scaler = ht.preprocessing.RobustScaler(
+                quantile_range=(24.0, 76.0),
+                copy=copy,
+                with_centering=with_centering,
+                with_scaling=with_scaling,
+                sketched=True,
+            )
+            scaler.fit(X)
+            Y = scaler.transform(X)
+            Y = scaler.inverse_transform(Y)
+            self.assertTrue(ht.allclose(X, Y, atol=atol_inv))