adap · jafermarq · May 6, 2024 · Apr 17, 2024 · Apr 17, 2024 · Apr 19, 2024
@@ -15,9 +15,12 @@
 """Natural id partitioner class that works with Hugging Face Datasets."""
 
 
-from typing import Dict
+from typing import Dict, Union
+
+import numpy as np
 
 import datasets
+from flwr_datasets.common.typing import NDArrayInt
 from flwr_datasets.partitioner.partitioner import Partitioner
 
 
@@ -30,6 +33,8 @@ def __init__(
     ):
         super().__init__()
         self._partition_id_to_natural_id: Dict[int, str] = {}
+        self._natural_id_to_partition_id: Dict[str, int] = {}
+        self._partition_id_to_indices: Dict[int, NDArrayInt] = {}
         self._partition_by = partition_by
 
     def _create_int_partition_id_to_natural_id(self) -> None:
@@ -42,6 +47,61 @@ def _create_int_partition_id_to_natural_id(self) -> None:
             zip(range(len(unique_natural_ids)), unique_natural_ids)
         )
 
+    def _create_natural_id_to_int_partition_id(self) -> None:
+        """Build the reverse lookup from natural id to integer partition id.
+
+        Natural ids are the values of the column named by `partition_by`. The result
+        is stored in `self._natural_id_to_partition_id` and is the inverse of
+        `self._partition_id_to_natural_id`, which must already be populated when
+        this method is called.
+        """
+        forward = self._partition_id_to_natural_id
+        # Swap keys and values: natural id -> partition id.
+        self._natural_id_to_partition_id = dict(zip(forward.values(), forward.keys()))
+
+    def _create_partition_id_to_indices(self) -> None:
+        """Group the dataset row indices by partition id.
+
+        Fills `self._partition_id_to_indices` so that every partition id maps to the
+        array of row indices whose `partition_by` value is the natural id of that
+        partition. This method assumes that `self._natural_id_to_partition_id`
+        already exists.
+
+        `None` is handled as a natural id of its own. Because `np.unique` needs
+        values that can be compared with each other, `None` is temporarily replaced
+        by a placeholder that does not occur in the column and is mapped back to
+        `None` when the partitions are assigned.
+
+        Raises
+        ------
+        ValueError
+            If the column contains `None` and its feature dtype is neither "string"
+            nor an integer type.
+        """
+        natural_ids = np.array(self.dataset[self._partition_by])
+        unique_natural_ids = self.dataset.unique(self._partition_by)
+
+        none_present = None in unique_natural_ids
+        if none_present:
+            dtype = self.dataset.features[self._partition_by].dtype
+            # Values that are really present; `max` must never see `None`, because
+            # comparing `None` with an int raises TypeError.
+            real_ids = [value for value in unique_natural_ids if value is not None]
+            none_replacement: Union[int, str]
+            if dtype == "string":
+                none_replacement = "None"
+                # Ensure the replacement is not in the dataset
+                while none_replacement in unique_natural_ids:
+                    none_replacement += "1"
+            elif "uint" in dtype:
+                # Must be tested before "int", which also matches unsigned dtypes.
+                # A column holding only `None` yields 0.
+                none_replacement = max(real_ids, default=-1) + 1
+            elif "int" in dtype:
+                none_replacement = -1
+                if none_replacement in unique_natural_ids:
+                    none_replacement = max(real_ids) + 1
+            else:
+                raise ValueError(
+                    "The type of values in the `partition_by` column needs "
+                    "to be int or string"
+                )
+
+            # Replace the None by the none_replacement (in order to be able to use the
+            # np.unique(value, return_inverse) that requires no None and same val types
+            mask = np.array([value is None for value in natural_ids], dtype=bool)
+            natural_ids[mask] = none_replacement
+
+        unique_values, inverse = np.unique(natural_ids, return_inverse=True)
+
+        for i, natural_id in enumerate(unique_values):
+            if none_present and natural_id == none_replacement:
+                # Use the natural_id that is present in the dataset (not replacement)
+                natural_id = None
+            partition_id = self._natural_id_to_partition_id[natural_id]
+            # `inverse == i` marks the rows that carry the i-th unique value.
+            self._partition_id_to_indices[partition_id] = np.where(inverse == i)[0]
+
     def load_partition(self, partition_id: int) -> datasets.Dataset:
         """Load a single partition corresponding to a single `partition_id`.
 
@@ -59,18 +119,22 @@ def load_partition(self, partition_id: int) -> datasets.Dataset:
             single dataset partition
         """
         if len(self._partition_id_to_natural_id) == 0:
+            self._check_supported_type_of_value_in_partition_by()
             self._create_int_partition_id_to_natural_id()
+            self._create_natural_id_to_int_partition_id()
 
-        return self.dataset.filter(
-            lambda row: row[self._partition_by]
-            == self._partition_id_to_natural_id[partition_id]
-        )
+        if len(self._partition_id_to_indices) == 0:
+            self._create_partition_id_to_indices()
+
+        return self.dataset.select(self._partition_id_to_indices[partition_id])
 
     @property
     def num_partitions(self) -> int:
         """Total number of partitions."""
+        # The id mappings are created on first use; an empty mapping is treated as
+        # "not created yet".
         if len(self._partition_id_to_natural_id) == 0:
+            # Validate the `partition_by` column (raises ValueError for unsupported
+            # value types) before any mapping is built.
+            self._check_supported_type_of_value_in_partition_by()
             self._create_int_partition_id_to_natural_id()
+            # Build the inverse mapping (natural id -> partition id) as well.
+            self._create_natural_id_to_int_partition_id()
         return len(self._partition_id_to_natural_id)
 
     @property
@@ -87,3 +151,17 @@ def partition_id_to_natural_id(self, value: Dict[int, str]) -> None:
         raise AttributeError(
             "Setting the partition_id_to_natural_id dictionary is not allowed."
         )
+
+    def _check_supported_type_of_value_in_partition_by(self) -> None:
+        """Validate the value type of the `partition_by` column.
+
+        The column is converted to a NumPy array and its dtype is inspected. Integer,
+        string and object dtypes are accepted (a column that contains `None` is
+        converted to an object array).
+
+        Raises
+        ------
+        ValueError
+            If the dtype of the converted column is not an integer, string or object
+            dtype.
+        """
+        column_dtype = np.array(self.dataset[self._partition_by]).dtype
+        accepted_kinds = (np.object_, np.integer, np.str_)
+        if any(np.issubdtype(column_dtype, kind) for kind in accepted_kinds):
+            return
+        dtype = column_dtype
+        raise ValueError(
+            f"The specified column in {self._partition_by} is of type {dtype} "
+            f"however only ints (with None) and strings (with None) are acceptable."
+        )
diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py
@@ -95,7 +95,7 @@ def test_partitioner_with_non_existing_column_partition_by(self) -> None:
         dataset = _create_dataset(10, 2)
         partitioner = NaturalIdPartitioner(partition_by="not-existing")
         partitioner.dataset = dataset
-        with self.assertRaises(ValueError):
+        with self.assertRaises(KeyError):
             partitioner.load_partition(0)
 
     @parameterized.expand(  # type: ignore