Merge branch 'main' into add-assign-reviewer-script

adap · May 31, 2024 · 92765b4 · 92765b4
2 parents 9eb6c34 + 3d17a5e
commit 92765b4
Show file tree

Hide file tree

Showing 33 changed files with 645 additions and 627 deletions.
diff --git a/README.md b/README.md
@@ -152,6 +152,7 @@ Other [examples](https://github.com/adap/flower/tree/main/examples):
 - [Flower through Docker Compose and with Grafana dashboard](https://github.com/adap/flower/tree/main/examples/flower-via-docker-compose)
 - [Flower with KaplanMeierFitter from the lifelines library](https://github.com/adap/flower/tree/main/examples/federated-kaplan-meier-fitter)
 - [Sample Level Privacy with Opacus](https://github.com/adap/flower/tree/main/examples/opacus)
+- [Sample Level Privacy with TensorFlow-Privacy](https://github.com/adap/flower/tree/main/examples/tensorflow-privacy)
 
 ## Community
 

diff --git a/datasets/flwr_datasets/__init__.py b/datasets/flwr_datasets/__init__.py
@@ -15,15 +15,15 @@
 """Flower Datasets main package."""
 
 
-from flwr_datasets import partitioner, resplitter
+from flwr_datasets import partitioner, preprocessor
 from flwr_datasets import utils as utils
 from flwr_datasets.common.version import package_version as _package_version
 from flwr_datasets.federated_dataset import FederatedDataset
 
 __all__ = [
     "FederatedDataset",
     "partitioner",
-    "resplitter",
+    "preprocessor",
     "utils",
 ]
 

diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py
@@ -20,11 +20,11 @@
 import datasets
 from datasets import Dataset, DatasetDict
 from flwr_datasets.partitioner import Partitioner
-from flwr_datasets.resplitter import Resplitter
+from flwr_datasets.preprocessor import Preprocessor
 from flwr_datasets.utils import (
     _check_if_dataset_tested,
+    _instantiate_merger_if_needed,
     _instantiate_partitioners,
-    _instantiate_resplitter_if_needed,
 )
 
 
@@ -45,9 +45,11 @@ class FederatedDataset:
     subset : str
         Secondary information regarding the dataset, most often subset or version
         (that is passed to the name in datasets.load_dataset).
-    resplitter : Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]]
-        `Callable` that transforms `DatasetDict` splits, or configuration dict for
-        `MergeResplitter`.
+    preprocessor : Optional[Union[Preprocessor, Dict[str, Tuple[str, ...]]]]
+        `Callable` that transforms `DatasetDict` by resplitting, removing
+        features, creating new features, or performing any other preprocessing
+        operation; alternatively, a configuration dict for `Merger`. Applied after
+        shuffling. If None, no operation is applied.
     partitioners : Dict[str, Union[Partitioner, int]]
         A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int`
         (representing the number of IID partitions that this split should be partitioned
@@ -79,16 +81,16 @@ def __init__(
         *,
         dataset: str,
         subset: Optional[str] = None,
-        resplitter: Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]] = None,
+        preprocessor: Optional[Union[Preprocessor, Dict[str, Tuple[str, ...]]]] = None,
         partitioners: Dict[str, Union[Partitioner, int]],
         shuffle: bool = True,
         seed: Optional[int] = 42,
     ) -> None:
         _check_if_dataset_tested(dataset)
         self._dataset_name: str = dataset
         self._subset: Optional[str] = subset
-        self._resplitter: Optional[Resplitter] = _instantiate_resplitter_if_needed(
-            resplitter
+        self._preprocessor: Optional[Preprocessor] = _instantiate_merger_if_needed(
+            preprocessor
         )
         self._partitioners: Dict[str, Partitioner] = _instantiate_partitioners(
             partitioners
@@ -242,8 +244,8 @@ def _prepare_dataset(self) -> None:
             # Note it shuffles all the splits. The self._dataset is DatasetDict
             # so e.g. {"train": train_data, "test": test_data}. All splits get shuffled.
             self._dataset = self._dataset.shuffle(seed=self._seed)
-        if self._resplitter:
-            self._dataset = self._resplitter(self._dataset)
+        if self._preprocessor:
+            self._dataset = self._preprocessor(self._dataset)
         self._dataset_prepared = True
 
     def _check_if_no_split_keyword_possible(self) -> None:

diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py
@@ -170,20 +170,20 @@ def test_resplit_dataset_into_one(self) -> None:
         fds = FederatedDataset(
             dataset=self.dataset_name,
             partitioners={"train": 100},
-            resplitter={"full": ("train", self.test_split)},
+            preprocessor={"full": ("train", self.test_split)},
         )
         full = fds.load_split("full")
         self.assertEqual(dataset_length, len(full))
 
     # pylint: disable=protected-access
     def test_resplit_dataset_to_change_names(self) -> None:
-        """Test resplitter to change the names of the partitions."""
+        """Test preprocessor to change the names of the partitions."""
         if self.test_split is None:
             return
         fds = FederatedDataset(
             dataset=self.dataset_name,
             partitioners={"new_train": 100},
-            resplitter={
+            preprocessor={
                 "new_train": ("train",),
                 "new_" + self.test_split: (self.test_split,),
             },
@@ -195,7 +195,7 @@ def test_resplit_dataset_to_change_names(self) -> None:
         )
 
     def test_resplit_dataset_by_callable(self) -> None:
-        """Test resplitter to change the names of the partitions."""
+        """Test preprocessor to change the names of the partitions."""
         if self.test_split is None:
             return
 
@@ -209,7 +209,7 @@ def resplit(dataset: DatasetDict) -> DatasetDict:
             )
 
         fds = FederatedDataset(
-            dataset=self.dataset_name, partitioners={"train": 100}, resplitter=resplit
+            dataset=self.dataset_name, partitioners={"train": 100}, preprocessor=resplit
         )
         full = fds.load_split("full")
         dataset = datasets.load_dataset(self.dataset_name)
@@ -298,7 +298,7 @@ def resplit(dataset: DatasetDict) -> DatasetDict:
         fds = FederatedDataset(
             dataset="does-not-matter",
             partitioners={"train": 10},
-            resplitter=resplit,
+            preprocessor=resplit,
             shuffle=True,
         )
         train = fds.load_split("train")
@@ -411,7 +411,7 @@ def test_cannot_use_the_old_split_names(self) -> None:
         fds = FederatedDataset(
             dataset="mnist",
             partitioners={"train": 100},
-            resplitter={"full": ("train", "test")},
+            preprocessor={"full": ("train", "test")},
         )
         with self.assertRaises(ValueError):
             fds.load_partition(0, "train")

diff --git a/...sets/flwr_datasets/resplitter/__init__.py → ...ts/flwr_datasets/preprocessor/__init__.py b/...sets/flwr_datasets/resplitter/__init__.py → ...ts/flwr_datasets/preprocessor/__init__.py
@@ -12,15 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Resplitter package."""
+"""Preprocessor package."""
 
 
-from .divide_resplitter import DivideResplitter
-from .merge_resplitter import MergeResplitter
-from .resplitter import Resplitter
+from .divider import Divider
+from .merger import Merger
+from .preprocessor import Preprocessor
 
 __all__ = [
-    "DivideResplitter",
-    "MergeResplitter",
-    "Resplitter",
+    "Merger",
+    "Preprocessor",
+    "Divider",
 ]
diff --git a/..._datasets/resplitter/divide_resplitter.py → ...ets/flwr_datasets/preprocessor/divider.py b/..._datasets/resplitter/divide_resplitter.py → ...ets/flwr_datasets/preprocessor/divider.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""DivideResplitter class for Flower Datasets."""
+"""Divider class for Flower Datasets."""
 
 
 import collections
@@ -25,7 +25,7 @@
 
 # flake8: noqa: E501
 # pylint: disable=line-too-long
-class DivideResplitter:
+class Divider:
     """Divide existing split(s) of the dataset and assign them custom names.
 
     Create new `DatasetDict` with new split names with corresponding percentages of data
@@ -66,14 +66,14 @@ class DivideResplitter:
 
     >>> # Assuming there is a dataset_dict of type `DatasetDict`
     >>> # dataset_dict is {"train": train-data, "test": test-data}
-    >>> resplitter = DivideResplitter(
+    >>> divider = Divider(
     >>>     divide_config={
     >>>         "train": 0.8,
     >>>         "valid": 0.2,
     >>>     },
     >>>     divide_split="train",
     >>> )
-    >>> new_dataset_dict = resplitter(dataset_dict)
+    >>> new_dataset_dict = divider(dataset_dict)
     >>> # new_dataset_dict is
     >>> # {"train": 80% of train, "valid": 20% of train, "test": test-data}
 
@@ -83,7 +83,7 @@ class DivideResplitter:
 
     >>> # Assuming there is a dataset_dict of type `DatasetDict`
     >>> # dataset_dict is {"train": train-data, "test": test-data}
-    >>> resplitter = DivideResplitter(
+    >>> divider = Divider(
     >>>     divide_config={
     >>>         "train": {
     >>>             "train": 0.8,
@@ -92,7 +92,7 @@ class DivideResplitter:
     >>>         "test": {"test-a": 0.4, "test-b": 0.6 }
     >>>     }
     >>> )
-    >>> new_dataset_dict = resplitter(dataset_dict)
+    >>> new_dataset_dict = divider(dataset_dict)
     >>> # new_dataset_dict is
     >>> # {"train": 80% of train, "valid": 20% of train,
     >>> # "test-a": 40% of test, "test-b": 60% of test}