Merge branch 'main' into tabdar/update-quickstart-pytorch
jafermarq authored Apr 2, 2024
2 parents e4d65e0 + f95d641 commit fb65b67
Showing 191 changed files with 21,424 additions and 14,766 deletions.
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
@@ -9,6 +9,9 @@ README.md @jafermarq @tanertopal @danieljanes
# Flower Baselines
/baselines @jafermarq @tanertopal @danieljanes

# Flower Datasets
/datasets @jafermarq @tanertopal @danieljanes

# Flower Examples
/examples @jafermarq @tanertopal @danieljanes

2 changes: 1 addition & 1 deletion .github/CODE_OF_CONDUCT.md
@@ -55,7 +55,7 @@ further defined and clarified by project maintainers.
## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at [email protected]. All
reported by contacting the project team at [email protected]. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
1 change: 1 addition & 0 deletions .github/workflows/e2e.yml
@@ -45,6 +45,7 @@ jobs:
[ -z "${{ github.head_ref }}" ] && dir="${{ github.ref_name }}" || dir="pr/${{ github.head_ref }}"
echo "DIR=$dir" >> "$GITHUB_OUTPUT"
aws s3 cp --content-disposition "attachment" --cache-control "no-cache" ./ s3://${{ env.ARTIFACT_BUCKET }}/py/$dir/$sha_short --recursive
aws s3 cp --content-disposition "attachment" --cache-control "no-cache" ./ s3://${{ env.ARTIFACT_BUCKET }}/py/$dir/latest --recursive
outputs:
whl_path: ${{ steps.upload.outputs.WHL_PATH }}
short_sha: ${{ steps.upload.outputs.SHORT_SHA }}
18 changes: 18 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,18 @@
repos:
- repo: local
hooks:
- id: format-code
name: Format Code
entry: ./dev/format.sh
language: script
# Ensures the script runs from the repository root:
pass_filenames: false
stages: [commit]

- id: run-tests
name: Run Tests
entry: ./dev/test.sh
language: script
# Ensures the script runs from the repository root:
pass_filenames: false
stages: [commit]
1 change: 1 addition & 0 deletions baselines/flwr_baselines/pyproject.toml
@@ -51,6 +51,7 @@ wget = "^3.2"
virtualenv = "^20.24.6"
pandas = "^1.5.3"
pyhamcrest = "^2.0.4"
pillow = "==10.2.0"

[tool.poetry.dev-dependencies]
isort = "==5.13.2"
4 changes: 2 additions & 2 deletions datasets/README.md
@@ -59,7 +59,7 @@ If you plan to change the type of the dataset to run the code with your ML frame

# Usage

Flower Datasets exposes the `FederatedDataset` abstraction to represent the dataset needed for federated learning/evaluation/analytics. It has two powerful methods that let you handle the dataset preprocessing: `load_partition(node_id, split)` and `load_full(split)`.
Flower Datasets exposes the `FederatedDataset` abstraction to represent the dataset needed for federated learning/evaluation/analytics. It has two powerful methods that let you handle the dataset preprocessing: `load_partition(partition_id, split)` and `load_split(split)`.

Here's a basic quickstart example of how to partition the MNIST dataset:

@@ -71,7 +71,7 @@ mnist_fds = FederatedDataset("mnist", partitioners={"train": 100}
mnist_partition_0 = mnist_fds.load_partition(0, "train")
centralized_data = mnist_fds.load_full("test")
centralized_data = mnist_fds.load_split("test")
```

For more details, please refer to the specific how-to guides or tutorial. They showcase customization and more advanced features.
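As a supplementary illustration (not part of this diff), a partition returned by the renamed API is a plain Hugging Face `datasets.Dataset` and can be inspected directly; a minimal sketch, assuming the MNIST columns are named "image" and "label":

```python
from flwr_datasets import FederatedDataset

# Quick check of a single partition; mirrors the renamed methods above.
mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": 100})
partition = mnist_fds.load_partition(0, "train")

print(len(partition))          # number of samples in partition 0
print(partition.column_names)  # assumed to be ["image", "label"] for MNIST
print(partition[0]["label"])   # label of the first example
```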
2 changes: 1 addition & 1 deletion datasets/doc/source/conf.py
@@ -38,7 +38,7 @@
author = "The Flower Authors"

# The full version, including alpha/beta/rc tags
release = "0.0.2"
release = "0.1.0"


# -- General configuration ---------------------------------------------------
2 changes: 1 addition & 1 deletion datasets/doc/source/how-to-use-with-numpy.rst
@@ -9,7 +9,7 @@ Create a ``FederatedDataset``::

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")
centralized_dataset = fds.load_split("test")

Inspect the names of the features::

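A possible continuation of this how-to (not shown in the diff) converts a partition to NumPy via the standard Hugging Face formatting API; a minimal sketch, assuming the CIFAR-10 feature names "img" and "label":

```python
from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")

# Switch the partition to NumPy format; column access then yields ndarrays.
partition_np = partition.with_format("numpy")
X_train, y_train = partition_np["img"], partition_np["label"]
print(X_train.shape, y_train.shape)  # e.g. (5000, 32, 32, 3) (5000,)
```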
2 changes: 1 addition & 1 deletion datasets/doc/source/how-to-use-with-pytorch.rst
@@ -8,7 +8,7 @@ Standard setup - download the dataset, choose the partitioning::

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")
centralized_dataset = fds.load_split("test")

Determine the names of the features (you can alternatively do that directly on the Hugging Face website). The name can
vary e.g. "img" or "image", "label" or "labels"::
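A hedged sketch of the step this how-to builds toward (not part of the diff): applying torchvision transforms on the fly and wrapping the partition in a DataLoader, assuming the CIFAR-10 image column is named "img":

```python
from flwr_datasets import FederatedDataset
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Normalize, ToTensor

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")

transforms = Compose([ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

def apply_transforms(batch):
    # Convert each PIL image in the batch to a normalized tensor.
    batch["img"] = [transforms(img) for img in batch["img"]]
    return batch

# `with_transform` applies the function lazily, at access time.
partition = partition.with_transform(apply_transforms)
dataloader = DataLoader(partition, batch_size=32, shuffle=True)
```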
2 changes: 1 addition & 1 deletion datasets/doc/source/how-to-use-with-tensorflow.rst
@@ -11,7 +11,7 @@ Create a ``FederatedDataset``::

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")
centralized_dataset = fds.load_split("test")

Inspect the names of the features::

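Similarly, a minimal sketch (not in the diff) of handing a partition to TensorFlow via the Hugging Face `to_tf_dataset` helper, again assuming the CIFAR-10 feature names "img" and "label":

```python
from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")

# Build a tf.data.Dataset yielding (features, labels) batches.
tf_dataset = partition.to_tf_dataset(
    columns="img", label_cols="label", batch_size=64, shuffle=True
)
```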
2 changes: 1 addition & 1 deletion datasets/doc/source/tutorial-quickstart.rst
@@ -38,7 +38,7 @@ To iid partition your dataset, choose the split you want to partition and the nu

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")
centralized_dataset = fds.load_split("test")

Now you're ready to go. You have ten partitions created from the train split of the CIFAR10 dataset and the test split
for the centralized evaluation. We will convert the type of the dataset from Hugging Face's `Dataset` type to the one
154 changes: 32 additions & 122 deletions datasets/flwr_datasets/federated_dataset.py
@@ -15,7 +15,7 @@
"""FederatedDataset."""


from typing import Dict, List, Optional, Tuple, Union, cast
from typing import Dict, Optional, Tuple, Union

import datasets
from datasets import Dataset, DatasetDict
@@ -25,7 +25,6 @@
_check_if_dataset_tested,
_instantiate_partitioners,
_instantiate_resplitter_if_needed,
divide_dataset,
)


@@ -54,19 +53,6 @@ class FederatedDataset:
(representing the number of IID partitions that this split should be partitioned
into). One or multiple `Partitioner` objects can be specified in that manner,
but at most, one per split.
partition_division : Optional[Union[List[float], Tuple[float, ...],
Dict[str, float], Dict[str, Optional[Union[List[float], Tuple[float, ...],
Dict[str, float]]]]]]
Fractions specifying the division of the partition associated with certain split
(and partitioner) that enable returning already divided partition from the
`load_partition` method. You can think of this as on-edge division of the data
into multiple divisions (e.g. into train and validation). You can also name the
divisions by using the Dict or create specify it as a List/Tuple. If you
specified a single partitioner you can provide the simplified form e.g.
[0.8, 0.2] or {"partition_train": 0.8, "partition_test": 0.2} but when multiple
partitioners are specified you need to indicate the result of which partitioner
are further divided e.g. {"train": [0.8, 0.2]} would result in dividing only the
partitions that are created from the "train" split.
shuffle : bool
Whether to randomize the order of samples. Applied prior to resplitting,
separately to each of the present splits in the dataset. It uses the `seed`
Expand All @@ -83,15 +69,7 @@ class FederatedDataset:
>>> # Load partition for client with ID 10.
>>> partition = mnist_fds.load_partition(10, "train")
>>> # Use test split for centralized evaluation.
>>> centralized = mnist_fds.load_full("test")
Automatically divide the data returned from `load_partition`
>>> mnist_fds = FederatedDataset(
>>> dataset="mnist",
>>> partitioners={"train": 100},
>>> partition_division=[0.8, 0.2],
>>> )
>>> partition_train, partition_test = mnist_fds.load_partition(10, "train")
>>> centralized = mnist_fds.load_split("test")
"""

# pylint: disable=too-many-instance-attributes
@@ -102,17 +80,6 @@ def __init__(
subset: Optional[str] = None,
resplitter: Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]] = None,
partitioners: Dict[str, Union[Partitioner, int]],
partition_division: Optional[
Union[
List[float],
Tuple[float, ...],
Dict[str, float],
Dict[
str,
Optional[Union[List[float], Tuple[float, ...], Dict[str, float]]],
],
]
] = None,
shuffle: bool = True,
seed: Optional[int] = 42,
) -> None:
@@ -125,30 +92,27 @@ def __init__(
self._partitioners: Dict[str, Partitioner] = _instantiate_partitioners(
partitioners
)
self._partition_division = self._initialize_partition_division(
partition_division
)
self._shuffle = shuffle
self._seed = seed
# _dataset is prepared lazily on the first call to `load_partition`
# or `load_full`. See _prepare_datasets for more details
# or `load_split`. See _prepare_datasets for more details
self._dataset: Optional[DatasetDict] = None
# Indicate if the dataset is prepared for `load_partition` or `load_full`
# Indicate if the dataset is prepared for `load_partition` or `load_split`
self._dataset_prepared: bool = False

def load_partition(
self,
node_id: int,
partition_id: int,
split: Optional[str] = None,
) -> Union[Dataset, List[Dataset], DatasetDict]:
) -> Dataset:
"""Load the partition specified by the idx in the selected split.
The dataset is downloaded only when the first call to `load_partition` or
`load_full` is made.
`load_split` is made.
Parameters
----------
node_id : int
partition_id : int
Partition index for the selected split, idx in {0, ..., num_partitions - 1}.
split : Optional[str]
Name of the (partitioned) split (e.g. "train", "test"). You can skip this
@@ -160,13 +124,8 @@ def load_partition(
Returns
-------
partition : Union[Dataset, List[Dataset], DatasetDict]
Undivided or divided partition from the dataset split.
If `partition_division` is not specified then `Dataset` is returned.
If `partition_division` is specified as `List` or `Tuple` then
`List[Dataset]` is returned.
If `partition_division` is specified as `Dict` then `DatasetDict` is
returned.
partition : Dataset
Single partition from the dataset split.
"""
if not self._dataset_prepared:
self._prepare_dataset()
@@ -179,22 +138,13 @@
self._check_if_split_possible_to_federate(split)
partitioner: Partitioner = self._partitioners[split]
self._assign_dataset_to_partitioner(split)
partition = partitioner.load_partition(node_id)
if self._partition_division is None:
return partition
partition_division = self._partition_division.get(split)
if partition_division is None:
return partition
divided_partition: Union[List[Dataset], DatasetDict] = divide_dataset(
partition, partition_division
)
return divided_partition
return partitioner.load_partition(partition_id)

def load_full(self, split: str) -> Dataset:
def load_split(self, split: str) -> Dataset:
"""Load the full split of the dataset.
The dataset is downloaded only when the first call to `load_partition` or
`load_full` is made.
`load_split` is made.
Parameters
----------
@@ -213,6 +163,25 @@ def load_full(self, split: str) -> Dataset:
self._check_if_split_present(split)
return self._dataset[split]

@property
def partitioners(self) -> Dict[str, Partitioner]:
"""Dictionary mapping each split to its associated partitioner.
The returned partitioners have the splits of the dataset assigned to them.
"""
# This function triggers the dataset download (lazy download) and checks
# the partitioner specification correctness (which can also happen lazily only
# after the dataset download).
if not self._dataset_prepared:
self._prepare_dataset()
if self._dataset is None:
raise ValueError("Dataset is not loaded yet.")
partitioners_keys = list(self._partitioners.keys())
for split in partitioners_keys:
self._check_if_split_present(split)
self._assign_dataset_to_partitioner(split)
return self._partitioners

def _check_if_split_present(self, split: str) -> None:
"""Check if the split (for partitioning or full return) is in the dataset."""
if self._dataset is None:
@@ -282,62 +251,3 @@ def _check_if_no_split_keyword_possible(self) -> None:
"Please set the `split` argument. You can only omit the split keyword "
"if there is exactly one partitioner specified."
)

def _initialize_partition_division(
self,
partition_division: Optional[
Union[
List[float],
Tuple[float, ...],
Dict[str, float],
Dict[
str,
Optional[Union[List[float], Tuple[float, ...], Dict[str, float]]],
],
]
],
) -> Optional[
Dict[
str,
Optional[Union[List[float], Tuple[float, ...], Dict[str, float]]],
]
]:
"""Create the partition division in the full format.
Reduced format (possible if only one partitioner exist):
Union[List[float], Tuple[float, ...], Dict[str, float]
Full format: Dict[str, Reduced format]
Full format represents the split to division mapping.
"""
# Check for simple dict, list, or tuple types directly
if isinstance(partition_division, (list, tuple)) or (
isinstance(partition_division, dict)
and all(isinstance(value, float) for value in partition_division.values())
):
if len(self._partitioners) > 1:
raise ValueError(
f"The specified partition_division {partition_division} does not "
f"provide mapping to split but more than one partitioners is "
f"specified. Please adjust the partition_division specification to "
f"have the split names as the keys."
)
return cast(
Dict[
str,
Optional[Union[List[float], Tuple[float, ...], Dict[str, float]]],
],
{list(self._partitioners.keys())[0]: partition_division},
)
if isinstance(partition_division, dict):
return cast(
Dict[
str,
Optional[Union[List[float], Tuple[float, ...], Dict[str, float]]],
],
partition_division,
)
if partition_division is None:
return None
raise TypeError("Unsupported type for partition_division")
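Taken together, the API changes in this file can be summarized with a small before/after sketch (illustrative only, not part of the diff); the `train_test_split` call is one assumed way to reproduce what the removed `partition_division` argument used to do:

```python
from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="mnist", partitioners={"train": 100})

# Renamed parameter: `partition_id` replaces `node_id`.
partition = fds.load_partition(partition_id=10, split="train")

# Renamed method: `load_split` replaces `load_full` for centralized splits.
centralized = fds.load_split("test")

# Removed argument: `partition_division`. A local train/validation division
# can instead be done on the returned Hugging Face Dataset (assumed approach).
division = partition.train_test_split(test_size=0.2, seed=42)
partition_train, partition_valid = division["train"], division["test"]

# New property: `partitioners` exposes the configured partitioners, with the
# corresponding dataset splits already assigned to them.
train_partitioner = fds.partitioners["train"]
```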