Merge branch 'main' into fds-fix-pathological-partitioner

jafermarq authored Oct 1, 2024
2 parents 98de354 + 1a4da37 commit 00f19da
Showing 161 changed files with 27,054 additions and 15,943 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
@@ -16,6 +16,14 @@ profile = black
indent_style = space
indent_size = 2

[*.md]
indent_style = space
indent_size = 2

[*.yml]
indent_style = space
indent_size = 2

[*.toml]
indent_style = space
indent_size = 4
1 change: 1 addition & 0 deletions .github/workflows/_docker-build.yml
@@ -101,6 +101,7 @@ jobs:
attempt_delay: 9000 # 9 secs
with: |
pull: true
sbom: true
platforms: ${{ matrix.platform.docker }}
context: "{{defaultContext}}:${{ inputs.file-dir }}"
outputs: type=image,name=${{ inputs.namespace-repository }},push-by-digest=true,name-canonical=true,push=true
57 changes: 57 additions & 0 deletions .github/workflows/e2e.yml
@@ -51,6 +51,63 @@ jobs:
short_sha: ${{ steps.upload.outputs.SHORT_SHA }}
dir: ${{ steps.upload.outputs.DIR }}

superexec:
runs-on: ubuntu-22.04
timeout-minutes: 10
needs: wheel
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
directory: [e2e-bare-auth]
connection: [secure, insecure]
engine: [deployment-engine, simulation-engine]
authentication: [no-auth, client-auth]
exclude:
- connection: insecure
authentication: client-auth
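        # Net effect of this matrix: 3 Python versions x 2 connections x
        # 2 authentication modes x 2 engines = 24 combinations, minus the
        # 6 excluded insecure/client-auth ones (3 Python x 2 engines) = 18 jobs.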
name: |
SuperExec /
Python ${{ matrix.python-version }} /
${{ matrix.connection }} /
${{ matrix.authentication }} /
${{ matrix.engine }}
defaults:
run:
working-directory: e2e/${{ matrix.directory }}
steps:
- uses: actions/checkout@v4
- name: Bootstrap
uses: ./.github/actions/bootstrap
with:
python-version: ${{ matrix.python-version }}
poetry-skip: 'true'
- name: Install Flower from repo
if: ${{ github.repository != 'adap/flower' || github.event.pull_request.head.repo.fork || github.actor == 'dependabot[bot]' }}
run: |
if [[ "${{ matrix.engine }}" == "simulation-engine" ]]; then
python -m pip install ".[simulation]"
else
python -m pip install .
fi
- name: Download and install Flower wheel from artifact store
if: ${{ github.repository == 'adap/flower' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' }}
run: |
# Define base URL for wheel file
WHEEL_URL="https://${{ env.ARTIFACT_BUCKET }}/py/${{ needs.wheel.outputs.dir }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}"
if [[ "${{ matrix.engine }}" == "simulation-engine" ]]; then
python -m pip install "flwr[simulation] @ ${WHEEL_URL}"
else
python -m pip install "${WHEEL_URL}"
fi
- name: >
Run SuperExec test /
${{ matrix.connection }} /
${{ matrix.authentication }} /
${{ matrix.engine }}
working-directory: e2e/${{ matrix.directory }}
        run: ./../test_superexec.sh "${{ matrix.connection }}" "${{ matrix.authentication }}" "${{ matrix.engine }}"

frameworks:
runs-on: ubuntu-22.04
timeout-minutes: 10
79 changes: 79 additions & 0 deletions .github/workflows/update_translations.yml
@@ -0,0 +1,79 @@
name: Translations

on:
schedule:
- cron: '0 0 * * *' # Runs every day at midnight
  workflow_dispatch: # Allows manually triggering the workflow

jobs:
update-and-pr:
runs-on: ubuntu-22.04
permissions:
contents: write
pull-requests: write
env:
branch-name: auto-update-trans-text
name: Update text
steps:
- uses: actions/checkout@v4

- name: Bootstrap
uses: ./.github/actions/bootstrap
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m poetry install
pip install sphinx==7.3.7
- name: Install pandoc
uses: nikeee/setup-pandoc@v1

- name: Update text and translations for all locales
run: |
cd doc
make update-text
for langDir in locales/*; do
if [ -d "$langDir" ]; then
            lang=$(basename "$langDir")
echo "Updating language $lang"
make update-lang lang=$lang
fi
done
- name: Commit changes
run: |
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add doc/locales
git commit -m "Update text and language files"
continue-on-error: true

- name: Calculate diff # Even without doc changes the update-lang command will generate 228 additions and 60 deletions, so we only want to open a PR when there is more
id: calculate_diff
run: |
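          # numstat columns: $1 = lines added, $2 = lines deleted. HEAD^1 is the
          # parent of the commit created in the "Commit changes" step above, so
          # these totals measure exactly what that commit changed.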
additions=$(git diff --numstat HEAD^1 | awk '{s+=$1} END {print s}')
deletions=$(git diff --numstat HEAD^1 | awk '{s+=$2} END {print s}')
echo "Additions: $additions"
echo "Deletions: $deletions"
echo "additions=$additions" >> $GITHUB_OUTPUT
echo "deletions=$deletions" >> $GITHUB_OUTPUT
- name: Push changes
if: steps.calculate_diff.outputs.additions > 228 && steps.calculate_diff.outputs.deletions > 60
uses: ad-m/github-push-action@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
branch: '${{ env.branch-name }}'

- name: Create Pull Request
if: steps.calculate_diff.outputs.additions > 228 && steps.calculate_diff.outputs.deletions > 60
uses: peter-evans/create-pull-request@v6
with:
token: ${{ secrets.GITHUB_TOKEN }}
branch: '${{ env.branch-name }}'
delete-branch: true
title: 'docs(framework:skip) Update source texts for translations (automated)'
body: 'This PR is auto-generated to update text and language files.'
draft: false
1 change: 1 addition & 0 deletions README.md
@@ -18,6 +18,7 @@
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/adap/flower/blob/main/CONTRIBUTING.md)
![Build](https://github.com/adap/flower/actions/workflows/framework.yml/badge.svg)
[![Downloads](https://static.pepy.tech/badge/flwr)](https://pepy.tech/project/flwr)
[![Docker Hub](https://img.shields.io/badge/Docker%20Hub-flwr-blue)](https://hub.docker.com/u/flwr)
[![Slack](https://img.shields.io/badge/Chat-Slack-red)](https://flower.ai/join-slack)

Flower (`flwr`) is a framework for building federated learning systems. The
6 changes: 3 additions & 3 deletions benchmarks/flowertune-llm/README.md
@@ -13,13 +13,13 @@ As the first step, please register for a Flower account on [flower.ai/login](htt
Then, create a new Python environment and install Flower.

> [!TIP]
> We recommend using `pyenv` and the `virtualenv` plugin to create your environment. Other manager such as Conda would likely work too. Check the [documentation](https://flower.ai/docs/framework/how-to-install-flower.html) for alternative ways of installing Flower.
> We recommend using `pyenv` with the `virtualenv` plugin to create your environment with Python >= 3.10.0. Other managers, such as Conda, will likely work as well. Check the [documentation](https://flower.ai/docs/framework/how-to-install-flower.html) for alternative ways to install Flower.
```shell
pip install flwr
```

On the new environment, create a new Flower project using the `FlowerTune` template. You will be prompted for a name to give to your project, your username, and for your choice of LLM challenge:
In the new environment, create a new Flower project using the `FlowerTune` template. You will be prompted for a name to give to your app/project, your username, and for your choice of LLM challenge:
```shell
flwr new --framework=FlowerTune
```
@@ -64,5 +64,5 @@ following the `README.md` in [`evaluation`](https://github.com/adap/flower/tree/


> [!NOTE]
> If you have any questions about running FlowerTune LLM challenges or evaluation, please feel free to make posts at [Flower Discuss](https://discuss.flower.ai) forum,
> If you have any questions about running FlowerTune LLM challenges or evaluation, feel free to post in our dedicated [FlowerTune Category](https://discuss.flower.ai/c/flowertune-llm-leaderboard/) on the [Flower Discuss](https://discuss.flower.ai) forum,
> or join our [Slack channel](https://flower.ai/join-slack/) to ask questions in the `#flowertune-llm-leaderboard` channel.
3 changes: 3 additions & 0 deletions benchmarks/flowertune-llm/evaluation/README.md
@@ -39,6 +39,9 @@ The default template generated by `flwr new` (see the [Project Creation Instruct
|:----------:|:-----:|:---------:|:--------------:|:---------------:|:-----:|
| Pass@1 (%) | 31.60 | 23.78 | 28.57 | 25.47 | 27.36 |

> [!NOTE]
> In the LLM Leaderboard, submissions are ranked by the **average** score across the evaluation datasets of each challenge.
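
That average is plain arithmetic over the per-dataset scores. A minimal sketch using the Pass@1 row from the table above (the dataset names are placeholders, since the real column headers fall outside this hunk):

```python
# Ranking metric sketch: the mean of per-dataset Pass@1 scores for one submission.
# Score values are the example row above; dataset names are illustrative only.
pass_at_1 = {"dataset_a": 31.60, "dataset_b": 23.78, "dataset_c": 28.57, "dataset_d": 25.47}

average = sum(pass_at_1.values()) / len(pass_at_1)
print(f"Average Pass@1: {average}")  # ~27.355, reported as 27.36 in the Avg column
```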

## Make submission on FlowerTune LLM Leaderboard

16 changes: 15 additions & 1 deletion datasets/doc/source/index.rst
@@ -3,6 +3,8 @@ Flower Datasets

Flower Datasets (``flwr-datasets``) is a library that enables the quick and easy creation of datasets for federated learning/analytics/evaluation. It enables heterogeneity (non-iidness) simulation and division of datasets with the preexisting notion of IDs. The library was created by the ``Flower Labs`` team that also created `Flower <https://flower.ai>`_ : A Friendly Federated Learning Framework.

Try out an interactive demo to generate code and visualize heterogeneous divisions at the :ref:`bottom of this page<demo>`.

Flower Datasets Framework
-------------------------

@@ -133,7 +135,6 @@ What makes Flower Datasets stand out from other libraries?

* New custom partitioning schemes (``Partitioner`` subclasses) integrated with the whole ecosystem.


Join the Flower Community
-------------------------

@@ -144,3 +145,16 @@ The Flower Community is growing quickly - we're a friendly group of researchers,
:shadow:

Join us on Slack

.. _demo:

Demo
----

.. raw:: html

<script
type="module"
src="https://gradio.s3-us-west-2.amazonaws.com/4.44.0/gradio.js"
></script>

<gradio-app src="https://flwrlabs-federated-learning-datasets-by-flwr-datasets.hf.space"></gradio-app>
2 changes: 2 additions & 0 deletions datasets/flwr_datasets/partitioner/__init__.py
@@ -27,6 +27,7 @@
from .partitioner import Partitioner
from .pathological_partitioner import PathologicalPartitioner
from .shard_partitioner import ShardPartitioner
from .size_partitioner import SizePartitioner
from .square_partitioner import SquarePartitioner

__all__ = [
@@ -42,5 +43,6 @@
"Partitioner",
"PathologicalPartitioner",
"ShardPartitioner",
"SizePartitioner",
"SquarePartitioner",
]
128 changes: 128 additions & 0 deletions datasets/flwr_datasets/partitioner/size_partitioner.py
@@ -0,0 +1,128 @@
# Copyright 2024 Flower Labs GmbH. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SizePartitioner class."""


import warnings
from collections.abc import Sequence

import datasets
from flwr_datasets.partitioner.partitioner import Partitioner


class SizePartitioner(Partitioner):
"""Partitioner that creates each partition with the size specified by a user.
Parameters
----------
partition_sizes : Sequence[int]
The size of each partition. partition_id 0 will have partition_sizes[0]
samples, partition_id 1 will have partition_sizes[1] samples, etc.
Examples
--------
>>> from flwr_datasets import FederatedDataset
>>> from flwr_datasets.partitioner import SizePartitioner
>>>
>>> partition_sizes = [15_000, 5_000, 30_000]
>>> partitioner = SizePartitioner(partition_sizes)
>>> fds = FederatedDataset(dataset="cifar10", partitioners={"train": partitioner})
"""

def __init__(self, partition_sizes: Sequence[int]) -> None:
super().__init__()
self._pre_ds_validate_partition_sizes(partition_sizes)
self._partition_sizes = partition_sizes
self._partition_id_to_indices: dict[int, list[int]] = {}
self._partition_id_to_indices_determined = False

def load_partition(self, partition_id: int) -> datasets.Dataset:
"""Load a single partition of the size of partition_sizes[partition_id].
For example if given partition_sizes=[20_000, 10_000, 30_000],
then partition_id=0 will return a partition of size 20_000,
partition_id=1 will return a partition of size 10_000, etc.
Parameters
----------
partition_id : int
The index that corresponds to the requested partition.
Returns
-------
dataset_partition : Dataset
Single dataset partition.
"""
self._determine_partition_id_to_indices_if_needed()
return self.dataset.select(self._partition_id_to_indices[partition_id])

@property
def num_partitions(self) -> int:
"""Total number of partitions."""
self._determine_partition_id_to_indices_if_needed()
return len(self._partition_sizes)

@property
def partition_id_to_indices(self) -> dict[int, list[int]]:
"""Partition id to indices (the result of partitioning)."""
self._determine_partition_id_to_indices_if_needed()
return self._partition_id_to_indices

def _determine_partition_id_to_indices_if_needed(
self,
) -> None:
"""Create an assignment of indices to the partition indices."""
if self._partition_id_to_indices_determined:
return
self._post_ds_validate_partition_sizes()
start = 0
end = 0
for partition_id, partition_size in enumerate(self._partition_sizes):
end += partition_size
indices = list(range(start, end))
self._partition_id_to_indices[partition_id] = indices
start = end
self._partition_id_to_indices_determined = True

def _pre_ds_validate_partition_sizes(self, partition_sizes: Sequence[int]) -> None:
"""Check if the partition sizes are valid (no information about the dataset)."""
if not isinstance(partition_sizes, Sequence):
raise ValueError("Partition sizes must be a sequence.")
if len(partition_sizes) == 0:
raise ValueError("Partition sizes must not be empty.")
if not all(
isinstance(partition_size, int) for partition_size in partition_sizes
):
raise ValueError("All partition sizes must be integers.")
if not all(partition_size > 0 for partition_size in partition_sizes):
raise ValueError("All partition sizes must be greater than zero.")

def _post_ds_validate_partition_sizes(self) -> None:
"""Validate the partition sizes against the dataset size."""
desired_partition_sizes = sum(self._partition_sizes)
dataset_size = len(self.dataset)
if desired_partition_sizes > dataset_size:
raise ValueError(
f"The sum of partition sizes sum({self._partition_sizes})"
f"= {desired_partition_sizes} is greater than the size of"
f" the dataset {dataset_size}."
)
if desired_partition_sizes < dataset_size:
warnings.warn(
f"The sum of partition sizes is {desired_partition_sizes}, which is"
f"smaller than the size of the dataset: {dataset_size}. "
f"Ignore this warning if it is the desired behavior.",
stacklevel=1,
)
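
The index assignment is contiguous: partition 0 receives the first partition_sizes[0] rows, partition 1 the next partition_sizes[1] rows, and so on, with any leftover rows left unassigned (triggering the warning above). A minimal sketch of that behavior, assuming the base `Partitioner` exposes the `dataset` setter used below:

```python
from datasets import Dataset

from flwr_datasets.partitioner import SizePartitioner

# A toy 100-row dataset stands in for a real one such as cifar10.
dataset = Dataset.from_dict({"x": list(range(100))})

partitioner = SizePartitioner(partition_sizes=[50, 30, 10])
partitioner.dataset = dataset  # attach directly instead of via FederatedDataset

print(partitioner.num_partitions)                  # 3
print(len(partitioner.load_partition(0)))          # 50 (rows 0..49)
print(partitioner.partition_id_to_indices[1][:3])  # [50, 51, 52] -- contiguous
# The sizes sum to 90 < 100, so the last 10 rows stay unassigned and the
# warning defined in _post_ds_validate_partition_sizes is emitted once.
```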