Merge branch 'main' into fds-fix-pathological-partitioner

jafermarq authored Oct 1, 2024
2 parents 98de354 + 1a4da37 commit 00f19da
Showing 161 changed files with 27,054 additions and 15,943 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
@@ -16,6 +16,14 @@ profile = black
indent_style = space
indent_size = 2

[*.md]
indent_style = space
indent_size = 2

[*.yml]
indent_style = space
indent_size = 2

[*.toml]
indent_style = space
indent_size = 4
1 change: 1 addition & 0 deletions .github/workflows/_docker-build.yml
@@ -101,6 +101,7 @@ jobs:
attempt_delay: 9000 # 9 secs
with: |
pull: true
sbom: true
platforms: ${{ matrix.platform.docker }}
context: "{{defaultContext}}:${{ inputs.file-dir }}"
outputs: type=image,name=${{ inputs.namespace-repository }},push-by-digest=true,name-canonical=true,push=true
57 changes: 57 additions & 0 deletions .github/workflows/e2e.yml
@@ -51,6 +51,63 @@ jobs:
short_sha: ${{ steps.upload.outputs.SHORT_SHA }}
dir: ${{ steps.upload.outputs.DIR }}

superexec:
runs-on: ubuntu-22.04
timeout-minutes: 10
needs: wheel
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
directory: [e2e-bare-auth]
connection: [secure, insecure]
engine: [deployment-engine, simulation-engine]
authentication: [no-auth, client-auth]
exclude:
- connection: insecure
authentication: client-auth
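        # Net effect of this matrix: 3 Python versions x 2 connections x
        # 2 authentication modes x 2 engines = 24 combinations, minus the
        # 6 excluded insecure/client-auth ones (3 Python x 2 engines) = 18 jobs.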
name: |
SuperExec /
Python ${{ matrix.python-version }} /
${{ matrix.connection }} /
${{ matrix.authentication }} /
${{ matrix.engine }}
defaults:
run:
working-directory: e2e/${{ matrix.directory }}
steps:
- uses: actions/checkout@v4
- name: Bootstrap
uses: ./.github/actions/bootstrap
with:
python-version: ${{ matrix.python-version }}
poetry-skip: 'true'
- name: Install Flower from repo
if: ${{ github.repository != 'adap/flower' || github.event.pull_request.head.repo.fork || github.actor == 'dependabot[bot]' }}
run: |
if [[ "${{ matrix.engine }}" == "simulation-engine" ]]; then
python -m pip install ".[simulation]"
else
python -m pip install .
fi
- name: Download and install Flower wheel from artifact store
if: ${{ github.repository == 'adap/flower' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' }}
run: |
# Define base URL for wheel file
WHEEL_URL="https://${{ env.ARTIFACT_BUCKET }}/py/${{ needs.wheel.outputs.dir }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}"
if [[ "${{ matrix.engine }}" == "simulation-engine" ]]; then
python -m pip install "flwr[simulation] @ ${WHEEL_URL}"
else
python -m pip install "${WHEEL_URL}"
fi
- name: >
Run SuperExec test /
${{ matrix.connection }} /
${{ matrix.authentication }} /
${{ matrix.engine }}
working-directory: e2e/${{ matrix.directory }}
        run: ./../test_superexec.sh "${{ matrix.connection }}" "${{ matrix.authentication }}" "${{ matrix.engine }}"

frameworks:
runs-on: ubuntu-22.04
timeout-minutes: 10
79 changes: 79 additions & 0 deletions .github/workflows/update_translations.yml
@@ -0,0 +1,79 @@
name: Translations

on:
schedule:
- cron: '0 0 * * *' # Runs every day at midnight
  workflow_dispatch: # Allows manually triggering the workflow

jobs:
update-and-pr:
runs-on: ubuntu-22.04
permissions:
contents: write
pull-requests: write
env:
branch-name: auto-update-trans-text
name: Update text
steps:
- uses: actions/checkout@v4

- name: Bootstrap
uses: ./.github/actions/bootstrap
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m poetry install
pip install sphinx==7.3.7
- name: Install pandoc
uses: nikeee/setup-pandoc@v1

- name: Update text and translations for all locales
run: |
cd doc
make update-text
for langDir in locales/*; do
if [ -d "$langDir" ]; then
            lang=$(basename "$langDir")
echo "Updating language $lang"
make update-lang lang=$lang
fi
done
- name: Commit changes
run: |
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add doc/locales
git commit -m "Update text and language files"
continue-on-error: true

- name: Calculate diff # Even without doc changes the update-lang command will generate 228 additions and 60 deletions, so we only want to open a PR when there is more
id: calculate_diff
run: |
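          # numstat columns: $1 = lines added, $2 = lines deleted. HEAD^1 is the
          # parent of the commit created in the "Commit changes" step above, so
          # these totals measure exactly what that commit changed.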
additions=$(git diff --numstat HEAD^1 | awk '{s+=$1} END {print s}')
deletions=$(git diff --numstat HEAD^1 | awk '{s+=$2} END {print s}')
echo "Additions: $additions"
echo "Deletions: $deletions"
echo "additions=$additions" >> $GITHUB_OUTPUT
echo "deletions=$deletions" >> $GITHUB_OUTPUT
- name: Push changes
if: steps.calculate_diff.outputs.additions > 228 && steps.calculate_diff.outputs.deletions > 60
uses: ad-m/github-push-action@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
branch: '${{ env.branch-name }}'

- name: Create Pull Request
if: steps.calculate_diff.outputs.additions > 228 && steps.calculate_diff.outputs.deletions > 60
uses: peter-evans/create-pull-request@v6
with:
token: ${{ secrets.GITHUB_TOKEN }}
branch: '${{ env.branch-name }}'
delete-branch: true
title: 'docs(framework:skip) Update source texts for translations (automated)'
body: 'This PR is auto-generated to update text and language files.'
draft: false
1 change: 1 addition & 0 deletions README.md
@@ -18,6 +18,7 @@
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/adap/flower/blob/main/CONTRIBUTING.md)
![Build](https://github.com/adap/flower/actions/workflows/framework.yml/badge.svg)
[![Downloads](https://static.pepy.tech/badge/flwr)](https://pepy.tech/project/flwr)
[![Docker Hub](https://img.shields.io/badge/Docker%20Hub-flwr-blue)](https://hub.docker.com/u/flwr)
[![Slack](https://img.shields.io/badge/Chat-Slack-red)](https://flower.ai/join-slack)

Flower (`flwr`) is a framework for building federated learning systems. The
6 changes: 3 additions & 3 deletions benchmarks/flowertune-llm/README.md
@@ -13,13 +13,13 @@ As the first step, please register for a Flower account on [flower.ai/login](htt
Then, create a new Python environment and install Flower.

> [!TIP]
> We recommend using `pyenv` and the `virtualenv` plugin to create your environment. Other manager such as Conda would likely work too. Check the [documentation](https://flower.ai/docs/framework/how-to-install-flower.html) for alternative ways of installing Flower.
> We recommend using `pyenv` with the `virtualenv` plugin to create your environment with Python >= 3.10.0. Other managers, such as Conda, will likely work as well. Check the [documentation](https://flower.ai/docs/framework/how-to-install-flower.html) for alternative ways to install Flower.
```shell
pip install flwr
```

On the new environment, create a new Flower project using the `FlowerTune` template. You will be prompted for a name to give to your project, your username, and for your choice of LLM challenge:
In the new environment, create a new Flower project using the `FlowerTune` template. You will be prompted for a name to give to your app/project, your username, and for your choice of LLM challenge:
```shell
flwr new --framework=FlowerTune
```
@@ -64,5 +64,5 @@ following the `README.md` in [`evaluation`](https://github.com/adap/flower/tree/


> [!NOTE]
> If you have any questions about running FlowerTune LLM challenges or evaluation, please feel free to make posts at [Flower Discuss](https://discuss.flower.ai) forum,
> If you have any questions about running FlowerTune LLM challenges or evaluation, feel free to post in our dedicated [FlowerTune Category](https://discuss.flower.ai/c/flowertune-llm-leaderboard/) on the [Flower Discuss](https://discuss.flower.ai) forum,
> or join our [Slack channel](https://flower.ai/join-slack/) to ask questions in the `#flowertune-llm-leaderboard` channel.
3 changes: 3 additions & 0 deletions benchmarks/flowertune-llm/evaluation/README.md
@@ -39,6 +39,9 @@ The default template generated by `flwr new` (see the [Project Creation Instruct
|:----------:|:-----:|:---------:|:--------------:|:---------------:|:-----:|
| Pass@1 (%) | 31.60 | 23.78 | 28.57 | 25.47 | 27.36 |

> [!NOTE]
> In the LLM Leaderboard, submissions are ranked by the **average** score across the evaluation datasets of each challenge.
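
That average is plain arithmetic over the per-dataset scores. A minimal sketch using the Pass@1 row from the table above (the dataset names are placeholders, since the real column headers fall outside this hunk):

```python
# Ranking metric sketch: the mean of per-dataset Pass@1 scores for one submission.
# Score values are the example row above; dataset names are illustrative only.
pass_at_1 = {"dataset_a": 31.60, "dataset_b": 23.78, "dataset_c": 28.57, "dataset_d": 25.47}

average = sum(pass_at_1.values()) / len(pass_at_1)
print(f"Average Pass@1: {average}")  # ~27.355, reported as 27.36 in the Avg column
```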

## Make submission on FlowerTune LLM Leaderboard

16 changes: 15 additions & 1 deletion datasets/doc/source/index.rst
@@ -3,6 +3,8 @@ Flower Datasets

Flower Datasets (``flwr-datasets``) is a library that enables the quick and easy creation of datasets for federated learning/analytics/evaluation. It enables heterogeneity (non-iidness) simulation and division of datasets with the preexisting notion of IDs. The library was created by the ``Flower Labs`` team that also created `Flower <https://flower.ai>`_ : A Friendly Federated Learning Framework.

Try out an interactive demo to generate code and visualize heterogeneous divisions at the :ref:`bottom of this page<demo>`.

Flower Datasets Framework
-------------------------

@@ -133,7 +135,6 @@ What makes Flower Datasets stand out from other libraries?

* New custom partitioning schemes (``Partitioner`` subclasses) integrated with the whole ecosystem.


Join the Flower Community
-------------------------

@@ -144,3 +145,16 @@ The Flower Community is growing quickly - we're a friendly group of researchers,
:shadow:

Join us on Slack

.. _demo:

Demo
----

.. raw:: html

<script
type="module"
src="https://gradio.s3-us-west-2.amazonaws.com/4.44.0/gradio.js"
></script>

<gradio-app src="https://flwrlabs-federated-learning-datasets-by-flwr-datasets.hf.space"></gradio-app>
2 changes: 2 additions & 0 deletions datasets/flwr_datasets/partitioner/__init__.py
@@ -27,6 +27,7 @@
from .partitioner import Partitioner
from .pathological_partitioner import PathologicalPartitioner
from .shard_partitioner import ShardPartitioner
from .size_partitioner import SizePartitioner
from .square_partitioner import SquarePartitioner

__all__ = [
@@ -42,5 +43,6 @@
"Partitioner",
"PathologicalPartitioner",
"ShardPartitioner",
"SizePartitioner",
"SquarePartitioner",
]
128 changes: 128 additions & 0 deletions datasets/flwr_datasets/partitioner/size_partitioner.py
@@ -0,0 +1,128 @@
# Copyright 2024 Flower Labs GmbH. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SizePartitioner class."""


import warnings
from collections.abc import Sequence

import datasets
from flwr_datasets.partitioner.partitioner import Partitioner


class SizePartitioner(Partitioner):
"""Partitioner that creates each partition with the size specified by a user.
Parameters
----------
partition_sizes : Sequence[int]
The size of each partition. partition_id 0 will have partition_sizes[0]
samples, partition_id 1 will have partition_sizes[1] samples, etc.
Examples
--------
>>> from flwr_datasets import FederatedDataset
>>> from flwr_datasets.partitioner import SizePartitioner
>>>
>>> partition_sizes = [15_000, 5_000, 30_000]
>>> partitioner = SizePartitioner(partition_sizes)
>>> fds = FederatedDataset(dataset="cifar10", partitioners={"train": partitioner})
"""

def __init__(self, partition_sizes: Sequence[int]) -> None:
super().__init__()
self._pre_ds_validate_partition_sizes(partition_sizes)
self._partition_sizes = partition_sizes
self._partition_id_to_indices: dict[int, list[int]] = {}
self._partition_id_to_indices_determined = False

def load_partition(self, partition_id: int) -> datasets.Dataset:
"""Load a single partition of the size of partition_sizes[partition_id].
For example if given partition_sizes=[20_000, 10_000, 30_000],
then partition_id=0 will return a partition of size 20_000,
partition_id=1 will return a partition of size 10_000, etc.
Parameters
----------
partition_id : int
The index that corresponds to the requested partition.
Returns
-------
dataset_partition : Dataset
Single dataset partition.
"""
self._determine_partition_id_to_indices_if_needed()
return self.dataset.select(self._partition_id_to_indices[partition_id])

@property
def num_partitions(self) -> int:
"""Total number of partitions."""
self._determine_partition_id_to_indices_if_needed()
return len(self._partition_sizes)

@property
def partition_id_to_indices(self) -> dict[int, list[int]]:
"""Partition id to indices (the result of partitioning)."""
self._determine_partition_id_to_indices_if_needed()
return self._partition_id_to_indices

def _determine_partition_id_to_indices_if_needed(
self,
) -> None:
"""Create an assignment of indices to the partition indices."""
if self._partition_id_to_indices_determined:
return
self._post_ds_validate_partition_sizes()
start = 0
end = 0
for partition_id, partition_size in enumerate(self._partition_sizes):
end += partition_size
indices = list(range(start, end))
self._partition_id_to_indices[partition_id] = indices
start = end
self._partition_id_to_indices_determined = True

def _pre_ds_validate_partition_sizes(self, partition_sizes: Sequence[int]) -> None:
"""Check if the partition sizes are valid (no information about the dataset)."""
if not isinstance(partition_sizes, Sequence):
raise ValueError("Partition sizes must be a sequence.")
if len(partition_sizes) == 0:
raise ValueError("Partition sizes must not be empty.")
if not all(
isinstance(partition_size, int) for partition_size in partition_sizes
):
raise ValueError("All partition sizes must be integers.")
if not all(partition_size > 0 for partition_size in partition_sizes):
raise ValueError("All partition sizes must be greater than zero.")

def _post_ds_validate_partition_sizes(self) -> None:
"""Validate the partition sizes against the dataset size."""
desired_partition_sizes = sum(self._partition_sizes)
dataset_size = len(self.dataset)
if desired_partition_sizes > dataset_size:
raise ValueError(
f"The sum of partition sizes sum({self._partition_sizes})"
f"= {desired_partition_sizes} is greater than the size of"
f" the dataset {dataset_size}."
)
if desired_partition_sizes < dataset_size:
warnings.warn(
f"The sum of partition sizes is {desired_partition_sizes}, which is"
f"smaller than the size of the dataset: {dataset_size}. "
f"Ignore this warning if it is the desired behavior.",
stacklevel=1,
)
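
The index assignment is contiguous: partition 0 receives the first partition_sizes[0] rows, partition 1 the next partition_sizes[1] rows, and so on, with any leftover rows left unassigned (triggering the warning above). A minimal sketch of that behavior, assuming the base `Partitioner` exposes the `dataset` setter used below:

```python
from datasets import Dataset

from flwr_datasets.partitioner import SizePartitioner

# A toy 100-row dataset stands in for a real one such as cifar10.
dataset = Dataset.from_dict({"x": list(range(100))})

partitioner = SizePartitioner(partition_sizes=[50, 30, 10])
partitioner.dataset = dataset  # attach directly instead of via FederatedDataset

print(partitioner.num_partitions)                  # 3
print(len(partitioner.load_partition(0)))          # 50 (rows 0..49)
print(partitioner.partition_id_to_indices[1][:3])  # [50, 51, 52] -- contiguous
# The sizes sum to 90 < 100, so the last 10 rows stay unassigned and the
# warning defined in _post_ds_validate_partition_sizes is emitted once.
```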