Commit

Merge branch 'main' into fix-aggregate-inplace
KarhouTam authored Sep 21, 2024
2 parents 7053c63 + 247cada commit f5cacbd
Showing 153 changed files with 3,959 additions and 2,402 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
@@ -16,6 +16,14 @@ profile = black
indent_style = space
indent_size = 2

[*.md]
indent_style = space
indent_size = 2

[*.yml]
indent_style = space
indent_size = 2

[*.toml]
indent_style = space
indent_size = 4
2 changes: 1 addition & 1 deletion .github/workflows/datasets-e2e.yml
@@ -45,7 +45,7 @@ jobs:
- name: Bootstrap
uses: ./.github/actions/bootstrap
with:
python-version: 3.8
python-version: 3.9
- name: Install dependencies
run: python -m poetry install
- name: Run tests
2 changes: 1 addition & 1 deletion .github/workflows/datasets.yml
@@ -37,7 +37,7 @@ jobs:
# In case of a mismatch, the job has to download Python to install it.
# Note: Due to a bug in actions/setup-python, we have to put "3.10" in
# quotes as it will otherwise assume "3.1"
python: [3.8, 3.9, '3.10', '3.11']
python: ['3.9', '3.10', '3.11']

name: Python ${{ matrix.python }}

21 changes: 11 additions & 10 deletions benchmarks/flowertune-llm/README.md
@@ -1,4 +1,4 @@
![](_static/flower_llm.png)
[![FlowerTune LLM Leaderboard](_static/flower_llm.png)](https://flower.ai/benchmarks/llm-leaderboard)

# FlowerTune LLM Leaderboard

@@ -27,15 +27,16 @@ flwr new --framework=FlowerTune
The `flwr new` command will generate a directory with the following structure:

```bash
<project-name>
├── README.md # <- Instructions
├── pyproject.toml # <- Environment dependencies and configs
└── <project_name>
├── client_app.py # <- Flower ClientApp build
├── dataset.py # <- Dataset and tokenizer build
├── models.py # <- Model build
├── server_app.py # <- Flower ServerApp build
└── strategy.py # <- Flower strategy build
<project_name>
├── README.md # Instructions
├── pyproject.toml # Environment dependencies and configs
└── <project_name>
├── __init__.py
├── client_app.py # Flower ClientApp build
├── dataset.py # Dataset and tokenizer build
├── models.py # Model build
├── server_app.py # Flower ServerApp build
└── strategy.py # Flower strategy build
```

This can serve as the starting point for you to build up your own federated LLM fine-tuning methods.
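For orientation, a minimal sketch of scaffolding and launching such a project might look like the following (the project name `flowertune-demo` is only an illustration, and the exact run command for your setup is described in the generated `README.md`):

```bash
# Scaffold a FlowerTune project (prompts for a project name and username)
flwr new --framework=FlowerTune

# Install the generated project and start a local federated fine-tuning run
cd flowertune-demo
pip install -e .
flwr run .
```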
2 changes: 1 addition & 1 deletion benchmarks/flowertune-llm/evaluation/README.md
@@ -5,7 +5,7 @@ If you are participating [LLM Leaderboard](https://flower.ai/benchmarks/llm-lead

## How to run

Navigate to the directory corresponding to your selected challenge (`general NLP`, `finance`, `medical`, or `code`) and follow the instructions there to execute the evaluation.
Navigate to the directory corresponding to your selected challenge ([`general NLP`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/general-nlp), [`finance`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/finance), [`medical`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/medical), or [`code`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/code)) and follow the instructions there to execute the evaluation.

> [!NOTE]
> If you wish to participate in the LLM Leaderboard, you must not modify the evaluation code and should use the exact command provided in the respective directory to run the evaluation.
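
As a sketch of the first step (assuming the repository was cloned as `flower/`; the exact evaluation command to run afterwards is given verbatim in each challenge directory's README):

```bash
# Example for the general NLP challenge
cd flower/benchmarks/flowertune-llm/evaluation/general-nlp
```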
10 changes: 5 additions & 5 deletions datasets/flwr_datasets/common/telemetry.py
@@ -25,7 +25,7 @@
from concurrent.futures import Future, ThreadPoolExecutor
from enum import Enum, auto
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
from typing import Any, Optional, Union, cast

from flwr_datasets.common.version import package_name, package_version

@@ -114,7 +114,7 @@ class EventType(str, Enum):
# The type signature is not compatible with mypy, pylint and flake8
# so each of those needs to be disabled for this line.
# pylint: disable-next=no-self-argument,arguments-differ,line-too-long
def _generate_next_value_(name: str, start: int, count: int, last_values: List[Any]) -> Any: # type: ignore # noqa: E501
def _generate_next_value_(name: str, start: int, count: int, last_values: list[Any]) -> Any: # type: ignore # noqa: E501
return name

PING = auto()
@@ -127,7 +127,7 @@ def _generate_next_value_(name: str, start: int, count: int, last_values: List[A

# Use the ThreadPoolExecutor with max_workers=1 to have a queue
# and also ensure that telemetry calls are not blocking.
state: Dict[str, Union[Optional[str], Optional[ThreadPoolExecutor]]] = {
state: dict[str, Union[Optional[str], Optional[ThreadPoolExecutor]]] = {
# Will be assigned ThreadPoolExecutor(max_workers=1)
# in event() the first time it's required
"executor": None,
@@ -143,7 +143,7 @@ def _generate_next_value_(name: str, start: int, count: int, last_values: List[A
# pylint: disable-next=unsubscriptable-object
def event(
event_type: EventType,
event_details: Optional[Dict[str, Any]] = None,
event_details: Optional[dict[str, Any]] = None,
) -> Future: # type: ignore
"""Submit create_event to ThreadPoolExecutor to avoid blocking."""
if state["executor"] is None:
@@ -155,7 +155,7 @@ def event(
return result


def create_event(event_type: EventType, event_details: Optional[Dict[str, Any]]) -> str:
def create_event(event_type: EventType, event_details: Optional[dict[str, Any]]) -> str:
"""Create telemetry event."""
if state["source"] is None:
state["source"] = _get_source_id()
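The comments above describe the non-blocking pattern this module relies on: a single-worker `ThreadPoolExecutor` acts as a queue, so callers get a `Future` back immediately instead of waiting for the telemetry request. A minimal standalone sketch of that idea (illustrative only, not the module's actual code):

```python
from concurrent.futures import Future, ThreadPoolExecutor

# A single worker means submitted jobs run one at a time, forming an implicit queue.
_executor = ThreadPoolExecutor(max_workers=1)


def _send(payload: str) -> str:
    # Stand-in for the slow network request the real telemetry code performs.
    return f"sent: {payload}"


def event(payload: str) -> Future:
    # Returns immediately with a Future; callers are never blocked by telemetry.
    return _executor.submit(_send, payload)


if __name__ == "__main__":
    future = event("ping")
    print(future.result())  # blocks only because we explicitly ask for the result
```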
4 changes: 2 additions & 2 deletions datasets/flwr_datasets/common/typing.py
@@ -15,12 +15,12 @@
"""Flower Datasets type definitions."""


from typing import Any, List
from typing import Any

import numpy as np
import numpy.typing as npt

NDArray = npt.NDArray[Any]
NDArrayInt = npt.NDArray[np.int_]
NDArrayFloat = npt.NDArray[np.float_]
NDArrays = List[NDArray]
NDArrays = list[NDArray]
5 changes: 2 additions & 3 deletions datasets/flwr_datasets/common/version.py
@@ -19,15 +19,14 @@


import importlib.metadata as importlib_metadata
from typing import Tuple


def _check_package(name: str) -> Tuple[str, str]:
def _check_package(name: str) -> tuple[str, str]:
version: str = importlib_metadata.version(name)
return name, version


def _version() -> Tuple[str, str]:
def _version() -> tuple[str, str]:
"""Read and return Flower Dataset package name and version.
Returns
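The helper above wraps `importlib.metadata`; the underlying call can be sketched as follows (the distribution name `flwr-datasets` is assumed here for illustration):

```python
import importlib.metadata as importlib_metadata

# Query the installed version of a distribution; raises
# importlib.metadata.PackageNotFoundError if it is not installed.
name = "flwr-datasets"  # assumed distribution name, for illustration only
print(name, importlib_metadata.version(name))
```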
8 changes: 4 additions & 4 deletions datasets/flwr_datasets/federated_dataset_test.py
@@ -17,7 +17,7 @@


import unittest
from typing import Dict, Union
from typing import Union
from unittest.mock import Mock, patch

import numpy as np
@@ -385,7 +385,7 @@ def test_dict_of_partitioners_passes_partitioners(self) -> None:
"""Test if partitioners are passed directly (no recreation)."""
num_train_partitions = 100
num_test_partitions = 100
partitioners: Dict[str, Union[Partitioner, int]] = {
partitioners: dict[str, Union[Partitioner, int]] = {
"train": IidPartitioner(num_partitions=num_train_partitions),
"test": IidPartitioner(num_partitions=num_test_partitions),
}
@@ -419,7 +419,7 @@ def test_mixed_type_partitioners_passes_instantiated_partitioners(self) -> None:
"""Test if an instantiated partitioner is passed directly."""
num_train_partitions = 100
num_test_partitions = 100
partitioners: Dict[str, Union[Partitioner, int]] = {
partitioners: dict[str, Union[Partitioner, int]] = {
"train": IidPartitioner(num_partitions=num_train_partitions),
"test": num_test_partitions,
}
@@ -433,7 +433,7 @@ def test_mixed_type_partitioners_creates_from_int(self) -> None:
"""Test if an IidPartitioner partitioner is created."""
num_train_partitions = 100
num_test_partitions = 100
partitioners: Dict[str, Union[Partitioner, int]] = {
partitioners: dict[str, Union[Partitioner, int]] = {
"train": IidPartitioner(num_partitions=num_train_partitions),
"test": num_test_partitions,
}
6 changes: 3 additions & 3 deletions datasets/flwr_datasets/metrics/utils.py
@@ -16,7 +16,7 @@


import warnings
from typing import List, Optional, Union
from typing import Optional, Union

import pandas as pd

@@ -206,7 +206,7 @@ def compute_frequencies(


def _compute_counts(
labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
labels: Union[list[int], list[str]], unique_labels: Union[list[int], list[str]]
) -> pd.Series:
"""Compute the count of labels when taking into account all possible labels.
@@ -237,7 +237,7 @@ def _compute_counts(


def _compute_frequencies(
labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
labels: Union[list[int], list[str]], unique_labels: Union[list[int], list[str]]
) -> pd.Series:
"""Compute the distribution of labels when taking into account all possible labels.
38 changes: 19 additions & 19 deletions datasets/flwr_datasets/mock_utils_test.py
@@ -19,7 +19,7 @@
import random
import string
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Optional, Union

import numpy as np
from PIL import Image
@@ -30,7 +30,7 @@

def _generate_artificial_strings(
num_rows: int, num_unique: int, string_length: int, seed: int = 42
) -> List[str]:
) -> list[str]:
"""Create list of strings for categories or labels mocking.
Note to keep the seed the same if you reuse this function in creation of the
@@ -53,7 +53,7 @@ def _generate_artificial_strings(
List of generated strings.
"""
random.seed(seed)
unique_strings: Set[str] = set()
unique_strings: set[str] = set()
while len(unique_strings) < num_unique:
random_str = "".join(
random.choices(string.ascii_letters + string.digits, k=string_length)
@@ -68,7 +68,7 @@ def _generate_artificial_strings(
return artificial_column


def _generate_artificial_categories(num_rows: int, choices: List[Any]) -> List[str]:
def _generate_artificial_categories(num_rows: int, choices: list[Any]) -> list[str]:
"""Create list of strings from given `choices` list."""
artificial_column = choices.copy()
remaining_to_allocate = num_rows - len(choices)
@@ -82,7 +82,7 @@ def _generate_random_word(length: int) -> str:
return "".join(random.choices(string.ascii_letters, k=length))


def _generate_random_text_column(num_rows: int, length: int) -> List[str]:
def _generate_random_text_column(num_rows: int, length: int) -> list[str]:
"""Generate a list of random text of specified length."""
text_col = []
for _ in range(num_rows):
@@ -98,7 +98,7 @@ def _generate_random_sentence(
) -> str:
"""Generate a random sentence with words of random lengths."""
sentence_length = random.randint(min_sentence_length, max_sentence_length)
sentence: List[str] = []
sentence: list[str] = []
while len(" ".join(sentence)) < sentence_length:
word_length = random.randint(min_word_length, max_word_length)
word = _generate_random_word(word_length)
@@ -112,7 +112,7 @@ def _generate_random_sentences(
max_word_length: int,
min_sentence_length: int,
max_sentence_length: int,
) -> List[str]:
) -> list[str]:
"""Generate a list of random sentences."""
text_col = [
_generate_random_sentence(
@@ -123,7 +123,7 @@ def _generate_random_sentences(
return text_col


def _make_num_rows_none(column: List[Any], num_none: int) -> List[Any]:
def _make_num_rows_none(column: list[Any], num_none: int) -> list[Any]:
"""Assign none num_none times to the given list."""
column_copy = column.copy()
none_positions = random.sample(range(len(column_copy)), num_none)
@@ -154,29 +154,29 @@ def _generate_random_date_column(
end_date: datetime,
date_format: str = "%a %b %d %H:%M:%S %Y",
as_string: bool = True,
) -> List[Union[str, datetime]]:
) -> list[Union[str, datetime]]:
"""Generate a list of random dates."""
return [
_generate_random_date(start_date, end_date, date_format, as_string)
for _ in range(num_rows)
]


def _generate_random_int_column(num_rows: int, min_int: int, max_int: int) -> List[int]:
def _generate_random_int_column(num_rows: int, min_int: int, max_int: int) -> list[int]:
"""Generate a list of ints."""
return [random.randint(min_int, max_int) for _ in range(num_rows)]


def _generate_random_bool_column(num_rows: int) -> List[bool]:
def _generate_random_bool_column(num_rows: int) -> list[bool]:
"""Generate a list of bools."""
return [random.choice([True, False]) for _ in range(num_rows)]


def _generate_random_image_column(
num_rows: int,
image_size: Union[Tuple[int, int], Tuple[int, int, int]],
image_size: Union[tuple[int, int], tuple[int, int, int]],
simulate_type: str,
) -> List[Any]:
) -> list[Any]:
"""Simulate the images with the format that is found in HF Hub.
Directly using `Image.fromarray` does not work because it creates `PIL.Image.Image`.
@@ -207,7 +207,7 @@ def generate_random_audio_column(
num_rows: int,
sampling_rate: int,
length_in_samples: int,
) -> List[Dict[str, Any]]:
) -> list[dict[str, Any]]:
"""Simulate the audio column.
Audio column in the dataset is composed of an array of floats, sample_rate and a
@@ -365,8 +365,8 @@ def _mock_speach_commands(num_rows: int) -> Dataset:

def _load_mocked_dataset(
dataset_name: str,
num_rows: List[int],
split_names: List[str],
num_rows: list[int],
split_names: list[str],
subset: str = "",
) -> DatasetDict:
dataset_dict = {}
@@ -380,7 +380,7 @@ def _load_mocked_dataset_by_partial_download(
def _load_mocked_dataset_by_partial_download(
dataset_name: str,
split_name: str,
skip_take_list: List[Tuple[int, int]],
skip_take_list: list[tuple[int, int]],
subset_name: Optional[str] = None,
) -> Dataset:
"""Download a partial dataset.
@@ -423,8 +423,8 @@ def _load_mocked_dataset_by_partial_download(

def _load_mocked_dataset_dict_by_partial_download(
dataset_name: str,
split_names: List[str],
skip_take_lists: List[List[Tuple[int, int]]],
split_names: list[str],
skip_take_lists: list[list[tuple[int, int]]],
subset_name: Optional[str] = None,
) -> DatasetDict:
"""Like _load_mocked_dataset_by_partial_download but for many splits."""
2 changes: 2 additions & 0 deletions datasets/flwr_datasets/partitioner/__init__.py
@@ -27,6 +27,7 @@
from .partitioner import Partitioner
from .pathological_partitioner import PathologicalPartitioner
from .shard_partitioner import ShardPartitioner
from .size_partitioner import SizePartitioner
from .square_partitioner import SquarePartitioner

__all__ = [
@@ -42,5 +43,6 @@
"Partitioner",
"PathologicalPartitioner",
"ShardPartitioner",
"SizePartitioner",
"SquarePartitioner",
]
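
The newly exported `SizePartitioner` can be combined with `FederatedDataset` like the other partitioners; a hedged sketch (assuming `SizePartitioner` accepts explicit partition sizes via a `partition_sizes` argument — check the class docstring for the exact signature):

```python
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import SizePartitioner

# Hypothetical setup: three partitions with explicitly chosen sizes.
partitioner = SizePartitioner(partition_sizes=[1_000, 2_000, 3_000])
fds = FederatedDataset(dataset="mnist", partitioners={"train": partitioner})

partition = fds.load_partition(0)  # load the first partition
print(len(partition))
```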
