
Commit

Merge branch 'main' into fix-aggregate-inplace
KarhouTam authored Sep 11, 2024
2 parents 41bc30d + 75d0243 commit 841cf0a
Showing 84 changed files with 1,885 additions and 579 deletions.
6 changes: 5 additions & 1 deletion .github/CODEOWNERS
@@ -7,7 +7,10 @@
README.md @jafermarq @tanertopal @danieljanes

# Flower Baselines
/baselines @jafermarq @tanertopal @danieljanes
/baselines @jafermarq @danieljanes

# Flower Benchmarks
/benchmarks @jafermarq @danieljanes

# Flower Datasets
/datasets @jafermarq @tanertopal @danieljanes
@@ -32,3 +35,4 @@ README.md @jafermarq @tanertopal @danieljanes
/.devcontainer @Robert-Steiner @Moep90
**/Dockerfile @Robert-Steiner @Moep90
**/*.Dockerfile @Robert-Steiner @Moep90
src/docker @Robert-Steiner @Moep90
18 changes: 5 additions & 13 deletions .github/workflows/_docker-build.yml
@@ -36,18 +36,16 @@ permissions:
jobs:
build:
name: Build image
runs-on: ubuntu-22.04
runs-on: ${{ matrix.platform.runner-os }}
timeout-minutes: 180
outputs:
build-id: ${{ steps.build-id.outputs.id }}
strategy:
fail-fast: true
matrix:
platform: [
# build-push action and qemu use different platform names
# therefore we create a map
{ name: "amd64", qemu: "", docker: "linux/amd64" },
{ name: "arm64", qemu: "arm64", docker: "linux/arm64" },
{ name: "amd64", docker: "linux/amd64", runner-os: "ubuntu-22.04" },
{ name: "arm64", docker: "linux/arm64", runner-os: "ubuntu-4-core-arm64" },
]
steps:
- name: Create build id
@@ -79,12 +77,6 @@ jobs:
print(build_args, file=fh)
print("EOF", file=fh)
- name: Set up QEMU
if: matrix.platform.qemu != ''
uses: docker/setup-qemu-action@49b3bc8e6bdd4a60e6116a5414239cba5943d3cf # v3.2.0
with:
platforms: ${{ matrix.platform.qemu }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1
@@ -104,7 +96,7 @@ jobs:
uses: Wandalen/wretry.action@6feedb7dedadeb826de0f45ff482b53b379a7844 # v3.5.0
id: build
with:
action: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
action: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
attempt_limit: 60 # 60 attempts * (9 secs delay + 1 sec retry) = ~10 mins
attempt_delay: 9000 # 9 secs
with: |
@@ -122,7 +114,7 @@ jobs:
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: digests-${{ steps.build-id.outputs.id }}-${{ matrix.platform.name }}
path: /tmp/digests/*
69 changes: 69 additions & 0 deletions .github/workflows/docker-build-main.yml
@@ -0,0 +1,69 @@
name: Build Docker Images Main Branch

on:
push:
branches:
- 'main'

jobs:
parameters:
if: github.repository == 'adap/flower'
name: Collect docker build parameters
runs-on: ubuntu-22.04
timeout-minutes: 10
outputs:
pip-version: ${{ steps.versions.outputs.pip-version }}
setuptools-version: ${{ steps.versions.outputs.setuptools-version }}
flwr-version-ref: ${{ steps.versions.outputs.flwr-version-ref }}
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- uses: ./.github/actions/bootstrap
id: bootstrap

- id: versions
run: |
echo "pip-version=${{ steps.bootstrap.outputs.pip-version }}" >> "$GITHUB_OUTPUT"
echo "setuptools-version=${{ steps.bootstrap.outputs.setuptools-version }}" >> "$GITHUB_OUTPUT"
echo "flwr-version-ref=git+${{ github.server_url }}/${{ github.repository }}.git@${{ github.sha }}" >> "$GITHUB_OUTPUT"
build-docker-base-images:
name: Build base images
if: github.repository == 'adap/flower'
uses: ./.github/workflows/_docker-build.yml
needs: parameters
with:
namespace-repository: flwr/base
file-dir: src/docker/base/ubuntu
build-args: |
PIP_VERSION=${{ needs.parameters.outputs.pip-version }}
SETUPTOOLS_VERSION=${{ needs.parameters.outputs.setuptools-version }}
FLWR_VERSION_REF=${{ needs.parameters.outputs.flwr-version-ref }}
tags: unstable
secrets:
dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}

build-docker-binary-images:
name: Build binary images
if: github.repository == 'adap/flower'
uses: ./.github/workflows/_docker-build.yml
needs: build-docker-base-images
strategy:
fail-fast: false
matrix:
images: [
{ repository: "flwr/superlink", file_dir: "src/docker/superlink" },
{ repository: "flwr/supernode", file_dir: "src/docker/supernode" },
{ repository: "flwr/serverapp", file_dir: "src/docker/serverapp" },
{ repository: "flwr/superexec", file_dir: "src/docker/superexec" },
{ repository: "flwr/clientapp", file_dir: "src/docker/clientapp" }
]
with:
namespace-repository: ${{ matrix.images.repository }}
file-dir: ${{ matrix.images.file_dir }}
build-args: BASE_IMAGE=unstable
tags: unstable
secrets:
dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}
4 changes: 1 addition & 3 deletions .github/workflows/e2e.yml
@@ -146,8 +146,6 @@ jobs:
if: ${{ github.repository == 'adap/flower' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' }}
run: |
python -m pip install https://${{ env.ARTIFACT_BUCKET }}/py/${{ needs.wheel.outputs.dir }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}
- name: Install e2e components
run: pip install .
- name: Download dataset
if: ${{ matrix.dataset }}
run: python -c "${{ matrix.dataset }}"
@@ -172,7 +170,7 @@ jobs:
run: ./../test_superlink.sh bare sqlite
- name: Run driver test with client authentication
if: ${{ matrix.directory == 'e2e-bare-auth' }}
run: ./../test_superlink.sh bare client-auth
run: ./../test_superlink.sh "${{ matrix.directory }}" client-auth
- name: Run reconnection test with SQLite database
if: ${{ matrix.directory == 'e2e-bare' }}
run: ./../test_reconnection.sh sqlite
7 changes: 5 additions & 2 deletions .github/workflows/framework-release.yml
@@ -16,6 +16,8 @@ jobs:
if: ${{ github.repository == 'adap/flower' }}
name: Publish release
runs-on: ubuntu-22.04
outputs:
flwr-version: ${{ steps.publish.outputs.flwr-version }}
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -26,10 +28,12 @@ jobs:
uses: ./.github/actions/bootstrap

- name: Get artifacts and publish
id: publish
env:
GITHUB_REF: ${{ github.ref }}
run: |
TAG_NAME=$(echo "${GITHUB_REF_NAME}" | cut -c2-)
echo "flwr-version=$TAG_NAME" >> "$GITHUB_OUTPUT"
wheel_name="flwr-${TAG_NAME}-py3-none-any.whl"
tar_name="flwr-${TAG_NAME}.tar.gz"
@@ -67,8 +71,7 @@ jobs:
- id: matrix
run: |
FLWR_VERSION=$(poetry version -s)
python dev/build-docker-image-matrix.py --flwr-version "${FLWR_VERSION}" > matrix.json
python dev/build-docker-image-matrix.py --flwr-version "${{ needs.publish.outputs.flwr-version }}" > matrix.json
echo "matrix=$(cat matrix.json)" >> $GITHUB_OUTPUT
build-base-images:
54 changes: 30 additions & 24 deletions benchmarks/flowertune-llm/README.md
@@ -1,4 +1,4 @@
![](_static/flower_llm.jpg)
![](_static/flower_llm.png)

# FlowerTune LLM Leaderboard

@@ -9,39 +9,40 @@ Please follow the instructions to run and evaluate the federated LLMs.

## Create a new project

As the first step, please register a Flower account on [Flower website](https://flower.ai/login).
Assuming `flwr` package is already installed on your system (check [here](https://flower.ai/docs/framework/how-to-install-flower.html) for `flwr` installation).
We provide a single-line command to create a new project directory based on your selected challenge:
As the first step, please register for a Flower account on [flower.ai/login](https://flower.ai/login).
Then, create a new Python environment and install Flower.

> [!TIP]
> We recommend using `pyenv` and the `virtualenv` plugin to create your environment. Other managers, such as Conda, would likely work too. Check the [documentation](https://flower.ai/docs/framework/how-to-install-flower.html) for alternative ways of installing Flower.
```shell
flwr new --framework=flwrtune --username=your_flower_account
pip install flwr
```

Then you will see a prompt to ask your project name and the choice of LLM challenges from the set of general NLP, finance, medical and code.
Type your project name and select your preferred challenge,
and then a new project directory will be generated automatically.

### Structure
In the new environment, create a new Flower project using the `FlowerTune` template. You will be prompted for a project name, your username, and your choice of LLM challenge:
```shell
flwr new --framework=FlowerTune
```

After running `flwr new`, you will see a new directory generated with the following structure:
The `flwr new` command will generate a directory with the following structure:

```bash
<project-name>
├── README.md # <- Instructions
├── pyproject.toml # <- Environment dependencies
├── pyproject.toml # <- Environment dependencies and configs
└── <project_name>
├── app.py # <- Flower ClientApp/ServerApp build
├── client.py # <- Flower client constructor
├── server.py # <- Sever-related functions
├── models.py # <- Model build
├── client_app.py # <- Flower ClientApp build
├── dataset.py # <- Dataset and tokenizer build
├── conf/config.yaml # <- User configuration
└── conf/static_config.yaml # <- Static configuration
├── models.py # <- Model build
├── server_app.py # <- Flower ServerApp build
└── strategy.py # <- Flower strategy build
```

This can serve as the starting point for you to build up your own federated LLM fine-tuning methods.
Please note that any modification to the content of `conf/static_config.yaml` is strictly prohibited for those who wish to participate in the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard).
Otherwise, the submission will not be considered.

> [!IMPORTANT]
> Please note that if you intend to submit your project as an entry to the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard), modifications to the `[tool.flwr.app.config.static]` and `[tool.flwr.federations.local-simulation]` sections of the `pyproject.toml` are not allowed and will invalidate the submission.

## Run FlowerTune LLM challenges

Expand All @@ -50,12 +51,17 @@ With a new project directory created, running a baseline challenge can be done b
1. Navigate inside the directory that you just created.


2. Follow the `Environments setup` section of `README.md` in the project directory to install project dependencies.
2. Follow the `Environments setup` section of `README.md` in the project directory to install the project dependencies.


3. Run the challenge as indicated in the `Running the challenge` section of the `README.md` (a typical command sequence is sketched below).
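
Concretely, the three steps above usually boil down to something like the following sketch. The project name placeholder and the use of `pip install -e .` and `flwr run` are assumptions about the generated template; the generated `README.md` remains the authoritative reference.

```shell
# Minimal sketch of the typical flow; defer to the generated README.md
# for the exact, challenge-specific commands.
cd <project-name>     # 1. navigate into the project created by `flwr new`
pip install -e .      # 2. install the project dependencies
flwr run .            # 3. run the challenge (federated fine-tuning)
```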

## Evaluate pre-trained LLMs
## Evaluate fine-tuned LLMs

Once the LLM fine-tuning has finished, evaluate the performance of your fine-tuned LLM
by following the `README.md` in the [`evaluation`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation) directory.


After the LLM fine-tuning finished, evaluate the performance of your pre-trained LLMs
following the `README.md` in `evaluation` directory.
> [!NOTE]
> If you have any questions about running the FlowerTune LLM challenges or the evaluation, please feel free to post in the [Flower Discuss](https://discuss.flower.ai) forum,
> or join our [Slack](https://flower.ai/join-slack/) and ask in the `#flowertune-llm-leaderboard` channel.
Binary file removed benchmarks/flowertune-llm/_static/flower_llm.jpg
Binary file added benchmarks/flowertune-llm/_static/flower_llm.png
46 changes: 46 additions & 0 deletions benchmarks/flowertune-llm/evaluation/README.md
@@ -0,0 +1,46 @@
# FlowerTune LLM Evaluation

This directory provides various evaluation metrics to assess the quality of your fine-tuned LLMs.
If you are participating in the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard), evaluating your fine-tuned LLM is the final step before your submission is added to the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard#how-to-participate). The evaluation scores generated here will be displayed as the definitive values on the LLM Leaderboard.

## How to run

Navigate to the directory corresponding to your selected challenge (`general NLP`, `finance`, `medical`, or `code`) and follow the instructions there to execute the evaluation.
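
For illustration, an evaluation run for the `code` challenge might start as in the sketch below. The directory name `code` matches this repository's layout; the other challenge directory names are assumptions, so check the repository before running.

```shell
# Example for the code challenge; substitute the directory matching your
# challenge and follow its README for the exact evaluation commands.
cd code
pip install -r requirements.txt   # install the challenge-specific dependencies
```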

> [!NOTE]
> If you wish to participate in the LLM Leaderboard, you must not modify the evaluation code and should use the exact command provided in the respective directory to run the evaluation.

## Baseline results

For each challenge, the default template generated by `flwr new` (see the [Project Creation Instructions](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm#create-a-new-project)) produces the results below, which serve as the lower bound on the LLM Leaderboard.

### General NLP

| | MT-1 | MT-2 | MT-Avg |
|:--------:|:----:|:----:|:------:|
| MT Score | 5.54 | 5.52 | 5.53 |

### Finance

| | FPB | FIQA | TFNS | Avg |
|:-------:|:-----:|:-----:|:-----:|:-----:|
| Acc (%) | 44.55 | 62.50 | 28.77 | 45.27 |

### Medical

| | PubMedQA | MedMCQA | MedQA | Avg |
|:-------:|:--------:|:-------:|:-----:|:-----:|
| Acc (%) | 59.00 | 23.69 | 27.10 | 36.60 |

### Code

| | MBPP | HumanEval | MultiPL-E (JS) | MultiPL-E (C++) | Avg |
|:----------:|:-----:|:---------:|:--------------:|:---------------:|:-----:|
| Pass@1 (%) | 31.60 | 23.78 | 28.57 | 25.47 | 27.36 |


## Make submission on FlowerTune LLM Leaderboard

If your LLM outperforms the listed benchmarks in any challenge,
we encourage you to submit your code and model to the FlowerTune LLM Leaderboard without hesitation (see the [How-to-participate Instructions](https://flower.ai/benchmarks/llm-leaderboard#how-to-participate)).
70 changes: 70 additions & 0 deletions benchmarks/flowertune-llm/evaluation/code/README.md
@@ -0,0 +1,70 @@
# Evaluation for Code challenge

We leverage the code generation evaluation metrics provided by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main) to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp) (Python), [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval) (Python), and [MultiPL-E](https://github.com/nuprl/MultiPL-E) (JavaScript, C++).

> [!WARNING]
> The evaluation process requires ~30 GB of VRAM. On a 40 GB A100, it takes 15-30 minutes to complete, depending on the dataset.
## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/code ./flowertune-eval-code && rm -rf flower && cd flowertune-eval-code
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new python environment, run:
pip install -r requirements.txt

# Log in to your HuggingFace account
huggingface-cli login
```

After that, install `Node.js` and `g++` for evaluating JavaScript and C++:

```shell
# Install nvm (Node Version Manager)
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash

# Restart your terminal

# Download and install Node.js (you may need to restart the terminal)
nvm install 20

# Install g++
sudo apt-get install g++
```

Then, download the `main.py` script from the `bigcode-evaluation-harness` repository.

```shell
git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git && cd bigcode-evaluation-harness && git checkout 0f3e95f0806e78a4f432056cdb1be93604a51d69 && mv main.py ../ && cd .. && rm -rf bigcode-evaluation-harness
```


## Generate model answers & calculate pass@1 score

> [!NOTE]
> Evaluation needs to be run on MBPP, HumanEval, MultiPL-E (JS) and MultiPL-E (C++).
```bash
# Notes:
#   --peft_model: path to your fine-tuned PEFT model directory, e.g., ./peft_1
#   --max_length_generation: change to 2048 when running mbpp
#   --tasks: choose from [mbpp, humaneval, multiple-js, multiple-cpp]
#   --metric_output_path: change the dataset name to match the chosen task
python main.py \
  --model=mistralai/Mistral-7B-v0.3 \
  --peft_model=/path/to/fine-tuned-peft-model-dir/ \
  --max_length_generation=1024 \
  --batch_size=4 \
  --use_auth_token \
  --allow_code_execution \
  --save_generations \
  --save_references \
  --tasks=humaneval \
  --metric_output_path=./evaluation_results_humaneval.json
```

The model answers and pass@1 scores will be saved to `generations_{dataset_name}.json` and `evaluation_results_{dataset_name}.json`, respectively.
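
For example, after a HumanEval run you can quickly sanity-check the output by pretty-printing the result file (assuming the file name follows the pattern above):

```shell
# Pretty-print the pass@1 results for a quick sanity check
python -m json.tool evaluation_results_humaneval.json
```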


> [!NOTE]
> Please ensure that you provide all **four pass@1 scores** for the evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).