Merge branch 'main' into fix-aggregate-inplace

Showing 84 changed files with 1,885 additions and 579 deletions.

@@ -0,0 +1,69 @@
name: Build Docker Images Main Branch

on:
  push:
    branches:
      - 'main'

jobs:
  parameters:
    if: github.repository == 'adap/flower'
    name: Collect docker build parameters
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    outputs:
      pip-version: ${{ steps.versions.outputs.pip-version }}
      setuptools-version: ${{ steps.versions.outputs.setuptools-version }}
      flwr-version-ref: ${{ steps.versions.outputs.flwr-version-ref }}
    steps:
      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - uses: ./.github/actions/bootstrap
        id: bootstrap

      - id: versions
        run: |
          echo "pip-version=${{ steps.bootstrap.outputs.pip-version }}" >> "$GITHUB_OUTPUT"
          echo "setuptools-version=${{ steps.bootstrap.outputs.setuptools-version }}" >> "$GITHUB_OUTPUT"
          echo "flwr-version-ref=git+${{ github.server_url }}/${{ github.repository }}.git@${{ github.sha }}" >> "$GITHUB_OUTPUT"

  build-docker-base-images:
    name: Build base images
    if: github.repository == 'adap/flower'
    uses: ./.github/workflows/_docker-build.yml
    needs: parameters
    with:
      namespace-repository: flwr/base
      file-dir: src/docker/base/ubuntu
      build-args: |
        PIP_VERSION=${{ needs.parameters.outputs.pip-version }}
        SETUPTOOLS_VERSION=${{ needs.parameters.outputs.setuptools-version }}
        FLWR_VERSION_REF=${{ needs.parameters.outputs.flwr-version-ref }}
      tags: unstable
    secrets:
      dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}

  build-docker-binary-images:
    name: Build binary images
    if: github.repository == 'adap/flower'
    uses: ./.github/workflows/_docker-build.yml
    needs: build-docker-base-images
    strategy:
      fail-fast: false
      matrix:
        images: [
          { repository: "flwr/superlink", file_dir: "src/docker/superlink" },
          { repository: "flwr/supernode", file_dir: "src/docker/supernode" },
          { repository: "flwr/serverapp", file_dir: "src/docker/serverapp" },
          { repository: "flwr/superexec", file_dir: "src/docker/superexec" },
          { repository: "flwr/clientapp", file_dir: "src/docker/clientapp" }
        ]
    with:
      namespace-repository: ${{ matrix.images.repository }}
      file-dir: ${{ matrix.images.file_dir }}
      build-args: BASE_IMAGE=unstable
      tags: unstable
    secrets:
      dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}
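
Assuming the reusable `_docker-build.yml` workflow pushes these images to Docker Hub (the Docker Hub credentials passed as secrets suggest so), the `unstable` tags produced by a successful run of this workflow could be spot-checked locally with something like:

```shell
# Hedged sketch: pull the unstable tags this workflow is expected to publish.
docker pull flwr/base:unstable
docker pull flwr/superlink:unstable
docker pull flwr/supernode:unstable
docker pull flwr/serverapp:unstable
docker pull flwr/superexec:unstable
docker pull flwr/clientapp:unstable
```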

@@ -0,0 +1,46 @@
# FlowerTune LLM Evaluation

This directory provides various evaluation metrics to assess the quality of your fine-tuned LLMs.
If you are participating in the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard), evaluating your fine-tuned LLM is the final step before your submission is added to the [LLM Leaderboard](https://flower.ai/benchmarks/llm-leaderboard#how-to-participate). The evaluation scores generated here will be displayed as the definitive values on the LLM Leaderboard.

## How to run

Navigate to the directory corresponding to your selected challenge (`general NLP`, `finance`, `medical`, or `code`) and follow the instructions there to execute the evaluation; a minimal example is sketched below.

> [!NOTE]
> If you wish to participate in the LLM Leaderboard, you must not modify the evaluation code and should use the exact command provided in the respective directory to run the evaluation.
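
For example, for the `code` challenge (a sketch only; the `code` directory name is confirmed by the instructions below, while the other challenge directories are assumed to sit alongside it):

```shell
# From a clone of the flower repository:
cd benchmarks/flowertune-llm/evaluation/code
```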

## Baseline results

The default template generated by `flwr new` (see the [Project Creation Instructions](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm#create-a-new-project)) for each challenge will produce the following results, which serve as the lower bound on the LLM Leaderboard.

### General NLP

|          | MT-1 | MT-2 | MT-Avg |
|:--------:|:----:|:----:|:------:|
| MT Score | 5.54 | 5.52 |  5.53  |

### Finance

|         |  FPB  | FIQA  | TFNS  |  Avg  |
|:-------:|:-----:|:-----:|:-----:|:-----:|
| Acc (%) | 44.55 | 62.50 | 28.77 | 45.27 |

### Medical

|         | PubMedQA | MedMCQA | MedQA |  Avg  |
|:-------:|:--------:|:-------:|:-----:|:-----:|
| Acc (%) |  59.00   |  23.69  | 27.10 | 36.60 |

### Code

|            | MBPP  | HumanEval | MultiPL-E (JS) | MultiPL-E (C++) |  Avg  |
|:----------:|:-----:|:---------:|:--------------:|:---------------:|:-----:|
| Pass@1 (%) | 31.60 |   23.78   |     28.57      |      25.47      | 27.36 |
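
The `Avg` and `MT-Avg` columns are consistent with a plain (unweighted) mean of the per-dataset scores in each row; for example, for the Finance row:

```shell
# Unweighted mean of the three Finance accuracies (FPB, FIQA, TFNS):
python3 -c "print((44.55 + 62.50 + 28.77) / 3)"   # prints 45.2733..., i.e. ~45.27
```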

## Make submission on FlowerTune LLM Leaderboard

If your LLM outperforms the listed benchmarks in any challenge, we encourage you to submit your code and model to the FlowerTune LLM Leaderboard (see the [How-to-participate Instructions](https://flower.ai/benchmarks/llm-leaderboard#how-to-participate)).

@@ -0,0 +1,70 @@
# Evaluation for Code challenge

We leverage the code generation evaluation metrics provided by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main) to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp) (Python), [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval) (Python), and [MultiPL-E](https://github.com/nuprl/MultiPL-E) (JavaScript, C++).

> [!WARNING]
> The evaluation process requires roughly 30 GB of VRAM. On a 40 GB A100, it takes 15-30 minutes to complete, depending on the dataset.

## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/code ./flowertune-eval-code && rm -rf flower && cd flowertune-eval-code
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new Python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```

After that, install `Node.js` and `g++` for the evaluation of JavaScript and C++:

```shell
# Install nvm (Node Version Manager)
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash

# Restart your terminal

# Download and install Node.js (you may need to restart the terminal)
nvm install 20

# Install g++
sudo apt-get install g++
```
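
Before running the JavaScript and C++ tasks, a quick sanity check confirms both toolchains are available:

```shell
# Both commands should print a version string if the installation succeeded.
node --version
g++ --version
```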

Then, download the `main.py` script from the `bigcode-evaluation-harness` repository:

```shell
git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git && cd bigcode-evaluation-harness && git checkout 0f3e95f0806e78a4f432056cdb1be93604a51d69 && mv main.py ../ && cd .. && rm -rf bigcode-evaluation-harness
```

## Generate model answers & calculate pass@1 score

> [!NOTE]
> The evaluation needs to be run on MBPP, HumanEval, MultiPL-E (JS), and MultiPL-E (C++).

```bash
# --peft_model:             e.g., ./peft_1
# --max_length_generation:  change to 2048 when running mbpp
# --tasks:                  chosen from [mbpp, humaneval, multiple-js, multiple-cpp]
# --metric_output_path:     change the dataset name based on your choice
python main.py \
    --model=mistralai/Mistral-7B-v0.3 \
    --peft_model=/path/to/fine-tuned-peft-model-dir/ \
    --max_length_generation=1024 \
    --batch_size=4 \
    --use_auth_token \
    --allow_code_execution \
    --save_generations \
    --save_references \
    --tasks=humaneval \
    --metric_output_path=./evaluation_results_humaneval.json
```
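
Since all four tasks are required (see the note above), a convenience sketch such as the following can run them back to back; it only re-invokes the command above with the per-task settings already described (2048-token generation for MBPP, 1024 otherwise) and keeps the same placeholder PEFT path, so adjust that to your own directory.

```shell
# Sketch: run all four required tasks in sequence.
# Point --peft_model at your fine-tuned adapter directory before running.
for task in mbpp humaneval multiple-js multiple-cpp; do
    if [ "$task" = "mbpp" ]; then max_len=2048; else max_len=1024; fi
    python main.py \
        --model=mistralai/Mistral-7B-v0.3 \
        --peft_model=/path/to/fine-tuned-peft-model-dir/ \
        --max_length_generation="$max_len" \
        --batch_size=4 \
        --use_auth_token \
        --allow_code_execution \
        --save_generations \
        --save_references \
        --tasks="$task" \
        --metric_output_path="./evaluation_results_${task}.json"
done
```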

The model answers and pass@1 scores will be saved to `generations_{dataset_name}.json` and `evaluation_results_{dataset_name}.json`, respectively.

> [!NOTE]
> Please ensure that you provide all **four pass@1 scores** for the evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
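
To collect those four numbers, one option is simply to pretty-print each result file produced above; the exact JSON layout is determined by `bigcode-evaluation-harness`, so read the pass@1 values off the output rather than assuming specific keys:

```shell
# Pretty-print every result file; the pass@1 value for each task is reported
# inside the corresponding JSON.
for task in mbpp humaneval multiple-js multiple-cpp; do
    echo "== ${task} =="
    python3 -m json.tool "evaluation_results_${task}.json"
done
```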