
Commit

Merge branch 'update-xgb-comprehensive-example' of https://github.com/adap/flower into update-xgb-comprehensive-example
yan-gao-GY committed Sep 18, 2024
2 parents 6fb13bc + af6094f commit 5f58690
Showing 351 changed files with 93,928 additions and 77,113 deletions.
8 changes: 4 additions & 4 deletions .github/CODEOWNERS
```diff
@@ -32,7 +32,7 @@ README.md @jafermarq @tanertopal @danieljanes
 /.github/actions @Robert-Steiner @tanertopal @danieljanes
 
 # Docker-related files
-/.devcontainer @Robert-Steiner @Moep90
-**/Dockerfile @Robert-Steiner @Moep90
-**/*.Dockerfile @Robert-Steiner @Moep90
-src/docker @Robert-Steiner @Moep90
+/.devcontainer @Robert-Steiner @Moep90 @tanertopal @danieljanes
+**/Dockerfile @Robert-Steiner @Moep90 @tanertopal @danieljanes
+**/*.Dockerfile @Robert-Steiner @Moep90 @tanertopal @danieljanes
+/src/docker @Robert-Steiner @Moep90 @tanertopal @danieljanes
```
2 changes: 1 addition & 1 deletion .github/actions/bootstrap/action.yml
```diff
@@ -3,7 +3,7 @@ description: "Bootstrap Python environment (install and configure Python version
 inputs:
   python-version:
     description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
-    default: 3.8
+    default: 3.9
   pip-version:
     description: "Version of pip to be installed using pip"
     default: 24.1.2
```
2 changes: 1 addition & 1 deletion .github/workflows/cache-cleanup.yml
```diff
@@ -34,7 +34,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.9
 
       - name: Cleanup caches by directories
         # Only keep caches that match the latest keys for each directory
```
2 changes: 1 addition & 1 deletion .github/workflows/datasets-e2e.yml
```diff
@@ -45,7 +45,7 @@ jobs:
       - name: Bootstrap
         uses: ./.github/actions/bootstrap
         with:
-          python-version: 3.8
+          python-version: 3.9
       - name: Install dependencies
         run: python -m poetry install
       - name: Run tests
```
2 changes: 1 addition & 1 deletion .github/workflows/datasets.yml
```diff
@@ -37,7 +37,7 @@ jobs:
         # In case of a mismatch, the job has to download Python to install it.
         # Note: Due to a bug in actions/setup-python, we have to put "3.10" in
         # quotes as it will otherwise assume "3.1"
-        python: [3.8, 3.9, '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11']
 
     name: Python ${{ matrix.python }}
 
```
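The quoting caveat in the matrix comment above reflects YAML 1.1 scalar resolution: an unquoted `3.10` is parsed as the float `3.1`, so only the quoted form survives as a version string. A minimal demonstration, assuming PyYAML is installed:

```python
import yaml  # assumes PyYAML: pip install pyyaml

# Unquoted 3.10 resolves to the float 3.1; the quoted form stays a string.
print(yaml.safe_load("python: [3.9, 3.10, '3.10']"))
# {'python': [3.9, 3.1, '3.10']}
```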
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yml
```diff
@@ -127,7 +127,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.9
       - name: Install build tools
         run: |
           python -m pip install -U pip==23.3.1
```
2 changes: 1 addition & 1 deletion .github/workflows/framework.yml
```diff
@@ -25,7 +25,7 @@ jobs:
         # In case of a mismatch, the job has to download Python to install it.
         # Note: Due to a bug in actions/setup-python, we have to put "3.10" in
         # quotes as it will otherwise assume "3.1"
-        python: [3.8, 3.9, '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11']
 
     name: Python ${{ matrix.python }}
 
```
1 change: 0 additions & 1 deletion README.md
```diff
@@ -147,7 +147,6 @@ Other [examples](https://github.com/adap/flower/tree/main/examples):
 - [Federated Finetuning of a Vision Transformer](https://github.com/adap/flower/tree/main/examples/flowertune-vit)
 - [Advanced Flower with TensorFlow/Keras](https://github.com/adap/flower/tree/main/examples/advanced-tensorflow)
 - [Advanced Flower with PyTorch](https://github.com/adap/flower/tree/main/examples/advanced-pytorch)
-- Single-Machine Simulation of Federated Learning Systems ([PyTorch](https://github.com/adap/flower/tree/main/examples/simulation-pytorch)) ([Tensorflow](https://github.com/adap/flower/tree/main/examples/simulation-tensorflow))
 - [Comprehensive Flower+XGBoost](https://github.com/adap/flower/tree/main/examples/xgboost-comprehensive)
 - [Flower through Docker Compose and with Grafana dashboard](https://github.com/adap/flower/tree/main/examples/flower-via-docker-compose)
 - [Flower with KaplanMeierFitter from the lifelines library](https://github.com/adap/flower/tree/main/examples/federated-kaplan-meier-fitter)
```
21 changes: 11 additions & 10 deletions benchmarks/flowertune-llm/README.md
````diff
@@ -1,4 +1,4 @@
-![](_static/flower_llm.png)
+[![FlowerTune LLM Leaderboard](_static/flower_llm.png)](https://flower.ai/benchmarks/llm-leaderboard)
 
 # FlowerTune LLM Leaderboard
 
@@ -27,15 +27,16 @@ flwr new --framework=FlowerTune
 The `flwr new` command will generate a directory with the following structure:
 
 ```bash
-<project-name>
-├── README.md           # <- Instructions
-├── pyproject.toml      # <- Environment dependencies and configs
-└── <project_name>
-    ├── client_app.py   # <- Flower ClientApp build
-    ├── dataset.py      # <- Dataset and tokenizer build
-    ├── models.py       # <- Model build
-    ├── server_app.py   # <- Flower ServerApp build
-    └── strategy.py     # <- Flower strategy build
+<project_name>
+├── README.md           # Instructions
+├── pyproject.toml      # Environment dependencies and configs
+└── <project_name>
+    ├── __init__.py
+    ├── client_app.py   # Flower ClientApp build
+    ├── dataset.py      # Dataset and tokenizer build
+    ├── models.py       # Model build
+    ├── server_app.py   # Flower ServerApp build
+    └── strategy.py     # Flower strategy build
 ```
 
 This can serve as the starting point for you to build up your own federated LLM fine-tuning methods.
````
2 changes: 1 addition & 1 deletion benchmarks/flowertune-llm/evaluation/README.md
```diff
@@ -5,7 +5,7 @@ If you are participating [LLM Leaderboard](https://flower.ai/benchmarks/llm-lead
 
 ## How to run
 
-Navigate to the directory corresponding to your selected challenge (`general NLP`, `finance`, `medical`, or `code`) and follow the instructions there to execute the evaluation.
+Navigate to the directory corresponding to your selected challenge ([`general NLP`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/general-nlp), [`finance`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/finance), [`medical`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/medical), or [`code`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation/code)) and follow the instructions there to execute the evaluation.
 
 > [!NOTE]
 > If you wish to participate in the LLM Leaderboard, you must not modify the evaluation code and should use the exact command provided in the respective directory to run the evaluation.
```
70 changes: 70 additions & 0 deletions benchmarks/flowertune-llm/evaluation/code/README.md
@@ -0,0 +1,70 @@
# Evaluation for Code challenge

We leverage the code generation evaluation metrics provided by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main) to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp) (Python), [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval) (Python), and [MultiPL-E](https://github.com/nuprl/MultiPL-E) (JavaScript, C++).

> [!WARNING]
> The evaluation process requires ~30 GB of VRAM. On a 40 GB A100, each dataset takes 15-30 minutes to complete.

## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/code ./flowertune-eval-code && rm -rf flower && cd flowertune-eval-code
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new Python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```

After that, install `Node.js` and `g++` for the JavaScript and C++ evaluations:

```shell
# Install nvm (Node Version Manager)
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash

# Restart your terminal, then download and install Node.js
nvm install 20

# Install g++
sudo apt-get install g++
```

Then, download the `main.py` script from the `bigcode-evaluation-harness` repository at a pinned commit:

```shell
git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git && cd bigcode-evaluation-harness && git checkout 0f3e95f0806e78a4f432056cdb1be93604a51d69 && mv main.py ../ && cd .. && rm -rf bigcode-evaluation-harness
```


## Generate model answers & calculate pass@1 score

> [!NOTE]
> Evaluation needs to be run on all four tasks: MBPP, HumanEval, MultiPL-E (JS), and MultiPL-E (C++).

```bash
python main.py \
--model=mistralai/Mistral-7B-v0.3 \
--peft_model=/path/to/fine-tuned-peft-model-dir/ \ # e.g., ./peft_1
--max_length_generation=1024 \ # change to 2048 when running mbpp
--batch_size=4 \
--use_auth_token \
--allow_code_execution \
--save_generations \
--save_references \
--tasks=humaneval \ # chosen from [mbpp, humaneval, multiple-js, multiple-cpp]
--metric_output_path=./evaluation_results_humaneval.json # change dataset name based on your choice
```

The model answers and pass@1 scores will be saved to `generations_{dataset_name}.json` and `evaluation_results_{dataset_name}.json`, respectively.
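To aggregate the four scores for a submission, a short script can read them back from the result files. This is a minimal sketch, assuming the harness's output layout for `evaluation_results_{dataset_name}.json` (a top-level key per task holding a `pass@1` entry), which may differ across harness versions:

```python
import json

# Collect pass@1 from the four result files
# (assumed layout: {"<task>": {"pass@1": ...}, "config": {...}}).
for task in ["mbpp", "humaneval", "multiple-js", "multiple-cpp"]:
    with open(f"evaluation_results_{task}.json") as f:
        results = json.load(f)
    print(f"{task}: pass@1 = {results[task]['pass@1']:.4f}")
```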


> [!NOTE]
> Please ensure that you provide all **four pass@1 scores** for the evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
7 changes: 7 additions & 0 deletions benchmarks/flowertune-llm/evaluation/code/requirements.txt
@@ -0,0 +1,7 @@
```
peft==0.6.2
datasets==2.20.0
evaluate==0.3.0
sentencepiece==0.2.0
protobuf==5.27.1
bitsandbytes==0.43.1
git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@0f3e95f0806e78a4f432056cdb1be93604a51d69
```
40 changes: 40 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/README.md
@@ -0,0 +1,40 @@
# Evaluation for Finance challenge

We build a sentiment classification pipeline on finance-related text to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [FPB](https://huggingface.co/datasets/takala/financial_phrasebank), [FIQA](https://huggingface.co/datasets/pauri32/fiqa-2018), and [TFNS](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment).


## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/finance ./flowertune-eval-finance && rm -rf flower && cd flowertune-eval-finance
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new Python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```

## Generate model decision & calculate accuracy

> [!NOTE]
> Please ensure that you use `--quantization=4` when running the evaluation if you wish to participate in the LLM Leaderboard.

```bash
python eval.py \
--peft-path=/path/to/fine-tuned-peft-model-dir/ \ # e.g., ./peft_1
--run-name=fl \ # specified name for this run
--batch-size=32 \
--quantization=4 \
--datasets=fpb,fiqa,tfns
```

The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.
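To assemble the three values for a submission, something like the following sketch works; it assumes each `acc_{dataset_name}_{run_name}.txt` file contains the accuracy as a plain decimal somewhere in its text, which is an assumption about the output format rather than a documented guarantee:

```python
import re
from pathlib import Path

# Extract the accuracy from each acc_{dataset}_{run_name}.txt file
# (assumption: the value appears as a decimal such as 0.8412).
run_name = "fl"
for dataset in ["fpb", "fiqa", "tfns"]:
    text = Path(f"benchmarks/acc_{dataset}_{run_name}.txt").read_text()
    match = re.search(r"\d+\.\d+", text)
    print(f"{dataset}: accuracy = {match.group() if match else 'not found'}")
```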

> [!NOTE]
> Please ensure that you provide all **three accuracy values** (FPB, FIQA, TFNS) for the three evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
135 changes: 135 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/benchmarks.py
@@ -0,0 +1,135 @@
```python
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from utils import (
    add_instruct,
    change_target,
    format_example,
    generate_label,
    load_data,
    save_results,
)


def infer_fiqa(model, tokenizer, batch_size, run_name):
    name = "fiqa"
    dataset = load_data("pauri32/fiqa-2018", concat=True)

    # Post process
    dataset["output"] = dataset.sentiment_score.apply(generate_label)
    dataset["instruction"] = dataset.apply(add_instruct, axis=1)
    dataset = dataset[["sentence", "output", "instruction"]]
    dataset.columns = ["input", "output", "instruction"]

    dataset[["context", "target"]] = dataset.apply(
        format_example, axis=1, result_type="expand"
    )

    # Print example
    print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

    # Run inference
    dataset, acc = inference(dataset, model, tokenizer, batch_size)

    # Save results and generations
    save_results(name, run_name, dataset, acc)


def infer_fpb(model, tokenizer, batch_size, run_name):
    name = "fpb"
    dataset = load_data("takala/financial_phrasebank", "sentences_50agree")

    # Post process
    dataset.columns = ["input", "output"]
    dic = {0: "negative", 1: "neutral", 2: "positive"}
    dataset["output"] = dataset["output"].apply(lambda x: dic[x])

    dataset["instruction"] = (
        "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    )
    dataset[["context", "target"]] = dataset.apply(
        format_example, axis=1, result_type="expand"
    )

    # Print example
    print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

    # Run inference
    dataset, acc = inference(dataset, model, tokenizer, batch_size)

    # Save results and generations
    save_results(name, run_name, dataset, acc)


def infer_tfns(model, tokenizer, batch_size, run_name):
    name = "tfns"
    dataset = load_data(
        "zeroshot/twitter-financial-news-sentiment", valid_set="validation"
    )

    # Post process
    dic = {0: "negative", 1: "positive", 2: "neutral"}
    dataset["label"] = dataset["label"].apply(lambda x: dic[x])

    dataset["instruction"] = (
        "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."
    )

    dataset.columns = ["input", "output", "instruction"]
    dataset[["context", "target"]] = dataset.apply(
        format_example, axis=1, result_type="expand"
    )

    # Print example
    print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

    # Run inference
    dataset, acc = inference(dataset, model, tokenizer, batch_size)

    # Save results and generations
    save_results(name, run_name, dataset, acc)


def inference(dataset, model, tokenizer, batch_size):
    context = dataset["context"].tolist()

    last_batch = dataset.shape[0] % batch_size
    total_steps = dataset.shape[0] // batch_size + 1
    print(
        f"Total len: {len(context)}. Batch size: {batch_size}. Total steps: {total_steps}"
    )

    out_text_list = []
    for i in tqdm(range(total_steps)):
        idx_s = i * batch_size
        tmp_context = (
            context[idx_s : idx_s + last_batch]
            if i == total_steps - 1
            else context[idx_s : idx_s + batch_size]
        )

        if tmp_context:
            tokens = tokenizer(
                tmp_context,
                return_tensors="pt",
                padding=True,
                max_length=512,
                return_token_type_ids=False,
            )
            for k in tokens.keys():
                tokens[k] = tokens[k].cuda()
            res = model.generate(
                **tokens, max_length=512, eos_token_id=tokenizer.eos_token_id
            )
            res_sentences = [tokenizer.decode(i, skip_special_tokens=True) for i in res]
            out_text = [o.split("Answer: ")[1] for o in res_sentences]
            out_text_list += out_text
            torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list
    dataset["new_target"] = dataset["target"].apply(change_target)
    dataset["new_out"] = dataset["out_text"].apply(change_target)

    acc = accuracy_score(dataset["new_target"], dataset["new_out"])

    return dataset, acc
```
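This file only defines the per-dataset helpers; the committed `eval.py` (invoked in the README above) is the actual entry point. For orientation, here is a hedged sketch of how these helpers might be driven — the base model name, adapter path, and 4-bit settings below are assumptions mirroring the README's flags, not the committed `eval.py`:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from benchmarks import infer_fiqa, infer_fpb, infer_tfns

BASE = "mistralai/Mistral-7B-v0.3"  # assumed base model

tokenizer = AutoTokenizer.from_pretrained(BASE)
tokenizer.pad_token = tokenizer.eos_token  # enable batched padding

model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # ~ --quantization=4
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "./peft_1")  # fine-tuned adapter dir
model.eval()

# Run all three finance benchmarks with the README's settings
for infer in (infer_fpb, infer_fiqa, infer_tfns):
    infer(model, tokenizer, batch_size=32, run_name="fl")
```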