Commit: Merge branch 'main' into feat/cron-job-rebuild-latest-image
Showing 40 changed files with 1,222 additions and 484 deletions.
@@ -0,0 +1,69 @@

```yaml
name: Build Docker Images Main Branch

on:
  push:
    branches:
      - 'main'

jobs:
  parameters:
    if: github.repository == 'adap/flower'
    name: Collect docker build parameters
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    outputs:
      pip-version: ${{ steps.versions.outputs.pip-version }}
      setuptools-version: ${{ steps.versions.outputs.setuptools-version }}
      flwr-version-ref: ${{ steps.versions.outputs.flwr-version-ref }}
    steps:
      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - uses: ./.github/actions/bootstrap
        id: bootstrap

      - id: versions
        run: |
          echo "pip-version=${{ steps.bootstrap.outputs.pip-version }}" >> "$GITHUB_OUTPUT"
          echo "setuptools-version=${{ steps.bootstrap.outputs.setuptools-version }}" >> "$GITHUB_OUTPUT"
          echo "flwr-version-ref=git+${{ github.server_url }}/${{ github.repository }}.git@${{ github.sha }}" >> "$GITHUB_OUTPUT"

  build-docker-base-images:
    name: Build base images
    if: github.repository == 'adap/flower'
    uses: ./.github/workflows/_docker-build.yml
    needs: parameters
    with:
      namespace-repository: flwr/base
      file-dir: src/docker/base/ubuntu
      build-args: |
        PIP_VERSION=${{ needs.parameters.outputs.pip-version }}
        SETUPTOOLS_VERSION=${{ needs.parameters.outputs.setuptools-version }}
        FLWR_VERSION_REF=${{ needs.parameters.outputs.flwr-version-ref }}
      tags: unstable
    secrets:
      dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}

  build-docker-binary-images:
    name: Build binary images
    if: github.repository == 'adap/flower'
    uses: ./.github/workflows/_docker-build.yml
    needs: build-docker-base-images
    strategy:
      fail-fast: false
      matrix:
        images: [
          { repository: "flwr/superlink", file_dir: "src/docker/superlink" },
          { repository: "flwr/supernode", file_dir: "src/docker/supernode" },
          { repository: "flwr/serverapp", file_dir: "src/docker/serverapp" },
          { repository: "flwr/superexec", file_dir: "src/docker/superexec" },
          { repository: "flwr/clientapp", file_dir: "src/docker/clientapp" }
        ]
    with:
      namespace-repository: ${{ matrix.images.repository }}
      file-dir: ${{ matrix.images.file_dir }}
      build-args: BASE_IMAGE=unstable
      tags: unstable
    secrets:
      dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}
```
@@ -0,0 +1,38 @@

# Evaluation for Medical challenge

We build a medical question answering (QA) pipeline to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [PubMedQA](https://huggingface.co/datasets/bigbio/pubmed_qa), [MedMCQA](https://huggingface.co/datasets/medmcqa), and [MedQA](https://huggingface.co/datasets/bigbio/med_qa).

## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/medical ./flowertune-eval-medical && rm -rf flower && cd flowertune-eval-medical
```

Create a new Python environment (we recommend Python 3.10), activate it, then install the dependencies with:

```shell
# From a new Python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```

## Generate model decisions & calculate accuracy

```bash
python eval.py \
    --peft-path=/path/to/fine-tuned-peft-model-dir/ \ # e.g., ./peft_1
    --run-name=fl \ # specified name for this run
    --batch-size=16 \
    --quantization=4 \
    --datasets=pubmedqa,medmcqa,medqa
```

The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.

> [!NOTE]
> Please ensure that you provide all **three accuracy values (PubMedQA, MedMCQA, MedQA)**, one per evaluation dataset, when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
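
After `eval.py` finishes, the three values can be collected from the output files. The snippet below is a minimal sketch of doing that; it assumes each `benchmarks/acc_{dataset_name}_{run_name}.txt` file contains the accuracy as a plain number, which is an assumption about what `save_results` writes rather than something specified here.

```python
# Sketch: gather the three accuracy values for a leaderboard submission.
# Assumption: each acc_{dataset}_{run}.txt holds the accuracy as a number;
# the exact file layout is defined by eval.py / save_results.
import re
from pathlib import Path

RUN_NAME = "fl"  # must match the --run-name passed to eval.py
DATASETS = ["pubmedqa", "medmcqa", "medqa"]

for dataset in DATASETS:
    path = Path("benchmarks") / f"acc_{dataset}_{RUN_NAME}.txt"
    text = path.read_text()
    # Take the last number found in the file as the accuracy value.
    numbers = re.findall(r"\d+\.\d+|\d+", text)
    acc = float(numbers[-1]) if numbers else None
    print(f"{dataset}: {acc}")
```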
benchmarks/flowertune-llm/evaluation/medical/benchmarks.py (174 additions, 0 deletions)
@@ -0,0 +1,174 @@

```python
import json

import pandas as pd
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import format_answer, format_example, save_results

import datasets

# The instructions refer to Meditron evaluation:
# https://github.com/epfLLM/meditron/blob/main/evaluation/instructions.json
INSTRUCTIONS = {
    "pubmedqa": "As an expert doctor in clinical science and medical knowledge, can you tell me if the following statement is correct? Answer yes, no, or maybe.",
    "medqa": "You are a medical doctor taking the US Medical Licensing Examination. You need to demonstrate your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy. Show your ability to apply the knowledge essential for medical practice. For the following multiple-choice question, select one correct answer from A to E. Base your answer on the current and standard practices referenced in medical guidelines.",
    "medmcqa": "You are a medical doctor answering realworld medical entrance exam questions. Based on your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy, answer the following multiple-choice question. Select one correct answer from A to D. Base your answer on the current and standard practices referenced in medical guidelines.",
}


def infer_pubmedqa(model, tokenizer, batch_size, run_name):
    name = "pubmedqa"
    answer_type = "boolean"
    dataset = datasets.load_dataset(
        "bigbio/pubmed_qa",
        "pubmed_qa_labeled_fold0_source",
        split="test",
        trust_remote_code=True,
    )
    # Post process
    instruction = INSTRUCTIONS[name]

    def post_process(row):
        context = "\n".join(row["CONTEXTS"])
        row["prompt"] = f"{context}\n{row['QUESTION']}"
        row["gold"] = row["final_decision"]
        row["long_answer"] = row["LONG_ANSWER"]
        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
        return row

    dataset = dataset.map(post_process)

    # Generate results
    generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def infer_medqa(model, tokenizer, batch_size, run_name):
    name = "medqa"
    answer_type = "mcq"
    dataset = datasets.load_dataset(
        "bigbio/med_qa",
        "med_qa_en_4options_source",
        split="test",
        trust_remote_code=True,
    )

    # Post process
    instruction = INSTRUCTIONS[name]

    def post_process(row):
        choices = [opt["value"] for opt in row["options"]]
        row["prompt"] = format_example(row["question"], choices)
        for opt in row["options"]:
            if opt["value"] == row["answer"]:
                row["gold"] = opt["key"]
                break
        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
        return row

    dataset = dataset.map(post_process)

    # Generate results
    generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def infer_medmcqa(model, tokenizer, batch_size, run_name):
    name = "medmcqa"
    answer_type = "mcq"
    dataset = datasets.load_dataset(
        "medmcqa", split="validation", trust_remote_code=True
    )

    # Post process
    instruction = INSTRUCTIONS[name]

    def post_process(row):
        options = [row["opa"], row["opb"], row["opc"], row["opd"]]
        answer = int(row["cop"])
        row["prompt"] = format_example(row["question"], options)
        row["gold"] = chr(ord("A") + answer) if answer in [0, 1, 2, 3] else None
        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
        return row

    dataset = dataset.map(post_process)

    # Generate results
    generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def generate_results(
    name, run_name, dataset, model, tokenizer, batch_size, answer_type
):
    # Run inference
    prediction = inference(dataset, model, tokenizer, batch_size)

    # Calculate accuracy
    acc = accuracy_compute(prediction, answer_type)

    # Save results and generations
    save_results(name, run_name, prediction, acc)


def inference(dataset, model, tokenizer, batch_size):
    columns_process = ["prompt", "gold"]
    dataset_process = pd.DataFrame(dataset, columns=dataset.features)[columns_process]
    dataset_process = dataset_process.assign(output="Null")
    temperature = 1.0

    inference_data = json.loads(dataset_process.to_json(orient="records"))
    data_loader = DataLoader(inference_data, batch_size=batch_size, shuffle=False)

    batch_counter = 0
    for batch in tqdm(data_loader, total=len(data_loader), position=0, leave=True):
        prompts = [
            f"<|im_start|>question\n{prompt}<|im_end|>\n<|im_start|>answer\n"
            for prompt in batch["prompt"]
        ]
        if batch_counter == 0:
            print(prompts[0])

        # Process tokenizer
        stop_seq = ["###"]
        if tokenizer.eos_token is not None:
            stop_seq.append(tokenizer.eos_token)
        if tokenizer.pad_token is not None:
            stop_seq.append(tokenizer.pad_token)
        max_new_tokens = len(
            tokenizer(batch["gold"][0], add_special_tokens=False)["input_ids"]
        )

        outputs = []
        for prompt in prompts:
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
            output_ids = model.generate(
                inputs=input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                top_p=1.0,
                temperature=temperature,
                pad_token_id=tokenizer.eos_token_id,
            )
            output_ids = output_ids[0][len(input_ids[0]) :]
            output = tokenizer.decode(output_ids, skip_special_tokens=True)
            outputs.append(output)

        for prompt, out in zip(batch["prompt"], outputs):
            dataset_process.loc[dataset_process["prompt"] == prompt, "output"] = out
        batch_counter += 1

    return dataset_process


def accuracy_compute(dataset, answer_type):
    dataset = json.loads(dataset.to_json(orient="records"))
    preds, golds = [], []
    for row in dataset:
        answer = row["gold"].lower()
        output = row["output"].lower()
        pred, gold = format_answer(output, answer, answer_type=answer_type)
        preds.append(pred)
        golds.append(gold)

    accuracy = accuracy_score(preds, golds)

    return accuracy
```
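
The helpers above all share the signature `infer_*(model, tokenizer, batch_size, run_name)`. The snippet below is a rough sketch of how a driver script (such as the `eval.py` mentioned in the README) might wire them up; the base model name and the PEFT loading details are assumptions for illustration, not the repository's actual implementation.

```python
# Hypothetical driver sketch, not the repository's eval.py.
# Assumes a causal LM base checkpoint (placeholder name below), a LoRA/PEFT
# adapter directory, and the transformers/peft libraries installed.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

from benchmarks import infer_medmcqa, infer_medqa, infer_pubmedqa

BASE_MODEL = "mistralai/Mistral-7B-v0.1"  # placeholder; use your fine-tuned base
PEFT_PATH = "./peft_1"  # e.g. the --peft-path from the README
RUN_NAME = "fl"
BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(model, PEFT_PATH)  # attach the fine-tuned adapter
model.eval()

# Each helper loads its dataset, runs generation, and saves accuracy via save_results.
infer_pubmedqa(model, tokenizer, BATCH_SIZE, RUN_NAME)
infer_medmcqa(model, tokenizer, BATCH_SIZE, RUN_NAME)
infer_medqa(model, tokenizer, BATCH_SIZE, RUN_NAME)
```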