Merge branch 'main' into feat/cron-job-rebuild-latest-image
Robert-Steiner authored Sep 10, 2024
2 parents 25797eb + 8f8639f commit b232c38
Showing 40 changed files with 1,222 additions and 484 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -35,3 +35,4 @@ README.md @jafermarq @tanertopal @danieljanes
/.devcontainer @Robert-Steiner @Moep90
**/Dockerfile @Robert-Steiner @Moep90
**/*.Dockerfile @Robert-Steiner @Moep90
src/docker @Robert-Steiner @Moep90
69 changes: 69 additions & 0 deletions .github/workflows/docker-build-main.yml
@@ -0,0 +1,69 @@
name: Build Docker Images Main Branch

on:
push:
branches:
- 'main'

jobs:
parameters:
if: github.repository == 'adap/flower'
name: Collect docker build parameters
runs-on: ubuntu-22.04
timeout-minutes: 10
outputs:
pip-version: ${{ steps.versions.outputs.pip-version }}
setuptools-version: ${{ steps.versions.outputs.setuptools-version }}
flwr-version-ref: ${{ steps.versions.outputs.flwr-version-ref }}
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- uses: ./.github/actions/bootstrap
id: bootstrap

- id: versions
run: |
echo "pip-version=${{ steps.bootstrap.outputs.pip-version }}" >> "$GITHUB_OUTPUT"
echo "setuptools-version=${{ steps.bootstrap.outputs.setuptools-version }}" >> "$GITHUB_OUTPUT"
echo "flwr-version-ref=git+${{ github.server_url }}/${{ github.repository }}.git@${{ github.sha }}" >> "$GITHUB_OUTPUT"
build-docker-base-images:
name: Build base images
if: github.repository == 'adap/flower'
uses: ./.github/workflows/_docker-build.yml
needs: parameters
with:
namespace-repository: flwr/base
file-dir: src/docker/base/ubuntu
build-args: |
PIP_VERSION=${{ needs.parameters.outputs.pip-version }}
SETUPTOOLS_VERSION=${{ needs.parameters.outputs.setuptools-version }}
FLWR_VERSION_REF=${{ needs.parameters.outputs.flwr-version-ref }}
tags: unstable
secrets:
dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}

build-docker-binary-images:
name: Build binary images
if: github.repository == 'adap/flower'
uses: ./.github/workflows/_docker-build.yml
needs: build-docker-base-images
strategy:
fail-fast: false
matrix:
images: [
{ repository: "flwr/superlink", file_dir: "src/docker/superlink" },
{ repository: "flwr/supernode", file_dir: "src/docker/supernode" },
{ repository: "flwr/serverapp", file_dir: "src/docker/serverapp" },
{ repository: "flwr/superexec", file_dir: "src/docker/superexec" },
{ repository: "flwr/clientapp", file_dir: "src/docker/clientapp" }
]
with:
namespace-repository: ${{ matrix.images.repository }}
file-dir: ${{ matrix.images.file_dir }}
build-args: BASE_IMAGE=unstable
tags: unstable
secrets:
dockerhub-user: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }}
4 changes: 1 addition & 3 deletions .github/workflows/e2e.yml
@@ -146,8 +146,6 @@ jobs:
if: ${{ github.repository == 'adap/flower' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' }}
run: |
python -m pip install https://${{ env.ARTIFACT_BUCKET }}/py/${{ needs.wheel.outputs.dir }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}
- name: Install e2e components
run: pip install .
- name: Download dataset
if: ${{ matrix.dataset }}
run: python -c "${{ matrix.dataset }}"
@@ -172,7 +170,7 @@
run: ./../test_superlink.sh bare sqlite
- name: Run driver test with client authentication
if: ${{ matrix.directory == 'e2e-bare-auth' }}
run: ./../test_superlink.sh bare client-auth
run: ./../test_superlink.sh "${{ matrix.directory }}" client-auth
- name: Run reconnection test with SQLite database
if: ${{ matrix.directory == 'e2e-bare' }}
run: ./../test_reconnection.sh sqlite
7 changes: 5 additions & 2 deletions .github/workflows/framework-release.yml
@@ -16,6 +16,8 @@ jobs:
if: ${{ github.repository == 'adap/flower' }}
name: Publish release
runs-on: ubuntu-22.04
outputs:
flwr-version: ${{ steps.publish.outputs.flwr-version }}
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -26,10 +28,12 @@
uses: ./.github/actions/bootstrap

- name: Get artifacts and publish
id: publish
env:
GITHUB_REF: ${{ github.ref }}
run: |
TAG_NAME=$(echo "${GITHUB_REF_NAME}" | cut -c2-)
echo "flwr-version=$TAG_NAME" >> "$GITHUB_OUTPUT"
wheel_name="flwr-${TAG_NAME}-py3-none-any.whl"
tar_name="flwr-${TAG_NAME}.tar.gz"
@@ -67,8 +71,7 @@ jobs:
- id: matrix
run: |
FLWR_VERSION=$(poetry version -s)
python dev/build-docker-image-matrix.py --flwr-version "${FLWR_VERSION}" > matrix.json
python dev/build-docker-image-matrix.py --flwr-version "${{ needs.publish.outputs.flwr-version }}" > matrix.json
echo "matrix=$(cat matrix.json)" >> $GITHUB_OUTPUT
build-base-images:
2 changes: 1 addition & 1 deletion benchmarks/flowertune-llm/evaluation/README.md
@@ -37,7 +37,7 @@ The default template generated by `flwr new` (see the [Project Creation Instruct

| | MBPP | HumanEval | MultiPL-E (JS) | MultiPL-E (C++) | Avg |
|:----------:|:-----:|:---------:|:--------------:|:---------------:|:-----:|
| Pass@1 (%) | 32.60 | 26.83 | 29.81 | 24.22 | 28.37 |
| Pass@1 (%) | 31.60 | 23.78 | 28.57 | 25.47 | 27.36 |


## Make submission on FlowerTune LLM Leaderboard
2 changes: 1 addition & 1 deletion benchmarks/flowertune-llm/evaluation/general-nlp/README.md
@@ -23,7 +23,7 @@ huggingface-cli login
Download data from [FastChat](https://github.com/lm-sys/FastChat):

```shell
git clone --depth=1 https://github.com/lm-sys/FastChat.git && cd FastChat && git checkout d561f87b24de197e25e3ddf7e09af93ced8dfe36 && mv fastchat/llm_judge/data ../data && cd .. && rm -rf FastChat
git clone https://github.com/lm-sys/FastChat.git && cd FastChat && git checkout d561f87b24de197e25e3ddf7e09af93ced8dfe36 && mv fastchat/llm_judge/data ../data && cd .. && rm -rf FastChat
```


38 changes: 38 additions & 0 deletions benchmarks/flowertune-llm/evaluation/medical/README.md
@@ -0,0 +1,38 @@
# Evaluation for Medical challenge

We build a medical question answering (QA) pipeline to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [PubMedQA](https://huggingface.co/datasets/bigbio/pubmed_qa), [MedMCQA](https://huggingface.co/datasets/medmcqa), and [MedQA](https://huggingface.co/datasets/bigbio/med_qa).


## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/medical ./flowertune-eval-medical && rm -rf flower && cd flowertune-eval-medical
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From the new Python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```

## Generate model decision & calculate accuracy

```bash
# --peft-path: path to the fine-tuned PEFT model directory, e.g., ./peft_1
# --run-name:  a name for this run, used in the output file names
python eval.py \
    --peft-path=/path/to/fine-tuned-peft-model-dir/ \
    --run-name=fl \
    --batch-size=16 \
    --quantization=4 \
    --datasets=pubmedqa,medmcqa,medqa
```

The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.
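
For illustration, here is a minimal sketch (not part of this commit) of how the saved files could be inspected; the exact fields in the JSONL records depend on `utils.save_results`, which is not included in this diff:

```python
import json
from pathlib import Path

dataset_name, run_name = "pubmedqa", "fl"  # match the --datasets/--run-name used above

# Accuracy is written to a .txt file (exact contents depend on utils.save_results)
acc_path = Path(f"benchmarks/acc_{dataset_name}_{run_name}.txt")
print("accuracy:", acc_path.read_text().strip())

# Generations are written as JSONL, i.e. one JSON record per line
gen_path = Path(f"benchmarks/generation_{dataset_name}_{run_name}.jsonl")
with gen_path.open() as f:
    first_record = json.loads(f.readline())
    print(first_record)
```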


> [!NOTE]
> Please ensure that you provide all **three accuracy values (PubMedQA, MedMCQA, MedQA)** for the three evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
174 changes: 174 additions & 0 deletions benchmarks/flowertune-llm/evaluation/medical/benchmarks.py
@@ -0,0 +1,174 @@
import json

import pandas as pd
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import format_answer, format_example, save_results

import datasets

# The instructions refer to Meditron evaluation:
# https://github.com/epfLLM/meditron/blob/main/evaluation/instructions.json
INSTRUCTIONS = {
"pubmedqa": "As an expert doctor in clinical science and medical knowledge, can you tell me if the following statement is correct? Answer yes, no, or maybe.",
"medqa": "You are a medical doctor taking the US Medical Licensing Examination. You need to demonstrate your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy. Show your ability to apply the knowledge essential for medical practice. For the following multiple-choice question, select one correct answer from A to E. Base your answer on the current and standard practices referenced in medical guidelines.",
"medmcqa": "You are a medical doctor answering realworld medical entrance exam questions. Based on your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy, answer the following multiple-choice question. Select one correct answer from A to D. Base your answer on the current and standard practices referenced in medical guidelines.",
}


def infer_pubmedqa(model, tokenizer, batch_size, run_name):
name = "pubmedqa"
answer_type = "boolean"
dataset = datasets.load_dataset(
"bigbio/pubmed_qa",
"pubmed_qa_labeled_fold0_source",
split="test",
trust_remote_code=True,
)
# Post process
instruction = INSTRUCTIONS[name]

def post_process(row):
context = "\n".join(row["CONTEXTS"])
row["prompt"] = f"{context}\n{row['QUESTION']}"
row["gold"] = row["final_decision"]
row["long_answer"] = row["LONG_ANSWER"]
row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
return row

dataset = dataset.map(post_process)

# Generate results
generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def infer_medqa(model, tokenizer, batch_size, run_name):
name = "medqa"
answer_type = "mcq"
dataset = datasets.load_dataset(
"bigbio/med_qa",
"med_qa_en_4options_source",
split="test",
trust_remote_code=True,
)

# Post process
instruction = INSTRUCTIONS[name]

def post_process(row):
choices = [opt["value"] for opt in row["options"]]
row["prompt"] = format_example(row["question"], choices)
for opt in row["options"]:
if opt["value"] == row["answer"]:
row["gold"] = opt["key"]
break
row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
return row

dataset = dataset.map(post_process)

# Generate results
generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def infer_medmcqa(model, tokenizer, batch_size, run_name):
name = "medmcqa"
answer_type = "mcq"
dataset = datasets.load_dataset(
"medmcqa", split="validation", trust_remote_code=True
)

# Post process
instruction = INSTRUCTIONS[name]

def post_process(row):
options = [row["opa"], row["opb"], row["opc"], row["opd"]]
answer = int(row["cop"])
row["prompt"] = format_example(row["question"], options)
row["gold"] = chr(ord("A") + answer) if answer in [0, 1, 2, 3] else None
row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
return row

dataset = dataset.map(post_process)

# Generate results
generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type)


def generate_results(
name, run_name, dataset, model, tokenizer, batch_size, answer_type
):
# Run inference
prediction = inference(dataset, model, tokenizer, batch_size)

# Calculate accuracy
acc = accuracy_compute(prediction, answer_type)

# Save results and generations
save_results(name, run_name, prediction, acc)


def inference(dataset, model, tokenizer, batch_size):
columns_process = ["prompt", "gold"]
dataset_process = pd.DataFrame(dataset, columns=dataset.features)[columns_process]
dataset_process = dataset_process.assign(output="Null")
temperature = 1.0

inference_data = json.loads(dataset_process.to_json(orient="records"))
data_loader = DataLoader(inference_data, batch_size=batch_size, shuffle=False)

batch_counter = 0
for batch in tqdm(data_loader, total=len(data_loader), position=0, leave=True):
prompts = [
f"<|im_start|>question\n{prompt}<|im_end|>\n<|im_start|>answer\n"
for prompt in batch["prompt"]
]
if batch_counter == 0:
print(prompts[0])

# Process tokenizer
stop_seq = ["###"]
if tokenizer.eos_token is not None:
stop_seq.append(tokenizer.eos_token)
if tokenizer.pad_token is not None:
stop_seq.append(tokenizer.pad_token)
max_new_tokens = len(
tokenizer(batch["gold"][0], add_special_tokens=False)["input_ids"]
)

outputs = []
for prompt in prompts:
input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
output_ids = model.generate(
inputs=input_ids,
max_new_tokens=max_new_tokens,
do_sample=False,
top_p=1.0,
temperature=temperature,
pad_token_id=tokenizer.eos_token_id,
)
output_ids = output_ids[0][len(input_ids[0]) :]
output = tokenizer.decode(output_ids, skip_special_tokens=True)
outputs.append(output)

for prompt, out in zip(batch["prompt"], outputs):
dataset_process.loc[dataset_process["prompt"] == prompt, "output"] = out
batch_counter += 1

return dataset_process


def accuracy_compute(dataset, answer_type):
dataset = json.loads(dataset.to_json(orient="records"))
preds, golds = [], []
for row in dataset:
answer = row["gold"].lower()
output = row["output"].lower()
pred, gold = format_answer(output, answer, answer_type=answer_type)
preds.append(pred)
golds.append(gold)

accuracy = accuracy_score(preds, golds)

return accuracy
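
For context, a minimal usage sketch (not part of this commit): `eval.py` presumably loads the model and tokenizer and dispatches to these `infer_*` helpers, but since `eval.py` and `utils.py` are not included in this diff, the base-model name and the loading code below are assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from benchmarks import infer_pubmedqa

base_model = "mistralai/Mistral-7B-v0.3"  # hypothetical base model

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
model = model.to("cuda")  # the inference loop above moves input_ids to "cuda"

# In practice the fine-tuned PEFT adapter (--peft-path) would be attached first, e.g.:
# from peft import PeftModel
# model = PeftModel.from_pretrained(model, "./peft_1")

# Runs PubMedQA inference and saves generations/accuracy via utils.save_results
infer_pubmedqa(model, tokenizer, batch_size=16, run_name="fl")
```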