From ec760b0231705129d663da67bd9edd9bc141d2e1 Mon Sep 17 00:00:00 2001
From: Yan Gao
Date: Mon, 9 Sep 2024 18:20:07 +0100
Subject: [PATCH] feat(benchmarks) Add LLM evaluation pipeline for Medical
 challenge (#3768)

Co-authored-by: jafermarq
---
 .../evaluation/medical/README.md              |  38 ++++
 .../evaluation/medical/benchmarks.py          | 174 ++++++++++++++++++
 .../flowertune-llm/evaluation/medical/eval.py |  62 +++++++
 .../evaluation/medical/requirements.txt       |   7 +
 .../evaluation/medical/utils.py               |  81 ++++++++
 5 files changed, 362 insertions(+)
 create mode 100644 benchmarks/flowertune-llm/evaluation/medical/README.md
 create mode 100644 benchmarks/flowertune-llm/evaluation/medical/benchmarks.py
 create mode 100644 benchmarks/flowertune-llm/evaluation/medical/eval.py
 create mode 100644 benchmarks/flowertune-llm/evaluation/medical/requirements.txt
 create mode 100644 benchmarks/flowertune-llm/evaluation/medical/utils.py

diff --git a/benchmarks/flowertune-llm/evaluation/medical/README.md b/benchmarks/flowertune-llm/evaluation/medical/README.md
new file mode 100644
index 000000000000..78de069460d8
--- /dev/null
+++ b/benchmarks/flowertune-llm/evaluation/medical/README.md
@@ -0,0 +1,38 @@
+# Evaluation for Medical challenge
+
+We build a medical question answering (QA) pipeline to evaluate our fine-tuned LLMs.
+Three datasets have been selected for this evaluation: [PubMedQA](https://huggingface.co/datasets/bigbio/pubmed_qa), [MedMCQA](https://huggingface.co/datasets/medmcqa), and [MedQA](https://huggingface.co/datasets/bigbio/med_qa).
+
+
+## Environment Setup
+
+```shell
+git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/medical ./flowertune-eval-medical && rm -rf flower && cd flowertune-eval-medical
+```
+
+Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:
+
+```shell
+# From a new Python environment, run:
+pip install -r requirements.txt
+
+# Log in to your Hugging Face account
+huggingface-cli login
+```
+
+## Generate model decisions & calculate accuracy
+
+```bash
+python eval.py \
+--peft-path=/path/to/fine-tuned-peft-model-dir/ \ # e.g., ./peft_1
+--run-name=fl \ # specified name for this run
+--batch-size=16 \
+--quantization=4 \
+--datasets=pubmedqa,medmcqa,medqa
+```
+
+The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.
+
+
+> [!NOTE]
+> Please ensure that you provide the accuracy values for **all three evaluation datasets (PubMedQA, MedMCQA, MedQA)** when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
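Once a run finishes, the two output files named above can be inspected directly. The following is a minimal sketch (not part of this patch), assuming a completed run with `--datasets=pubmedqa` and the default run name `fl`; the file names simply follow the pattern documented in the README.

```python
# Minimal sketch (not part of this patch): load the outputs written by eval.py.
# Assumes a finished run with --datasets=pubmedqa and --run-name=fl.
import pandas as pd

# save_results() writes the generations with DataFrame.to_json(orient="records"),
# i.e. a JSON array of {prompt, gold, output} records.
generations = pd.read_json("benchmarks/generation_pubmedqa_fl.jsonl")
print(generations[["prompt", "gold", "output"]].head())

# The accuracy file holds a single line of the form "Accuracy: <value>. "
with open("benchmarks/acc_pubmedqa_fl.txt") as f:
    print(f.read())
```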
diff --git a/benchmarks/flowertune-llm/evaluation/medical/benchmarks.py b/benchmarks/flowertune-llm/evaluation/medical/benchmarks.py
new file mode 100644
index 000000000000..c72e2a7894da
--- /dev/null
+++ b/benchmarks/flowertune-llm/evaluation/medical/benchmarks.py
@@ -0,0 +1,174 @@
+import json
+
+import pandas as pd
+from sklearn.metrics import accuracy_score
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from utils import format_answer, format_example, save_results
+
+import datasets
+
+# The instructions refer to Meditron evaluation:
+# https://github.com/epfLLM/meditron/blob/main/evaluation/instructions.json
+INSTRUCTIONS = {
+    "pubmedqa": "As an expert doctor in clinical science and medical knowledge, can you tell me if the following statement is correct? Answer yes, no, or maybe.",
+    "medqa": "You are a medical doctor taking the US Medical Licensing Examination. You need to demonstrate your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy. Show your ability to apply the knowledge essential for medical practice. For the following multiple-choice question, select one correct answer from A to E. Base your answer on the current and standard practices referenced in medical guidelines.",
+    "medmcqa": "You are a medical doctor answering realworld medical entrance exam questions. Based on your understanding of basic and clinical science, medical knowledge, and mechanisms underlying health, disease, patient care, and modes of therapy, answer the following multiple-choice question. Select one correct answer from A to D. Base your answer on the current and standard practices referenced in medical guidelines.",
+}
+
+
+def infer_pubmedqa(model, tokenizer, batch_size, run_name):
+    name = "pubmedqa"
+    answer_type = "boolean"
+    dataset = datasets.load_dataset(
+        "bigbio/pubmed_qa",
+        "pubmed_qa_labeled_fold0_source",
+        split="test",
+        trust_remote_code=True,
+    )
+    # Post process
+    instruction = INSTRUCTIONS[name]
+
+    def post_process(row):
+        context = "\n".join(row["CONTEXTS"])
+        row["prompt"] = f"{context}\n{row['QUESTION']}"
+        row["gold"] = row["final_decision"]
+        row["long_answer"] = row["LONG_ANSWER"]
+        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
+        return row
+
+    dataset = dataset.map(post_process)
+
+    # Generate results
+    generate_results(
+        name, run_name, dataset, model, tokenizer, batch_size, answer_type
+    )
+
+
+def infer_medqa(model, tokenizer, batch_size, run_name):
+    name = "medqa"
+    answer_type = "mcq"
+    dataset = datasets.load_dataset(
+        "bigbio/med_qa",
+        "med_qa_en_4options_source",
+        split="test",
+        trust_remote_code=True,
+    )
+
+    # Post process
+    instruction = INSTRUCTIONS[name]
+
+    def post_process(row):
+        choices = [opt["value"] for opt in row["options"]]
+        row["prompt"] = format_example(row["question"], choices)
+        for opt in row["options"]:
+            if opt["value"] == row["answer"]:
+                row["gold"] = opt["key"]
+                break
+        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
+        return row
+
+    dataset = dataset.map(post_process)
+
+    # Generate results
+    generate_results(
+        name, run_name, dataset, model, tokenizer, batch_size, answer_type
+    )
+
+
+def infer_medmcqa(model, tokenizer, batch_size, run_name):
+    name = "medmcqa"
+    answer_type = "mcq"
+    dataset = datasets.load_dataset(
+        "medmcqa", split="validation", trust_remote_code=True
+    )
+
+    # Post process
+    instruction = INSTRUCTIONS[name]
+
+    def post_process(row):
row["opc"], row["opd"]] + answer = int(row["cop"]) + row["prompt"] = format_example(row["question"], options) + row["gold"] = chr(ord("A") + answer) if answer in [0, 1, 2, 3] else None + row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n" + return row + + dataset = dataset.map(post_process) + + # Generate results + generate_results(name, run_name, dataset, model, tokenizer, batch_size, answer_type) + + +def generate_results( + name, run_name, dataset, model, tokenizer, batch_size, answer_type +): + # Run inference + prediction = inference(dataset, model, tokenizer, batch_size) + + # Calculate accuracy + acc = accuracy_compute(prediction, answer_type) + + # Save results and generations + save_results(name, run_name, prediction, acc) + + +def inference(dataset, model, tokenizer, batch_size): + columns_process = ["prompt", "gold"] + dataset_process = pd.DataFrame(dataset, columns=dataset.features)[columns_process] + dataset_process = dataset_process.assign(output="Null") + temperature = 1.0 + + inference_data = json.loads(dataset_process.to_json(orient="records")) + data_loader = DataLoader(inference_data, batch_size=batch_size, shuffle=False) + + batch_counter = 0 + for batch in tqdm(data_loader, total=len(data_loader), position=0, leave=True): + prompts = [ + f"<|im_start|>question\n{prompt}<|im_end|>\n<|im_start|>answer\n" + for prompt in batch["prompt"] + ] + if batch_counter == 0: + print(prompts[0]) + + # Process tokenizer + stop_seq = ["###"] + if tokenizer.eos_token is not None: + stop_seq.append(tokenizer.eos_token) + if tokenizer.pad_token is not None: + stop_seq.append(tokenizer.pad_token) + max_new_tokens = len( + tokenizer(batch["gold"][0], add_special_tokens=False)["input_ids"] + ) + + outputs = [] + for prompt in prompts: + input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda") + output_ids = model.generate( + inputs=input_ids, + max_new_tokens=max_new_tokens, + do_sample=False, + top_p=1.0, + temperature=temperature, + pad_token_id=tokenizer.eos_token_id, + ) + output_ids = output_ids[0][len(input_ids[0]) :] + output = tokenizer.decode(output_ids, skip_special_tokens=True) + outputs.append(output) + + for prompt, out in zip(batch["prompt"], outputs): + dataset_process.loc[dataset_process["prompt"] == prompt, "output"] = out + batch_counter += 1 + + return dataset_process + + +def accuracy_compute(dataset, answer_type): + dataset = json.loads(dataset.to_json(orient="records")) + preds, golds = [], [] + for row in dataset: + answer = row["gold"].lower() + output = row["output"].lower() + pred, gold = format_answer(output, answer, answer_type=answer_type) + preds.append(pred) + golds.append(gold) + + accuracy = accuracy_score(preds, golds) + + return accuracy diff --git a/benchmarks/flowertune-llm/evaluation/medical/eval.py b/benchmarks/flowertune-llm/evaluation/medical/eval.py new file mode 100644 index 000000000000..7405e1493e4d --- /dev/null +++ b/benchmarks/flowertune-llm/evaluation/medical/eval.py @@ -0,0 +1,62 @@ +import argparse + +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +from benchmarks import infer_medmcqa, infer_medqa, infer_pubmedqa + +# Fixed seed +torch.manual_seed(2024) + +parser = argparse.ArgumentParser() +parser.add_argument( + "--base-model-name-path", type=str, default="mistralai/Mistral-7B-v0.3" +) +parser.add_argument("--run-name", type=str, default="fl") +parser.add_argument("--peft-path", type=str, default=None) +parser.add_argument( + 
"--datasets", + type=str, + default="pubmedqa", + help="The dataset to infer on: [pubmedqa, medqa, medmcqa]", +) +parser.add_argument("--batch-size", type=int, default=16) +parser.add_argument("--quantization", type=int, default=4) +args = parser.parse_args() + + +# Load model and tokenizer +if args.quantization == 4: + quantization_config = BitsAndBytesConfig(load_in_4bit=True) + torch_dtype = torch.float32 +elif args.quantization == 8: + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + torch_dtype = torch.float16 +else: + raise ValueError( + f"Use 4-bit or 8-bit quantization. You passed: {args.quantization}/" + ) + +model = AutoModelForCausalLM.from_pretrained( + args.base_model_name_path, + quantization_config=quantization_config, + torch_dtype=torch_dtype, +) +if args.peft_path is not None: + model = PeftModel.from_pretrained( + model, args.peft_path, torch_dtype=torch_dtype + ).to("cuda") + +tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_path) + +# Evaluate +for dataset in args.datasets.split(","): + if dataset == "pubmedqa": + infer_pubmedqa(model, tokenizer, args.batch_size, args.run_name) + elif dataset == "medqa": + infer_medqa(model, tokenizer, args.batch_size, args.run_name) + elif dataset == "medmcqa": + infer_medmcqa(model, tokenizer, args.batch_size, args.run_name) + else: + raise ValueError("Undefined Dataset.") diff --git a/benchmarks/flowertune-llm/evaluation/medical/requirements.txt b/benchmarks/flowertune-llm/evaluation/medical/requirements.txt new file mode 100644 index 000000000000..adfc8b0c59db --- /dev/null +++ b/benchmarks/flowertune-llm/evaluation/medical/requirements.txt @@ -0,0 +1,7 @@ +peft==0.6.2 +pandas==2.2.2 +scikit-learn==1.5.0 +datasets==2.20.0 +sentencepiece==0.2.0 +protobuf==5.27.1 +bitsandbytes==0.43.1 diff --git a/benchmarks/flowertune-llm/evaluation/medical/utils.py b/benchmarks/flowertune-llm/evaluation/medical/utils.py new file mode 100644 index 000000000000..44d0763d39d4 --- /dev/null +++ b/benchmarks/flowertune-llm/evaluation/medical/utils.py @@ -0,0 +1,81 @@ +import os +import re + + +def format_example(question, choices): + if not question.endswith("?") and not question.endswith("."): + question += "?" + options_str = "\n".join([f"{chr(65+i)}. {choices[i]}" for i in range(len(choices))]) + prompt = "Question: " + question + "\n\nOptions:\n" + options_str + return prompt + + +def save_results(dataset_name, run_name, dataset, acc): + path = "./benchmarks/" + if not os.path.exists(path): + os.makedirs(path) + + # Save results + results_path = os.path.join(path, f"acc_{dataset_name}_{run_name}.txt") + with open(results_path, "w") as f: + f.write(f"Accuracy: {acc}. ") + print(f"Accuracy: {acc}. 
") + + # Save generations + generation_path = os.path.join(path, f"generation_{dataset_name}_{run_name}.jsonl") + dataset.to_json(generation_path, orient="records") + + +def format_answer(output_full, answer, answer_type="mcq"): + output = output_full + default = (output_full, answer) + if "\n##" in output: + try: + output = output.split("\n##")[1].split("\n")[0].strip().lower() + except Exception: + return default + if "###" in answer: + try: + answer = answer.split("answer is:")[1].split("###")[0].strip() + except Exception: + return default + + output = re.sub(r"[^a-zA-Z0-9]", " ", output).strip() + output = re.sub(" +", " ", output) + + if answer_type == "boolean": + output = clean_boolean_answer(output) + elif answer_type == "mcq": + output = clean_mcq_answer(output) + + if output in ["a", "b", "c", "d", "e", "yes", "no"]: + return output, answer + else: + return default + + +def clean_mcq_answer(output): + output = clean_answer(output) + try: + output = output[0] + except Exception: + return output + return output + + +def clean_boolean_answer(output): + if "yesyes" in output: + output = output.replace("yesyes", "yes") + elif "nono" in output: + output = output.replace("nono", "no") + elif "yesno" in output: + output = output.replace("yesno", "yes") + elif "noyes" in output: + output = output.replace("noyes", "no") + output = clean_answer(output) + return output + + +def clean_answer(output): + output_clean = output.encode("ascii", "ignore").decode("ascii") + return output_clean