
Commit

feat(benchmarks) Add LLM evaluation pipeline for Finance challenge (#3769)

Co-authored-by: jafermarq <[email protected]>
yan-gao-GY and jafermarq authored Sep 11, 2024
1 parent 5f01736 commit 4558995
Showing 5 changed files with 315 additions and 0 deletions.
40 changes: 40 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/README.md
@@ -0,0 +1,40 @@
# Evaluation for Finance challenge

We build a sentiment classification pipeline on finance-related text to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [FPB](https://huggingface.co/datasets/takala/financial_phrasebank), [FIQA](https://huggingface.co/datasets/pauri32/fiqa-2018), and [TFNS](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment).


## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/finance ./flowertune-eval-finance && rm -rf flower && cd flowertune-eval-finance
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```
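
If you want to verify that the `datasets` dependency and dataset downloads work before running the full evaluation, a minimal sanity check (illustrative only, not part of the pipeline) is:

```python
# Load the TFNS validation split that the evaluation uses and print one example
from datasets import load_dataset

tfns = load_dataset("zeroshot/twitter-financial-news-sentiment", split="validation")
print(len(tfns), "examples")
print(tfns[0])  # fields include the tweet text and an integer sentiment label
```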

## Generate model decisions & calculate accuracy

> [!NOTE]
> Please ensure that you use `quantization=4` to run the evaluation if you wish to participate in the LLM Leaderboard.
```bash
# --peft-path is the directory of the fine-tuned PEFT model (e.g., ./peft_1);
# --run-name is the name used to tag this run's output files.
python eval.py \
--peft-path=/path/to/fine-tuned-peft-model-dir/ \
--run-name=fl \
--batch-size=32 \
--quantization=4 \
--datasets=fpb,fiqa,tfns
```

The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.
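
For example, with `--run-name=fl` and all three datasets evaluated, a small hypothetical helper such as the one below can collect the saved accuracy values for your submission (file names follow the pattern above):

```python
# Illustrative only: print the saved accuracy for each evaluation dataset
from pathlib import Path

for name in ["fpb", "fiqa", "tfns"]:
    acc_file = Path("benchmarks") / f"acc_{name}_fl.txt"  # run_name "fl" assumed
    if acc_file.exists():
        # File content looks like "Accuracy: <value>. "
        print(name, "->", acc_file.read_text().strip())
    else:
        print(name, "-> no result found")
```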

> [!NOTE]
> Please ensure that you provide all **three accuracy values (FPB, FIQA, TFNS)** for three evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
135 changes: 135 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/benchmarks.py
@@ -0,0 +1,135 @@
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from utils import (
add_instruct,
change_target,
format_example,
generate_label,
load_data,
save_results,
)


def infer_fiqa(model, tokenizer, batch_size, run_name):
name = "fiqa"
dataset = load_data("pauri32/fiqa-2018", concat=True)

# Post process
dataset["output"] = dataset.sentiment_score.apply(generate_label)
dataset["instruction"] = dataset.apply(add_instruct, axis=1)
dataset = dataset[["sentence", "output", "instruction"]]
dataset.columns = ["input", "output", "instruction"]

dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

# Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def infer_fpb(model, tokenizer, batch_size, run_name):
name = "fpb"
dataset = load_data("takala/financial_phrasebank", "sentences_50agree")

# Post process
dataset.columns = ["input", "output"]
dic = {0: "negative", 1: "neutral", 2: "positive"}
dataset["output"] = dataset["output"].apply(lambda x: dic[x])

dataset["instruction"] = (
"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
)
dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

# Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def infer_tfns(model, tokenizer, batch_size, run_name):
name = "tfns"
dataset = load_data(
"zeroshot/twitter-financial-news-sentiment", valid_set="validation"
)

# Post process
dic = {0: "negative", 1: "positive", 2: "neutral"}
dataset["label"] = dataset["label"].apply(lambda x: dic[x])

dataset["instruction"] = (
"What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."
)

dataset.columns = ["input", "output", "instruction"]
dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

    # Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def inference(dataset, model, tokenizer, batch_size):
context = dataset["context"].tolist()

    # Final partial batch size and total number of batches (the last step may be
    # empty when the dataset size is an exact multiple of batch_size)
    last_batch = dataset.shape[0] % batch_size
    total_steps = dataset.shape[0] // batch_size + 1
print(
f"Total len: {len(context)}. Batch size: {batch_size}. Total steps: {total_steps}"
)

out_text_list = []
for i in tqdm(range(total_steps)):
idx_s = i * batch_size
tmp_context = (
context[idx_s : idx_s + last_batch]
if i == total_steps - 1
else context[idx_s : idx_s + batch_size]
)

if tmp_context:
tokens = tokenizer(
tmp_context,
return_tensors="pt",
padding=True,
max_length=512,
return_token_type_ids=False,
)
for k in tokens.keys():
tokens[k] = tokens[k].cuda()
res = model.generate(
**tokens, max_length=512, eos_token_id=tokenizer.eos_token_id
)
res_sentences = [tokenizer.decode(i, skip_special_tokens=True) for i in res]
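            # Prompts end with "Answer: " (see format_example), so keep only the generated text after it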
out_text = [o.split("Answer: ")[1] for o in res_sentences]
out_text_list += out_text
torch.cuda.empty_cache()

dataset["out_text"] = out_text_list
dataset["new_target"] = dataset["target"].apply(change_target)
dataset["new_out"] = dataset["out_text"].apply(change_target)

acc = accuracy_score(dataset["new_target"], dataset["new_out"])

return dataset, acc
64 changes: 64 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/eval.py
@@ -0,0 +1,64 @@
import argparse

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from benchmarks import infer_fiqa, infer_fpb, infer_tfns

# Fixed seed
torch.manual_seed(2024)

parser = argparse.ArgumentParser()
parser.add_argument(
"--base-model-name-path", type=str, default="mistralai/Mistral-7B-v0.3"
)
parser.add_argument("--run-name", type=str, default="fl")
parser.add_argument("--peft-path", type=str, default=None)
parser.add_argument("--datasets", type=str, default="fpb")
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument("--quantization", type=int, default=4)
args = parser.parse_args()


# Load model and tokenizer
if args.quantization == 4:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_dtype = torch.float32
elif args.quantization == 8:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
torch_dtype = torch.float16
else:
raise ValueError(
f"Use 4-bit or 8-bit quantization. You passed: {args.quantization}/"
)

model = AutoModelForCausalLM.from_pretrained(
args.base_model_name_path,
quantization_config=quantization_config,
torch_dtype=torch_dtype,
)
if args.peft_path is not None:
model = PeftModel.from_pretrained(
model, args.peft_path, torch_dtype=torch_dtype
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_path)

# Batched generation needs a pad token distinct from EOS; add one and resize embeddings
if not tokenizer.pad_token or tokenizer.pad_token_id == tokenizer.eos_token_id:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
model.resize_token_embeddings(len(tokenizer))


# Evaluate
model = model.eval()
with torch.no_grad():
for dataset in args.datasets.split(","):
if dataset == "fpb":
infer_fpb(model, tokenizer, args.batch_size, args.run_name)
elif dataset == "fiqa":
infer_fiqa(model, tokenizer, args.batch_size, args.run_name)
elif dataset == "tfns":
infer_tfns(model, tokenizer, args.batch_size, args.run_name)
else:
raise ValueError("Undefined Dataset.")
6 changes: 6 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/requirements.txt
@@ -0,0 +1,6 @@
peft==0.6.2
scikit-learn==1.5.0
datasets==2.20.0
sentencepiece==0.2.0
protobuf==5.27.1
bitsandbytes==0.43.1
70 changes: 70 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/utils.py
@@ -0,0 +1,70 @@
import os

import datasets
from datasets import Dataset


def load_data(dataset_path, name=None, concat=False, valid_set=None):
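    # Concatenate all splits (FIQA), select a named validation split (TFNS), or take
    # the train split (FPB); unless valid_set is given, a fixed 25% test split is held out.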
dataset = datasets.load_dataset(dataset_path, name, trust_remote_code=True)

if concat:
dataset = datasets.concatenate_datasets(
[dataset["train"], dataset["validation"], dataset["test"]]
)

if valid_set:
dataset = dataset[valid_set]
else:
dataset = dataset if concat else dataset["train"]
dataset = dataset.train_test_split(0.25, seed=42)["test"]

dataset = dataset.to_pandas()

return dataset


def format_example(example: dict):
context = f"Instruction: {example['instruction']}\n"
if example.get("input"):
context += f"Input: {example['input']}\n"
context += "Answer: "
target = example["output"]
return {"context": context, "target": target}


def generate_label(value):
return "negative" if value < -0.1 else "neutral" if value < 0.1 else "positive"


def add_instruct(content):
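    # The FIQA "format" column marks tweets as "post"; everything else is treated as news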
tag = "tweet" if content.format == "post" else "news"
return f"What is the sentiment of this {tag}? Please choose an answer from {{negative/neutral/positive}}."


def change_target(x):
if "positive" in x or "Positive" in x:
return "positive"
elif "negative" in x or "Negative" in x:
return "negative"
else:
return "neutral"


def save_results(dataset_name, run_name, dataset, acc):
path = "./benchmarks/"
if not os.path.exists(path):
os.makedirs(path)

# Save results
results_path = os.path.join(path, f"acc_{dataset_name}_{run_name}.txt")
with open(results_path, "w") as f:
f.write(f"Accuracy: {acc}. ")
print(f"Accuracy: {acc}. ")

# Save generations
generation_path = os.path.join(path, f"generation_{dataset_name}_{run_name}.jsonl")
dataset = Dataset.from_pandas(dataset)
dataset = dataset.remove_columns(
["input", "output", "instruction", "target", "out_text"]
)
dataset.to_json(generation_path, orient="records")
