
Commit

feat(benchmarks) Add LLM evaluation pipeline for Finance challenge (#3769)

Co-authored-by: jafermarq <[email protected]>
yan-gao-GY and jafermarq authored Sep 11, 2024
1 parent 5f01736 commit 4558995
Showing 5 changed files with 315 additions and 0 deletions.
40 changes: 40 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/README.md
@@ -0,0 +1,40 @@
# Evaluation for Finance challenge

We build a sentiment classification pipeline on finance-related text to evaluate our fine-tuned LLMs.
Three datasets have been selected for this evaluation: [FPB](https://huggingface.co/datasets/takala/financial_phrasebank), [FIQA](https://huggingface.co/datasets/pauri32/fiqa-2018), and [TFNS](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment).


## Environment Setup

```shell
git clone --depth=1 https://github.com/adap/flower.git && mv flower/benchmarks/flowertune-llm/evaluation/finance ./flowertune-eval-finance && rm -rf flower && cd flowertune-eval-finance
```

Create a new Python environment (we recommend Python 3.10), activate it, then install dependencies with:

```shell
# From a new python environment, run:
pip install -r requirements.txt

# Log in to your Hugging Face account
huggingface-cli login
```
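
If you want to verify that the `datasets` dependency and dataset downloads work before running the full evaluation, a minimal sanity check (illustrative only, not part of the pipeline) is:

```python
# Load the TFNS validation split that the evaluation uses and print one example
from datasets import load_dataset

tfns = load_dataset("zeroshot/twitter-financial-news-sentiment", split="validation")
print(len(tfns), "examples")
print(tfns[0])  # fields include the tweet text and an integer sentiment label
```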

## Generate model decisions & calculate accuracy

> [!NOTE]
> Please ensure that you use `quantization=4` to run the evaluation if you wish to participate in the LLM Leaderboard.
```bash
# --peft-path is the directory of the fine-tuned PEFT model (e.g., ./peft_1);
# --run-name is the name used to tag this run's output files.
python eval.py \
--peft-path=/path/to/fine-tuned-peft-model-dir/ \
--run-name=fl \
--batch-size=32 \
--quantization=4 \
--datasets=fpb,fiqa,tfns
```

The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{run_name}.txt`, respectively.
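
For example, with `--run-name=fl` and all three datasets evaluated, a small hypothetical helper such as the one below can collect the saved accuracy values for your submission (file names follow the pattern above):

```python
# Illustrative only: print the saved accuracy for each evaluation dataset
from pathlib import Path

for name in ["fpb", "fiqa", "tfns"]:
    acc_file = Path("benchmarks") / f"acc_{name}_fl.txt"  # run_name "fl" assumed
    if acc_file.exists():
        # File content looks like "Accuracy: <value>. "
        print(name, "->", acc_file.read_text().strip())
    else:
        print(name, "-> no result found")
```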

> [!NOTE]
> Please ensure that you provide all **three accuracy values (FPB, FIQA, TFNS)** for three evaluation datasets when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
135 changes: 135 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/benchmarks.py
@@ -0,0 +1,135 @@
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from utils import (
add_instruct,
change_target,
format_example,
generate_label,
load_data,
save_results,
)


def infer_fiqa(model, tokenizer, batch_size, run_name):
name = "fiqa"
dataset = load_data("pauri32/fiqa-2018", concat=True)

# Post process
dataset["output"] = dataset.sentiment_score.apply(generate_label)
dataset["instruction"] = dataset.apply(add_instruct, axis=1)
dataset = dataset[["sentence", "output", "instruction"]]
dataset.columns = ["input", "output", "instruction"]

dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

# Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def infer_fpb(model, tokenizer, batch_size, run_name):
name = "fpb"
dataset = load_data("takala/financial_phrasebank", "sentences_50agree")

# Post process
dataset.columns = ["input", "output"]
dic = {0: "negative", 1: "neutral", 2: "positive"}
dataset["output"] = dataset["output"].apply(lambda x: dic[x])

dataset["instruction"] = (
"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
)
dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

# Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def infer_tfns(model, tokenizer, batch_size, run_name):
name = "tfns"
dataset = load_data(
"zeroshot/twitter-financial-news-sentiment", valid_set="validation"
)

# Post process
dic = {0: "negative", 1: "positive", 2: "neutral"}
dataset["label"] = dataset["label"].apply(lambda x: dic[x])

dataset["instruction"] = (
"What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."
)

dataset.columns = ["input", "output", "instruction"]
dataset[["context", "target"]] = dataset.apply(
format_example, axis=1, result_type="expand"
)

    # Print example
print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n")

# Run inference
dataset, acc = inference(dataset, model, tokenizer, batch_size)

# Save results and generations
save_results(name, run_name, dataset, acc)


def inference(dataset, model, tokenizer, batch_size):
context = dataset["context"].tolist()

    # Final partial batch size and total number of batches (the last step may be
    # empty when the dataset size is an exact multiple of batch_size)
    last_batch = dataset.shape[0] % batch_size
    total_steps = dataset.shape[0] // batch_size + 1
print(
f"Total len: {len(context)}. Batch size: {batch_size}. Total steps: {total_steps}"
)

out_text_list = []
for i in tqdm(range(total_steps)):
idx_s = i * batch_size
tmp_context = (
context[idx_s : idx_s + last_batch]
if i == total_steps - 1
else context[idx_s : idx_s + batch_size]
)

if tmp_context:
tokens = tokenizer(
tmp_context,
return_tensors="pt",
padding=True,
max_length=512,
return_token_type_ids=False,
)
for k in tokens.keys():
tokens[k] = tokens[k].cuda()
res = model.generate(
**tokens, max_length=512, eos_token_id=tokenizer.eos_token_id
)
res_sentences = [tokenizer.decode(i, skip_special_tokens=True) for i in res]
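            # Prompts end with "Answer: " (see format_example), so keep only the generated text after it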
out_text = [o.split("Answer: ")[1] for o in res_sentences]
out_text_list += out_text
torch.cuda.empty_cache()

dataset["out_text"] = out_text_list
dataset["new_target"] = dataset["target"].apply(change_target)
dataset["new_out"] = dataset["out_text"].apply(change_target)

acc = accuracy_score(dataset["new_target"], dataset["new_out"])

return dataset, acc
64 changes: 64 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/eval.py
@@ -0,0 +1,64 @@
import argparse

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from benchmarks import infer_fiqa, infer_fpb, infer_tfns

# Fixed seed
torch.manual_seed(2024)

parser = argparse.ArgumentParser()
parser.add_argument(
"--base-model-name-path", type=str, default="mistralai/Mistral-7B-v0.3"
)
parser.add_argument("--run-name", type=str, default="fl")
parser.add_argument("--peft-path", type=str, default=None)
parser.add_argument("--datasets", type=str, default="fpb")
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument("--quantization", type=int, default=4)
args = parser.parse_args()


# Load model and tokenizer
if args.quantization == 4:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
torch_dtype = torch.float32
elif args.quantization == 8:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
torch_dtype = torch.float16
else:
raise ValueError(
f"Use 4-bit or 8-bit quantization. You passed: {args.quantization}/"
)

model = AutoModelForCausalLM.from_pretrained(
args.base_model_name_path,
quantization_config=quantization_config,
torch_dtype=torch_dtype,
)
if args.peft_path is not None:
model = PeftModel.from_pretrained(
model, args.peft_path, torch_dtype=torch_dtype
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_path)

# Batched generation needs a pad token distinct from EOS; add one and resize embeddings
if not tokenizer.pad_token or tokenizer.pad_token_id == tokenizer.eos_token_id:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
model.resize_token_embeddings(len(tokenizer))


# Evaluate
model = model.eval()
with torch.no_grad():
for dataset in args.datasets.split(","):
if dataset == "fpb":
infer_fpb(model, tokenizer, args.batch_size, args.run_name)
elif dataset == "fiqa":
infer_fiqa(model, tokenizer, args.batch_size, args.run_name)
elif dataset == "tfns":
infer_tfns(model, tokenizer, args.batch_size, args.run_name)
else:
raise ValueError("Undefined Dataset.")
6 changes: 6 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/requirements.txt
@@ -0,0 +1,6 @@
peft==0.6.2
scikit-learn==1.5.0
datasets==2.20.0
sentencepiece==0.2.0
protobuf==5.27.1
bitsandbytes==0.43.1
70 changes: 70 additions & 0 deletions benchmarks/flowertune-llm/evaluation/finance/utils.py
@@ -0,0 +1,70 @@
import os

import datasets
from datasets import Dataset


def load_data(dataset_path, name=None, concat=False, valid_set=None):
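    # Concatenate all splits (FIQA), select a named validation split (TFNS), or take
    # the train split (FPB); unless valid_set is given, a fixed 25% test split is held out.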
dataset = datasets.load_dataset(dataset_path, name, trust_remote_code=True)

if concat:
dataset = datasets.concatenate_datasets(
[dataset["train"], dataset["validation"], dataset["test"]]
)

if valid_set:
dataset = dataset[valid_set]
else:
dataset = dataset if concat else dataset["train"]
dataset = dataset.train_test_split(0.25, seed=42)["test"]

dataset = dataset.to_pandas()

return dataset


def format_example(example: dict):
context = f"Instruction: {example['instruction']}\n"
if example.get("input"):
context += f"Input: {example['input']}\n"
context += "Answer: "
target = example["output"]
return {"context": context, "target": target}


def generate_label(value):
return "negative" if value < -0.1 else "neutral" if value < 0.1 else "positive"


def add_instruct(content):
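    # The FIQA "format" column marks tweets as "post"; everything else is treated as news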
tag = "tweet" if content.format == "post" else "news"
return f"What is the sentiment of this {tag}? Please choose an answer from {{negative/neutral/positive}}."


def change_target(x):
if "positive" in x or "Positive" in x:
return "positive"
elif "negative" in x or "Negative" in x:
return "negative"
else:
return "neutral"


def save_results(dataset_name, run_name, dataset, acc):
path = "./benchmarks/"
if not os.path.exists(path):
os.makedirs(path)

# Save results
results_path = os.path.join(path, f"acc_{dataset_name}_{run_name}.txt")
with open(results_path, "w") as f:
f.write(f"Accuracy: {acc}. ")
print(f"Accuracy: {acc}. ")

# Save generations
generation_path = os.path.join(path, f"generation_{dataset_name}_{run_name}.jsonl")
dataset = Dataset.from_pandas(dataset)
dataset = dataset.remove_columns(
["input", "output", "instruction", "target", "out_text"]
)
dataset.to_json(generation_path, orient="records")
