Commit

add MT_BENCH to main pipeline
Signed-off-by: sallyom <[email protected]>
sallyom committed Sep 20, 2024
1 parent 27802f0 commit 2ccfeb8
Showing 6 changed files with 457 additions and 75 deletions.
2 changes: 1 addition & 1 deletion Pipfile
@@ -10,4 +10,4 @@ click = "*"
[dev-packages]

[requires]
python_version = "3.12"
python_version = "3.11"
48 changes: 20 additions & 28 deletions Pipfile.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions eval/mt_bench/__init__.py
@@ -0,0 +1,5 @@
from .components import run_mt_bench_op, load_mt_bench_results_op, parse_output_to_dict
#from . import faked

__all__ = ["run_mt_bench_op", "load_mt_bench_results_op", "parse_output_to_dict"]

104 changes: 104 additions & 0 deletions eval/mt_bench/components.py
@@ -0,0 +1,104 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
from typing import List, NamedTuple
from kfp import dsl
from kfp.dsl import component, Input, Output, Artifact, Model, importer

INSTRUCTLAB_IMAGE = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1"
TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox"
#MODELS_INFO = "kfp-models"
#JUDGE_SECRET = "judge-server"

@component(base_image=TOOLBOX_IMAGE)
def parse_output_to_dict(output: str):
    lines = output.strip().split('\n')
    report = {}

    # Iterate through each line and extract key-value pairs
    for line in lines:
        line = line.strip()
        if line.startswith("## MODEL"):
            report["model"] = lines[lines.index(line) + 1].strip()
        elif line.startswith("### AVERAGE"):
            average_line = lines[lines.index(line) + 1].strip()
            report["average"] = float(average_line.split()[0])
            report["average_details"] = average_line
        elif line.startswith("### TURN ONE"):
            report["turn_one"] = float(lines[lines.index(line) + 1].strip())
        elif line.startswith("### TURN TWO"):
            report["turn_two"] = float(lines[lines.index(line) + 1].strip())
        elif line.startswith("### ERROR RATE"):
            report["error_rate"] = float(lines[lines.index(line) + 1].strip())

    return report


@component(base_image=INSTRUCTLAB_IMAGE)
def run_mt_bench_op(
    mt_bench_output: Output[Artifact],
    models_path_prefix: str,
    judge_model_path: str,
    models_list: List[str],
) -> NamedTuple("outputs", best_model=str, best_score=float):
    import json
    import os
    import subprocess
    import typing

    Outputs = NamedTuple("outputs", best_model=str, best_score=float)
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # ilab CLI does not support external models
    #judge_model_name = os.getenv("JUDGE_MODEL_NAME")
    #judge_endpoint = os.getenv("JUDGE_ENDPOINT")
    #judge_api_key = os.getenv("JUDGE_API_KEY", "")

    #os.environ["ILAB_MODELS_DIR"] = "{models_path_prefix}"

    scores = {}
    all_mt_bench_data = []
    for model_name in models_list:
        model_path = f"{models_path_prefix}/{model_name}"

        #ilab model evaluate --benchmark mt_bench
        # --model $ILAB_MODELS_DIR/granite-7b-lab-Q4_K_M.gguf
        # --judge-model $ILAB_MODELS_DIR/granite-7b-lab-Q4_K_M.gguf

        command = [
            "ilab", "model", "evaluate",
            "--benchmark", "mt_bench",
            "--model", model_path,
            "--judge-model", judge_model_path,
        ]
        result = subprocess.run(command, capture_output=True, text=True)

        print(result.stdout)
        print(result.stderr)
        if result.returncode == 0:
            print("MT_BENCH executed successfully!")
        else:
            print("MT_BENCH failed with return code:", result.returncode)

        output_str = result.stdout
        output_dict = parse_output_to_dict(output_str)
        overall_score = output_dict["average"]
        mt_bench_data = json.dumps(output_dict, indent=4)

        all_mt_bench_data.append(mt_bench_data)
        scores[model_path] = overall_score

    with open(mt_bench_output.path, 'w') as f:
        json.dump(all_mt_bench_data, f, indent=4)

    best_model = max(scores, key=scores.get)
    best_score = scores[best_model]
    return Outputs(best_model, best_score)

@component(base_image=TOOLBOX_IMAGE)
def load_mt_bench_results_op(mt_bench_output: Input[Artifact]) -> list:
    import json

    mt_bench_score_list = []
    with open(mt_bench_output.path, 'r') as f:
        mt_bench_score_list = json.load(f)

    print("MT_BENCH Evaluation Data:")
    for mt_bench_score in mt_bench_score_list:
        print(json.dumps(mt_bench_score, indent=4))

    return mt_bench_score_list
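
Note: parse_output_to_dict keys off headings such as "## MODEL" and "### AVERAGE" in the text printed by `ilab model evaluate --benchmark mt_bench`. A minimal sketch of the shape it expects, using a made-up report (the exact wording of the ilab output is an assumption inferred from the parser, and the direct call assumes KFP's PythonComponent exposes the wrapped function as `python_func`):

# Hypothetical MT_BENCH report text, shaped the way parse_output_to_dict expects.
sample_report = """\
## MODEL
/output/model/model/hf_format/samples_12345
### AVERAGE
7.25 (across 160 turns)
### TURN ONE
7.50
### TURN TWO
7.00
### ERROR RATE
0.0
"""

from eval.mt_bench import parse_output_to_dict

# Call the underlying Python function directly, outside any KFP pipeline context
# (assumes the @component wrapper exposes it as .python_func).
report = parse_output_to_dict.python_func(sample_report)
print(report)
# Expected: {'model': '/output/model/model/hf_format/samples_12345',
#            'average': 7.25, 'average_details': '7.25 (across 160 turns)',
#            'turn_one': 7.5, 'turn_two': 7.0, 'error_rate': 0.0}
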
58 changes: 56 additions & 2 deletions pipeline.py
@@ -12,7 +12,8 @@
)

K8S_NAME = "kfp-model-server"
MOCKED_STAGES = ["sdg", "train", "eval"]
MODELS_INFO = "kfp-models"
MOCKED_STAGES = ["sdg", "train"]


def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
@@ -46,6 +47,12 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
        pvc_to_model_op
    )

    # Imports for MMLU, MT_BENCH stage
    # TODO: Add mock/fake components
    from utils import list_models_in_directory_op
    #from eval.mmlu import run_mmlu_op, load_mmlu_results_op
    from eval.mt_bench import run_mt_bench_op, load_mt_bench_results_op

    @dsl.pipeline(
        display_name="InstructLab",
        name="instructlab",
@@ -58,6 +65,12 @@ def pipeline(
        repo_pr: Optional[int] = None,
        storage_class_name: str = "ocs-external-storagecluster-ceph-rbd",
        base_model: str = "ibm-granite/granite-7b-base",
        # minimal subset of MMLU_TASKS
        #mmlu_tasks_list: str = "mmlu_abstract_algebra,mmlu_anatomy,mmlu_astronomy",
        #model_dtype: str = "bfloat16",
        #few_shots: int = 5,
        #batch_size: int = 8,
        #device: str = None,
    ):

        # SDG stage
@@ -166,14 +179,55 @@ def pipeline(
        output_data_task = pvc_to_model_op(
            pvc_path="/output/model",
        )

        # Evaluation of models

        models_list_task = list_models_in_directory_op(
            models_folder="/output/model/model/hf_format",
        )

        models_list_task.after(kubectl_wait_task)

        mount_pvc(
            task=models_list_task, pvc_name=output_pvc_task.output, mount_path="/output/model"
        )

        # TODO: Run MMLU eval (PR #29)
        # Run training on MMLU best-model
        # Run mt_bench on candidate_models from training
        # Run final eval on best scored mt_bench candidate

        run_mt_bench_task = run_mt_bench_op(
            # TODO: make a second models_list from the 2nd phase of training
            models_list=models_list_task.output,
            models_path_prefix="/output/model/model/hf_format",
            judge_model_path="/model/granite-7b-base",
        )
        mount_pvc(
            task=run_mt_bench_task, pvc_name=model_pvc_task.output, mount_path="/model"
        )

        load_mt_bench_results_task = load_mt_bench_results_op(
            mt_bench_output=run_mt_bench_task.outputs['mt_bench_output'],
        )

        best_model = run_mt_bench_task.outputs['best_model']
        best_score = run_mt_bench_task.outputs['best_score']
        print(f"Best Model after MT_Bench: {best_model} with score: {best_score}")

        run_mt_bench_task.set_accelerator_type('nvidia.com/gpu')
        run_mt_bench_task.set_accelerator_limit(1)

        output_data_task.after(kubectl_wait_task)
        output_model_task.set_caching_options(False)
        #output_model_task.after(run_mmlu_task)
        output_model_task.after(run_mt_bench_task)
        mount_pvc(
            task=output_data_task, pvc_name=output_pvc_task.output, mount_path="/output/model"
        )

        output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
        output_pvc_delete_task.after(output_model_task, output_data_task)
        output_pvc_delete_task.after(output_model_task, output_data_task, run_mt_bench_task, load_mt_bench_results_task)

        return

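The new evaluation stage lists models from the training output PVC, runs run_mt_bench_op over them on a GPU, and exposes best_model and best_score as task outputs. How the wrapped pipeline is compiled or submitted is not part of this diff; a minimal sketch with the KFP v2 SDK, assuming pipeline_wrapper returns the decorated pipeline function (names instructlab_pipeline and pipeline.yaml are placeholders):

from kfp import compiler

from pipeline import pipeline_wrapper

# Hypothetical handle to the @dsl.pipeline-decorated function defined inside
# pipeline_wrapper(); the real entry point is not shown in this diff.
instructlab_pipeline = pipeline_wrapper(mock=[])

# Compile the pipeline, including the new MT_BENCH tasks, to IR YAML.
compiler.Compiler().compile(
    pipeline_func=instructlab_pipeline,
    package_path="pipeline.yaml",
)

The resulting YAML can then be uploaded through the Kubeflow Pipelines UI or submitted with kfp.Client.
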
