add MT_BENCH to pipeline #34

Merged
merged 4 commits on Sep 25, 2024
2 changes: 1 addition & 1 deletion Pipfile
@@ -10,4 +10,4 @@ click = "*"
 [dev-packages]

 [requires]
-python_version = "3.12"
+python_version = "3.11"
48 changes: 20 additions & 28 deletions Pipfile.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions eval/mmlu/components.py
@@ -2,7 +2,7 @@
 # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
 from typing import List, NamedTuple
 from kfp.dsl import component, Input, Output, Artifact, Model, importer
-from utils.consts import TOOLBOX_IMAGE
+from utils.consts import PYTHON_IMAGE

 EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"

@@ -15,7 +15,7 @@ def run_mmlu_op(
     few_shots: int,
     batch_size: int,
     device: str,
-    models_list: List[int],
+    models_list: List[str],
 ) -> NamedTuple('outputs', best_model=str, best_score=float):
     import json
     import os
@@ -73,7 +73,7 @@ def run_mmlu_op(
     best_score = scores[best_model]
     return outputs(best_model=best_model, best_score=best_score)

-@component(base_image=TOOLBOX_IMAGE)
+@component(base_image=PYTHON_IMAGE)
 def load_mmlu_results_op(mmlu_output: Input[Artifact]) -> list:
     import json
5 changes: 5 additions & 0 deletions eval/mt_bench/__init__.py
@@ -0,0 +1,5 @@
from .components import run_mt_bench_op, load_mt_bench_results_op
#from . import faked

__all__ = ["run_mt_bench_op", "load_mt_bench_results_op"]

187 changes: 187 additions & 0 deletions eval/mt_bench/components.py
@@ -0,0 +1,187 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
from typing import List, NamedTuple
from kfp.dsl import component, Input, Output, Artifact, Model, importer
from utils.consts import PYTHON_IMAGE

EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"

@component(base_image=EVAL_IMAGE, packages_to_install=["vllm"])
def run_mt_bench_op(
    models_path_prefix: str,
    models_list: List[str],
    mt_bench_output: Output[Artifact],
    merge_system_user_message: bool,
    # generate_answers/judgment uses a magic word for its mt_bench evaluator - `auto`
    # with `auto`, the number of gpus allocated for serving is calculated based on the environment
    # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
    max_workers: str = "auto",
    device: str = None,
) -> NamedTuple('outputs', best_model=str, best_score=float):

    def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5):
        import subprocess
        import sys
        import time
        import requests

        if gpu_count > 0:
            command = [
                sys.executable,
                "-m", "vllm.entrypoints.openai.api_server",
                "--model", model_path,
                "--tensor-parallel-size", str(gpu_count),
            ]
        else:
            command = [
                sys.executable,
                "-m", "vllm.entrypoints.openai.api_server",
                "--model", model_path,
            ]

        subprocess.Popen(args=command)

        server_url = "http://localhost:8000/v1"
        print(f"Waiting for vLLM server to start at {server_url}...")

        for attempt in range(retries):
            try:
                response = requests.get(f"{server_url}/models")
                if response.status_code == 200:
                    print(f"vLLM server is up and running at {server_url}.")
                    return
            except requests.ConnectionError:
                pass

            print(f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...")
            time.sleep(delay)

        raise RuntimeError(f"Failed to start vLLM server at {server_url} after {retries} retries.")

    # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
    # Also, the base image does not include `pkill`, so `pkill -f vllm.entrypoints.openai.api_server` is not an option either
    def stop_vllm_server_by_name():
        import psutil

        for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
            cmdline = process.info.get("cmdline")
            if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
                print(f"Found vLLM server process with PID: {process.info['pid']}, terminating...")
                try:
                    process.terminate()  # Try graceful termination
                    process.wait(timeout=5)  # Wait a bit for it to terminate
                    if process.is_running():
                        print(f"Forcefully killing vLLM server process with PID: {process.info['pid']}")
                        process.kill()  # Force kill if it's still running
                    print(f"Successfully stopped vLLM server with PID: {process.info['pid']}")
                except psutil.NoSuchProcess:
                    print(f"Process with PID {process.info['pid']} no longer exists.")
                except psutil.AccessDenied:
                    print(f"Access denied when trying to terminate process with PID {process.info['pid']}.")
                except Exception as e:
                    print(f"Failed to terminate process with PID {process.info['pid']}. Error: {e}")
Comment on lines +62 to +83 (Collaborator):
It was not working in the first place because vLLM does not handle signals properly. It expects to be run from the CLI and thus stopped via SIGINT.

If you use a process group in the instantiation above, you can get rid of the current PID-search logic and simply do:

process_group_id = os.getpgid(process.pid)
process.send_signal(signal.SIGINT)
try:
    print("Waiting for vLLM server to shut down gracefully")
    process.wait(timeout)
except subprocess.TimeoutExpired:
    print(
        f"Sending SIGKILL to vLLM server since timeout ({timeout}s) expired"
    )
    process.kill()

# Attempt to cleanup any remaining child processes
# Make sure process_group is legit (> 1) before trying
if process_group_id and process_group_id > 1:
    try:
        os.killpg(process_group_id, signal.SIGKILL)
        print("Sent SIGKILL to vLLM process group")
    except ProcessLookupError:
        print("Nothing left to clean up with the vLLM process group")
else:
    print("vLLM process group id not found")

Reply (PR author):

Thanks! Please add in a follow-up, I need to be done with this PR 🤣
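For reference, the cleanup snippet above assumes the vLLM server was launched in its own process group. A minimal sketch of such a launch, under that assumption (the helper name and the use of start_new_session are illustrative and not part of this PR):

import subprocess
import sys

def launch_vllm_in_process_group(model_path: str) -> subprocess.Popen:
    # start_new_session=True starts the server in a new session (and thus a new
    # process group), so the os.getpgid(process.pid) / os.killpg() calls in the
    # suggestion above can target the whole group, including any worker
    # children vLLM spawns.
    return subprocess.Popen(
        [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", model_path],
        start_new_session=True,
    )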



    import json
    import multiprocessing  # used by the cpu_count() fallback below
    import torch
    import os

    from instructlab.eval import mt_bench_answers, mt_bench_judgment

    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    candidate_server_url = "http://localhost:8000/v1"

    gpu_available = torch.cuda.is_available()
    gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if gpu_available else "No GPU available"
    gpu_count = torch.cuda.device_count() if gpu_available else 0

    print(f"GPU Available: {gpu_available}, {gpu_name}")

    # See note above about magic word "auto"
    if max_workers == "auto":
        try:
            usable_cpu_count = len(os.sched_getaffinity(0)) // 2
        except AttributeError:
            usable_cpu_count = multiprocessing.cpu_count() // 2
        max_workers = usable_cpu_count

    # TODO: Using evaluator results in connection errors, need to determine why.
    # For now, using mt_bench_answers.generate_answers & mt_bench_judgment.generate_judgment
    #evaluator = MTBenchEvaluator(
    #    model_name=candidate_model_name,
    #    judge_model_name=judge_model_name,
    #    max_workers=max_workers,
    #    merge_system_user_message=merge_system_user_message
    #)

    judge_api_key = os.getenv("JUDGE_API_KEY", "")
    judge_model_name = os.getenv("JUDGE_NAME")
    judge_endpoint = os.getenv("JUDGE_ENDPOINT")

    scores = {}
    all_mt_bench_data = []

    for model_name in models_list:
        print(f"Serving candidate model: {model_name}")
        model_path = f"{models_path_prefix}/{model_name}"

        # Launch the vLLM server and wait until it is ready
        launch_vllm_server_background(model_path, gpu_count)

        # model ID is the model_path value in vLLM
        print("Generating answers...")
        mt_bench_answers.generate_answers(
            model_name=model_path,
            model_api_base=candidate_server_url,
            output_dir="/tmp/eval_output",
            max_workers=max_workers
        )

        print("Judging answers...")
        overall_score, qa_pairs, turn_scores, error_rate = mt_bench_judgment.generate_judgment(
            model_name=model_path,
            judge_model_name=judge_model_name,
            model_api_base=judge_endpoint,
            api_key=judge_api_key,
            output_dir="/tmp/eval_output",
            max_workers=max_workers,
            merge_system_user_message=merge_system_user_message
        )

        stop_vllm_server_by_name()

        mt_bench_data = {
            "report_title": "SKILLS EVALUATION REPORT",
            "model": model_path,
            "judge_model": judge_model_name,
            "overall_score": overall_score,
            "turn_scores": turn_scores,
            "qa_scores": qa_pairs,
            "error_rate": error_rate,
        }

        all_mt_bench_data.append(mt_bench_data)
        scores[model_path] = overall_score

    with open(mt_bench_output.path, 'w') as f:
        json.dump(all_mt_bench_data, f, indent=4)

    outputs = NamedTuple('outputs', best_model=str, best_score=float)
    best_model = max(scores, key=scores.get)
    best_score = scores[best_model]
    return outputs(best_model=best_model, best_score=best_score)

@component(base_image=PYTHON_IMAGE)
def load_mt_bench_results_op(mt_bench_output: Input[Artifact]) -> list:
    import json

    mt_bench_score_list = []
    with open(mt_bench_output.path, 'r') as f:
        mt_bench_score_list = json.load(f)

    print("MT_Bench Evaluation Data:")
    for mt_bench_score in mt_bench_score_list:
        print(json.dumps(mt_bench_score, indent=4))

    return mt_bench_score_list
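To make the end-to-end flow easier to picture, here is a minimal sketch (not taken from this PR's diff) of how the two new ops could be chained in a KFP v2 pipeline. The pipeline name, parameter names, and defaults below are illustrative assumptions; only the component imports come from this repo.

from typing import List

from kfp import dsl

from eval.mt_bench import run_mt_bench_op, load_mt_bench_results_op


@dsl.pipeline(name="mt-bench-eval")
def mt_bench_eval_pipeline(
    models_path_prefix: str,
    models_list: List[str],
    merge_system_user_message: bool = False,
):
    # run_mt_bench_op's Output[Artifact] parameter is populated by KFP and
    # surfaced as the task output named "mt_bench_output"; the NamedTuple
    # fields are available as outputs["best_model"] and outputs["best_score"].
    # In a real pipeline the JUDGE_API_KEY / JUDGE_NAME / JUDGE_ENDPOINT
    # environment variables read by the component would also need to be
    # injected, e.g. from a Kubernetes secret.
    mt_bench_task = run_mt_bench_op(
        models_path_prefix=models_path_prefix,
        models_list=models_list,
        merge_system_user_message=merge_system_user_message,
        max_workers="auto",
    )
    load_mt_bench_results_op(mt_bench_output=mt_bench_task.outputs["mt_bench_output"])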