
Commit

add note about magic word auto and update subprocess cmds
Signed-off-by: sallyom <[email protected]>
sallyom committed Sep 25, 2024
1 parent c35f4b5 commit 5c53958
Showing 2 changed files with 59 additions and 58 deletions.
14 changes: 7 additions & 7 deletions eval/mt_bench/components.py
@@ -12,6 +12,9 @@ def run_mt_bench_op(
    models_list: List[str],
    mt_bench_output: Output[Artifact],
    merge_system_user_message: bool,
    # generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
    # with `auto`, number of gpus allocated for serving is calculated based on environment
    # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
    max_workers: str = "auto",
    device: str = None,
) -> NamedTuple('outputs', best_model=str, best_score=float):
@@ -23,15 +26,11 @@ def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5):
        import sys  # needed for sys.executable below
        import requests

        if gpu_count > 0:
            command = (
                f"nohup python3.11 -m vllm.entrypoints.openai.api_server "
                f"--model {model_path} "
                f"--tensor-parallel-size {gpu_count} &"
            )
            command = [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", model_path, "--tensor-parallel-size", str(gpu_count)]
        else:
            command = f"nohup python3.11 -m vllm.entrypoints.openai.api_server --model {model_path} &"
            command = [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", model_path]

        subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        subprocess.Popen(args=command)

        server_url = "http://localhost:8000/v1"
        print(f"Waiting for vLLM server to start at {server_url}...")
@@ -89,6 +88,7 @@ def stop_vllm_server_by_name():

    print(f"GPU Available: {gpu_available}, {gpu_name}")

    # See note above about magic word "auto"
    if max_workers == "auto":
        try:
            usable_cpu_count = len(os.sched_getaffinity(0)) // 2
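The resolved max_workers value is what ultimately reaches the mt_bench evaluator. Below is a minimal sketch of that hand-off, mirroring the commented-out MTBenchEvaluator construction further down in pipeline.yaml; the model names are placeholders and the constructor arguments are assumed from that comment rather than taken from library docs.

# Sketch only; not part of this commit. Mirrors the commented-out
# MTBenchEvaluator call later in this diff. Model names are placeholders.
from instructlab.eval.mt_bench import MTBenchEvaluator

evaluator = MTBenchEvaluator(
    model_name="candidate-model",
    judge_model_name="judge-model",
    max_workers="auto",  # the magic word: the evaluator derives a worker count from the environment
    merge_system_user_message=False,
)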
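For reference, a standalone sketch of the subprocess change, runnable outside the component; the model path and GPU count are illustrative values, and only the new argument-list form is actually launched.

import subprocess
import sys

model_path = "/tmp/models/candidate"  # illustrative value
gpu_count = 2                         # illustrative value

# Old form: a single shell string run with shell=True; quoting inside
# model_path is handed straight to the shell, and nohup/& are needed to
# detach. Kept here only for comparison, never executed.
old_command = (
    f"nohup python3.11 -m vllm.entrypoints.openai.api_server "
    f"--model {model_path} --tensor-parallel-size {gpu_count} &"
)

# New form: an argument list executed without a shell. Popen returns without
# waiting, so nohup/& is unnecessary, and sys.executable reuses the
# interpreter the component runs under instead of hard-coding python3.11.
new_command = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", model_path,
    "--tensor-parallel-size", str(gpu_count),
]
subprocess.Popen(new_command)

Because the argument list never goes through a shell, there is no nohup/& to manage; cleanup still falls to the psutil-based stop_vllm_server_by_name helper.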
103 changes: 52 additions & 51 deletions pipeline.yaml
@@ -922,60 +922,61 @@ deploymentSpec:
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_mt_bench_op(\n models_path_prefix: str,\n models_list:\
\ List[str],\n mt_bench_output: Output[Artifact],\n merge_system_user_message:\
\ bool,\n max_workers: str = \"auto\",\n device: str = None,\n) ->\
\ NamedTuple('outputs', best_model=str, best_score=float):\n\n\n def\
\ launch_vllm_server_background(model_path: str, gpu_count: int, retries:\
\ int = 60, delay: int = 5):\n import subprocess\n import\
\ time\n import requests\n\n if gpu_count > 0:\n \
\ command = (\n f\"nohup python3.11 -m vllm.entrypoints.openai.api_server\
\ \"\n f\"--model {model_path} \"\n f\"--tensor-parallel-size\
\ {gpu_count} &\"\n )\n else:\n command = f\"\
nohup python3.11 -m vllm.entrypoints.openai.api_server --model {model_path}\
\ &\"\n\n subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,\
\ stderr=subprocess.PIPE)\n\n server_url = \"http://localhost:8000/v1\"\
\n print(f\"Waiting for vLLM server to start at {server_url}...\"\
)\n\n for attempt in range(retries):\n try:\n \
\ response = requests.get(f\"{server_url}/models\")\n \
\ if response.status_code == 200:\n print(f\"vLLM\
\ server is up and running at {server_url}.\")\n return\n\
\ except requests.ConnectionError:\n pass\n\n\
\ print(f\"Server not available yet, retrying in {delay} seconds\
\ (Attempt {attempt + 1}/{retries})...\")\n time.sleep(delay)\n\
\n raise RuntimeError(f\"Failed to start vLLM server at {server_url}\
\ after {retries} retries.\")\n\n # This seems like excessive effort\
\ to stop the vllm process, but merely saving & killing the pid doesn't\
\ work\n # Also, the base image does not include `pkill` cmd, so can't\
\ pkill -f vllm.entrypoints.openai.api_server either\n def stop_vllm_server_by_name():\n\
\ import psutil\n\n for process in psutil.process_iter(attrs=[\"\
pid\", \"name\", \"cmdline\"]):\n cmdline = process.info.get(\"\
cmdline\")\n if cmdline and \"vllm.entrypoints.openai.api_server\"\
\ in cmdline:\n print(f\"Found vLLM server process with PID:\
\ {process.info['pid']}, terminating...\")\n try:\n \
\ process.terminate() # Try graceful termination\n \
\ process.wait(timeout=5) # Wait a bit for it to terminate\n\
\ if process.is_running():\n print(f\"\
Forcefully killing vLLM server process with PID: {process.info['pid']}\"\
)\n process.kill() # Force kill if it's still running\n\
\ print(f\"Successfully stopped vLLM server with PID:\
\ {process.info['pid']}\")\n except psutil.NoSuchProcess:\n\
\ print(f\"Process with PID {process.info['pid']} no\
\ longer exists.\")\n except psutil.AccessDenied:\n \
\ print(f\"Access denied when trying to terminate process\
\ with PID {process.info['pid']}.\")\n except Exception as\
\ e:\n print(f\"Failed to terminate process with PID\
\ {process.info['pid']}. Error: {e}\")\n\n\n import json\n import\
\ torch\n import os\n\n from instructlab.eval import mt_bench_answers,\
\ mt_bench_judgment\n\n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"\
expandable_segments:True\"\n candidate_server_url = \"http://localhost:8000/v1\"\
\ bool,\n # generate_answers,judgment uses a magic word for its mt_bench\
\ evaluator - `auto`\n # with `auto`, number of gpus allocated for serving\
\ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ max_workers: str = \"auto\",\n device: str = None,\n) -> NamedTuple('outputs',\
\ best_model=str, best_score=float):\n\n\n def launch_vllm_server_background(model_path:\
\ str, gpu_count: int, retries: int = 60, delay: int = 5):\n        import\
\ subprocess\n        import sys\n        import time\n        import requests\n\
\n        if gpu_count > 0:\n            command = [sys.executable, \"-m\"\
, \"vllm.entrypoints.openai.api_server\", \"--model\", model_path, \"--tensor-parallel-size\"\
, str(gpu_count)]\n        else:\n            command = [sys.executable,\
\ \"-m\", \"vllm.entrypoints.openai.api_server\", \"--model\", model_path]\n\
\n        subprocess.Popen(args=command)\n\
\n server_url = \"http://localhost:8000/v1\"\n print(f\"Waiting\
\ for vLLM server to start at {server_url}...\")\n\n for attempt\
\ in range(retries):\n try:\n response = requests.get(f\"\
{server_url}/models\")\n if response.status_code == 200:\n\
\ print(f\"vLLM server is up and running at {server_url}.\"\
)\n return\n except requests.ConnectionError:\n\
\ pass\n\n print(f\"Server not available yet,\
\ retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\")\n\
\ time.sleep(delay)\n\n raise RuntimeError(f\"Failed to\
\ start vLLM server at {server_url} after {retries} retries.\")\n\n #\
\ This seems like excessive effort to stop the vllm process, but merely\
\ saving & killing the pid doesn't work\n # Also, the base image does\
\ not include `pkill` cmd, so can't pkill -f vllm.entrypoints.openai.api_server\
\ either\n def stop_vllm_server_by_name():\n import psutil\n\n\
\ for process in psutil.process_iter(attrs=[\"pid\", \"name\", \"\
cmdline\"]):\n cmdline = process.info.get(\"cmdline\")\n \
\ if cmdline and \"vllm.entrypoints.openai.api_server\" in cmdline:\n\
\ print(f\"Found vLLM server process with PID: {process.info['pid']},\
\ terminating...\")\n try:\n process.terminate()\
\ # Try graceful termination\n process.wait(timeout=5)\
\ # Wait a bit for it to terminate\n if process.is_running():\n\
\ print(f\"Forcefully killing vLLM server process\
\ with PID: {process.info['pid']}\")\n process.kill()\
\ # Force kill if it's still running\n print(f\"Successfully\
\ stopped vLLM server with PID: {process.info['pid']}\")\n \
\ except psutil.NoSuchProcess:\n print(f\"Process with\
\ PID {process.info['pid']} no longer exists.\")\n except\
\ psutil.AccessDenied:\n print(f\"Access denied when\
\ trying to terminate process with PID {process.info['pid']}.\")\n \
\ except Exception as e:\n print(f\"Failed\
\ to terminate process with PID {process.info['pid']}. Error: {e}\")\n\n\
\n import json\n import torch\n import os\n\n from instructlab.eval\
\ import mt_bench_answers, mt_bench_judgment\n\n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\
] = \"expandable_segments:True\"\n candidate_server_url = \"http://localhost:8000/v1\"\
\n\n gpu_available = torch.cuda.is_available()\n gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())\
\ if gpu_available else \"No GPU available\"\n gpu_count = torch.cuda.device_count()\
\ if gpu_available else 0\n\n print(f\"GPU Available: {gpu_available},\
\ {gpu_name}\")\n\n if max_workers == \"auto\":\n try:\n \
\ usable_cpu_count = len(os.sched_getaffinity(0)) // 2\n except\
\ AttributeError:\n usable_cpu_count = multiprocessing.cpu_count()\
\ // 2\n max_workers = usable_cpu_count\n\n # TODO: Using evaluator\
\ results in connection errors, need to determine why.\n # For\
\ now, using mt_bench_answers.generate_answers & mt_bench_judgment.generate_judgment\n\
\ {gpu_name}\")\n\n # See note above about magic word \"auto\"\n if\
\ max_workers == \"auto\":\n try:\n usable_cpu_count =\
\ len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n \
\ usable_cpu_count = multiprocessing.cpu_count() // 2\n \
\ max_workers = usable_cpu_count\n\n # TODO: Using evaluator results\
\ in connection errors, need to determine why.\n # For now, using\
\ mt_bench_answers.generate_answers & mt_bench_judgment.generate_judgment\n\
\ #evaluator = MTBenchEvaluator(\n # model_name=candidate_model_name,\n\
\ # judge_model_name=judge_model_name,\n # max_workers=max_workers,\n\
\ # merge_system_user_message=merge_system_user_message\n #)\n\n\
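Taken together, the runtime flow inside the component body after this change looks roughly like the sketch below; launch_vllm_server_background and stop_vllm_server_by_name are the nested helpers defined in this diff, the model path is a placeholder, and the answer/judgment calls are elided because their exact arguments do not appear here.

import multiprocessing
import os

# Resolve the magic word "auto" the same way the component does.
max_workers = "auto"
if max_workers == "auto":
    try:
        usable_cpu_count = len(os.sched_getaffinity(0)) // 2
    except AttributeError:
        usable_cpu_count = multiprocessing.cpu_count() // 2
    max_workers = usable_cpu_count

# launch_vllm_server_background / stop_vllm_server_by_name are in scope here
# because they are defined inside run_mt_bench_op above.
launch_vllm_server_background("/tmp/models/candidate", gpu_count=1)
try:
    # mt_bench_answers.generate_answers and mt_bench_judgment.generate_judgment
    # run here with max_workers; their arguments are omitted in this sketch.
    pass
finally:
    stop_vllm_server_by_name()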
