Shortfin LLM Deviceid Support (nod-ai#493)

# Description

Add the ability to specify the `device_ids` that the Shortfin LLM server should run with. The setup is essentially one-to-one with how the SD server sets up `device_ids` support.

Created a new `shortfin/interop/support/device_setup.py` module and moved the `get_selected_devices` function there so that it can be shared across the `managers`.

## Example

```bash
python -m shortfin_apps.llm.server --tokenizer_json=/data/llama3.1/8b/tokenizer.json --model_config=./export/edited_config.json --vmfb=./export/model.vmfb --parameters=/data/llama3.1/8b/llama8b_f16.irpa --device=amdgpu --device_ids=0
```
stbaione · Nov 13, 2024 · ce6ccf8 · ce6ccf8
1 parent 903d3c1
commit ce6ccf8
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 31 deletions.
diff --git a/shortfin/python/shortfin/interop/support/device_setup.py b/shortfin/python/shortfin/interop/support/device_setup.py
@@ -0,0 +1,26 @@
+import shortfin as sf
+
+
+def get_selected_devices(sb: sf.SystemBuilder, device_ids=None):
+    """Resolve requested device ids against ``sb.available_devices``.
+
+    Each entry may be an available device id or a non-negative index (int or
+    decimal string). Returns ``sb.available_devices`` when ``device_ids`` is
+    None. Raises ValueError for too many, unknown, or out-of-range entries.
+    """
+    available = sb.available_devices
+    if device_ids is None:
+        return available
+    if len(device_ids) > len(available):
+        raise ValueError(
+            f"Requested more device ids ({device_ids}) than available ({available})."
+        )
+    selected = []
+    for did in device_ids:
+        if did in available:
+            selected.append(did)
+        elif str(did).isdecimal() and int(did) < len(available):
+            selected.append(available[int(did)])
+        else:
+            raise ValueError(f"Device id {did} is not an available id or index.")
+    return selected
diff --git a/shortfin/python/shortfin_apps/llm/components/manager.py b/shortfin/python/shortfin_apps/llm/components/manager.py
@@ -8,16 +8,23 @@
 import threading
 
 import shortfin as sf
+from shortfin.interop.support.device_setup import get_selected_devices
 
 logger = logging.getLogger(__name__)
 
 
 class SystemManager:
-    def __init__(self, device="local-task"):
-        if device == "local-task":
+    def __init__(self, device="local-task", device_ids=None, async_allocs=True):
+        if any(x in device for x in ["local-task", "cpu"]):
             self.ls = sf.host.CPUSystemBuilder().create_system()
-        elif device == "hip":
-            self.ls = sf.amdgpu.SystemBuilder().create_system()
+        elif any(x in device for x in ["hip", "amdgpu"]):
+            sb = sf.SystemBuilder(
+                system_type="amdgpu", amdgpu_async_allocations=async_allocs
+            )
+            if device_ids:
+                sb.visible_devices = sb.available_devices
+                sb.visible_devices = get_selected_devices(sb, device_ids)
+            self.ls = sb.create_system()
         logger.info(f"Created local system with {self.ls.device_names} devices")
         # TODO: Come up with an easier bootstrap thing than manually
         # running a thread.

diff --git a/shortfin/python/shortfin_apps/llm/server.py b/shortfin/python/shortfin_apps/llm/server.py
@@ -86,7 +86,11 @@ def get_eos_from_tokenizer_config(json_path):
 
 def configure(args) -> SystemManager:
     # Setup system (configure devices, etc).
-    sysman = SystemManager(device=args.device)
+    sysman = SystemManager(
+        device=args.device,
+        device_ids=args.device_ids,
+        async_allocs=args.amdgpu_async_allocations,
+    )
 
     # Setup each service we are hosting.
     eos_token = get_eos_from_tokenizer_config(args.tokenizer_config_json)
@@ -155,16 +159,29 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG):
     parser.add_argument(
         "--device",
         type=str,
-        default="local-task",
+        required=True,
+        choices=["local-task", "hip", "amdgpu"],
         help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
     )
+    parser.add_argument(
+        "--device_ids",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
+    )
     parser.add_argument(
         "--isolation",
         type=str,
         default="per_call",
         choices=[isolation.name.lower() for isolation in ProgramIsolation],
         help="Concurrency control -- How to isolate programs.",
     )
+    parser.add_argument(
+        "--amdgpu_async_allocations",
+        action="store_true",
+        help="Enable asynchronous allocations for amdgpu device contexts.",
+    )
     args = parser.parse_args(argv)
 
     if args.tokenizer_config_json is None:

diff --git a/shortfin/python/shortfin_apps/sd/components/manager.py b/shortfin/python/shortfin_apps/sd/components/manager.py
@@ -8,35 +8,11 @@
 import threading
 
 import shortfin as sf
+from shortfin.interop.support.device_setup import get_selected_devices
 
 logger = logging.getLogger(__name__)
 
 
-def get_selected_devices(sb: sf.SystemBuilder, device_ids=None):
-    available = sb.available_devices
-    selected = []
-    if device_ids is not None:
-        if len(device_ids) >= len(available):
-            raise ValueError(
-                f"Requested more device ids ({device_ids}) than available ({available})."
-            )
-        for did in device_ids:
-            if isinstance(did, str):
-                try:
-                    did = int(did)
-                except ValueError:
-                    did = did
-            if did in available:
-                selected.append(did)
-            elif isinstance(did, int):
-                selected.append(available[did])
-            else:
-                raise ValueError(f"Device id {did} could not be parsed.")
-    else:
-        selected = available
-    return selected
-
-
 class SystemManager:
     def __init__(self, device="local-task", device_ids=None, async_allocs=True):
         if any(x in device for x in ["local-task", "cpu"]):