Skip to content

Commit

Permalink
Shortfin LLM Deviceid Support (nod-ai#493)
Browse files Browse the repository at this point in the history
# Description

Add the ability to specify which device IDs the Shortfin LLM server runs on.
The setup closely mirrors how the SD server already handles `device_ids`.

Created a new `shortfin/interop/support/device_setup.py` module and
moved the `get_selected_devices` function there so that it can be shared
by the LLM and SD `manager.py` modules.

## Example

```bash
python -m shortfin_apps.llm.server --tokenizer_json=/data/llama3.1/8b/tokenizer.json \
  --model_config=./export/edited_config.json --vmfb=./export/model.vmfb \
  --parameters=/data/llama3.1/8b/llama8b_f16.irpa \
  --device=amdgpu --device_ids=0
```
  • Loading branch information
stbaione authored Nov 13, 2024
1 parent 903d3c1 commit ce6ccf8
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 31 deletions.
26 changes: 26 additions & 0 deletions shortfin/python/shortfin/interop/support/device_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import shortfin as sf


def get_selected_devices(sb: sf.SystemBuilder, device_ids=None):
    """Resolve requested device ids against the builder's available devices.

    Each entry of ``device_ids`` may be:

    * a device id exactly as it appears in ``sb.available_devices``,
    * an integer index into ``sb.available_devices``, or
    * a string holding such an index (e.g. ``"0"``).

    Negative indices count from the end, following normal Python list
    indexing.

    Args:
        sb: System builder whose ``available_devices`` is consulted.
        device_ids: Sized collection of ids/indices to select, or ``None``
            to select every available device.

    Returns:
        The selected devices, in the order they were requested. When
        ``device_ids`` is ``None`` the ``available_devices`` value itself is
        returned.

    Raises:
        ValueError: If more ids are requested than devices are available, if
            an index is out of range, or if an entry is neither a known
            device id nor an index.
    """
    available = sb.available_devices
    if device_ids is None:
        return available

    if len(device_ids) > len(available):
        raise ValueError(
            f"Requested more device ids ({device_ids}) than available ({available})."
        )

    selected = []
    for did in device_ids:
        if isinstance(did, str):
            # Numeric strings (e.g. values parsed from a command line) are
            # treated as indices; anything else is kept as a device id.
            try:
                did = int(did)
            except ValueError:
                pass
        if did in available:
            selected.append(did)
        elif isinstance(did, int):
            # Report a bad index the same way as every other invalid input
            # instead of letting a bare IndexError escape from the lookup.
            if not -len(available) <= did < len(available):
                raise ValueError(
                    f"Device index {did} is out of range for available devices ({available})."
                )
            selected.append(available[did])
        else:
            raise ValueError(f"Device id {did} could not be parsed.")
    return selected
15 changes: 11 additions & 4 deletions shortfin/python/shortfin_apps/llm/components/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,23 @@
import threading

import shortfin as sf
from shortfin.interop.support.device_setup import get_selected_devices

logger = logging.getLogger(__name__)


class SystemManager:
def __init__(self, device="local-task"):
if device == "local-task":
def __init__(self, device="local-task", device_ids=None, async_allocs=True):
if any(x in device for x in ["local-task", "cpu"]):
self.ls = sf.host.CPUSystemBuilder().create_system()
elif device == "hip":
self.ls = sf.amdgpu.SystemBuilder().create_system()
elif any(x in device for x in ["hip", "amdgpu"]):
sb = sf.SystemBuilder(
system_type="amdgpu", amdgpu_async_allocations=async_allocs
)
if device_ids:
sb.visible_devices = sb.available_devices
sb.visible_devices = get_selected_devices(sb, device_ids)
self.ls = sb.create_system()
logger.info(f"Created local system with {self.ls.device_names} devices")
# TODO: Come up with an easier bootstrap thing than manually
# running a thread.
Expand Down
21 changes: 19 additions & 2 deletions shortfin/python/shortfin_apps/llm/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ def get_eos_from_tokenizer_config(json_path):

def configure(args) -> SystemManager:
# Setup system (configure devices, etc).
sysman = SystemManager(device=args.device)
sysman = SystemManager(
device=args.device,
device_ids=args.device_ids,
async_allocs=args.amdgpu_async_allocations,
)

# Setup each service we are hosting.
eos_token = get_eos_from_tokenizer_config(args.tokenizer_config_json)
Expand Down Expand Up @@ -155,16 +159,29 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG):
parser.add_argument(
"--device",
type=str,
default="local-task",
required=True,
choices=["local-task", "hip", "amdgpu"],
help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
)
parser.add_argument(
"--device_ids",
type=str,
nargs="*",
default=None,
help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
)
parser.add_argument(
"--isolation",
type=str,
default="per_call",
choices=[isolation.name.lower() for isolation in ProgramIsolation],
help="Concurrency control -- How to isolate programs.",
)
parser.add_argument(
"--amdgpu_async_allocations",
action="store_true",
help="Enable asynchronous allocations for amdgpu device contexts.",
)
args = parser.parse_args(argv)

if args.tokenizer_config_json is None:
Expand Down
26 changes: 1 addition & 25 deletions shortfin/python/shortfin_apps/sd/components/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,35 +8,11 @@
import threading

import shortfin as sf
from shortfin.interop.support.device_setup import get_selected_devices

logger = logging.getLogger(__name__)


def get_selected_devices(sb: sf.SystemBuilder, device_ids=None):
    """Resolve requested device ids against the builder's available devices.

    Each entry of ``device_ids`` may be a device id exactly as it appears in
    ``sb.available_devices``, an integer index into that list, or a string
    holding such an index (e.g. ``"0"``). Negative indices count from the
    end, following normal Python list indexing.

    Args:
        sb: System builder whose ``available_devices`` is consulted.
        device_ids: Sized collection of ids/indices to select, or ``None``
            to select every available device.

    Returns:
        The selected devices, in the order they were requested. When
        ``device_ids`` is ``None`` the ``available_devices`` value itself is
        returned.

    Raises:
        ValueError: If more ids are requested than devices are available, if
            an index is out of range, or if an entry is neither a known
            device id nor an index.
    """
    available = sb.available_devices
    if device_ids is None:
        return available

    # Strictly greater-than: asking for exactly as many devices as exist is
    # valid, and is what the error message below describes.
    if len(device_ids) > len(available):
        raise ValueError(
            f"Requested more device ids ({device_ids}) than available ({available})."
        )

    selected = []
    for did in device_ids:
        if isinstance(did, str):
            # Numeric strings are treated as indices; any other string is
            # kept as-is and matched against the available device ids.
            try:
                did = int(did)
            except ValueError:
                pass
        if did in available:
            selected.append(did)
        elif isinstance(did, int):
            # Surface a bad index as a ValueError like other invalid input,
            # rather than a bare IndexError from the list lookup.
            if not -len(available) <= did < len(available):
                raise ValueError(
                    f"Device index {did} is out of range for available devices ({available})."
                )
            selected.append(available[did])
        else:
            raise ValueError(f"Device id {did} could not be parsed.")
    return selected


class SystemManager:
def __init__(self, device="local-task", device_ids=None, async_allocs=True):
if any(x in device for x in ["local-task", "cpu"]):
Expand Down

0 comments on commit ce6ccf8

Please sign in to comment.