From e552151b53950e1b1316740b5414a9a49d9c94fb Mon Sep 17 00:00:00 2001
From: thxCode
Date: Mon, 13 Jan 2025 23:41:15 +0800
Subject: [PATCH] feat: support offloading sd to multi devs

Signed-off-by: thxCode
---
 README.md                     | 37 +++++++++----
 llama-box/param.hpp           | 98 +++++++++++++++++++----------
 llama-box/stablediffusion.hpp | 16 ++++--
 3 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 6adae32..81514ca 100644
--- a/README.md
+++ b/README.md
@@ -44,23 +44,36 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp).
   see our [Reranker Collection](https://huggingface.co/collections/gpustack/reranker-6721a234527f6fcd90deedc4).
 - Support speculative decoding: draft model or n-gram lookup.
 - Support RPC server mode, which can serve as a remote inference backend.
-- Split offloading layers across multiple devices, including remote RPC server.
+- For non-image models, split offloading layers across multiple devices, including remote RPC servers.
   ```shell
   $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,2,3
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,2,3
   $ # Same as --tensor-split 1,2,3,0.
   $ # The remote RPC server will handle 1/6 of the model, the 1st GPU will handle 1/3 of the model, and the 2nd GPU will handle 1/2 of the model.
   $ # Nothing to do with the 3rd GPU.

   $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
   $ # The 2nd GPU will handle 1/2 of the model, and the 3rd GPU will handle 1/2 of the model.
   $ # Nothing to do with the remote RPC server and the 1st GPU.
   ```
+- For image models, split offloading of different components across multiple devices, including remote RPC servers.
+  ```shell
+  $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
+  $ llama-box -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,1,1
+  $ # Same as --tensor-split 1,1,1,0.
+  $ # The remote RPC server will handle the text encoder part, the 1st GPU will handle the VAE part, and the 2nd GPU will handle the diffusion part.
+  $ # Nothing to do with the 3rd GPU.
+
+  $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
+  $ llama-box -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
+  $ # The 2nd GPU will handle the text encoder and VAE parts, and the 3rd GPU will handle the diffusion part.
+  $ # Nothing to do with the remote RPC server and the 1st GPU.
+  ```
 - Support injecting `X-Request-ID` http header for tracking requests.
   ```shell
   $ # Launch box.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ...
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ...
   $ # Inject X-Request-ID: trace-id to track the request.
   $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-ID: trace-id" -d '{"model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}'
@@ -69,7 +82,7 @@
 - Support `X-Request-Tokens-Per-Second` http header for limiting the number of tokens per second.
   ```shell
   $ # Launch box with -tps -1.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --tokens-per-second -1
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --tokens-per-second -1
   $ # For level 1 users, inject X-Request-Tokens-Per-Second: 10 to limit the number of tokens per second to 10.
   $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-Tokens-Per-Second: 10" -d '{"stream": true, "model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}'
@@ -330,7 +343,6 @@ server:
       --lora-init-without-apply          load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
   -s, --seed N                           RNG seed (default: -1, use random seed for -1)
-  -mg, --main-gpu N                      the GPU to use for the model (default: 0)
   -fa, --flash-attn                      enable Flash Attention (default: disabled)
       --metrics                          enable prometheus compatible metrics endpoint (default: disabled)
       --infill                           enable infill endpoint (default: disabled)
@@ -339,20 +351,25 @@
       --rerank                           enable reranking endpoint (default: disabled)
       --slots                            enable slots monitoring endpoint (default: disabled)
       --rpc SERVERS                      comma separated list of RPC servers
+  -ts, --tensor-split SPLIT              fraction of the model to offload to each device, comma-separated list of proportions, e.g. 3,1
+                                         for image models, indicates which devices to offload to
+  -ngl, --gpu-layers, --n-gpu-layers N
+                                         number of layers to store in VRAM
+                                         '-ngl 0' means no offloading
       --no-warmup                        skip warming up the model with an empty run
+      --warmup                           enable warming up the model with an empty run, which is used to occupy the (V)RAM before serving
 
 server/completion:
 
   -dev, --device                         comma-separated list of devices to use for offloading (none = don't offload)
                                          use --list-devices to see a list of available devices
-  -ngl, --gpu-layers, --n-gpu-layers N
-                                         number of layers to store in VRAM
   -sm, --split-mode SPLIT_MODE           how to split the model across multiple GPUs, one of:
                                            - none: use one GPU only
                                            - layer (default): split layers and KV across GPUs
                                            - row: split rows across GPUs, store intermediate results and KV in --main-gpu
-  -ts, --tensor-split SPLIT              fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
+  -mg, --main-gpu N                      the device to use for the model
+                                         works with '--split-mode none|row', or indicates the device to offload the projector model specified by '--mmproj' (default: 0)
       --override-kv KEY=TYPE:VALUE       advanced option to override model metadata by key. may be specified multiple times.
                                          types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
@@ -500,7 +517,7 @@ server/images:
       --image-sampling-steps, --image-sample-steps N
                                          number of sampling steps, automatically retrieve the default value according to --model, and +2 when requesting high definition generation
       --image-cfg-scale N                the scale of classifier-free guidance(CFG), automatically retrieve the default value according to --model (1.0 = disabled)
-     --image-slg-scale N                 the scale of skip-layer guidance(SLG), only for DiT model (0.0 = disabled, default: 0.0)
+     --image-slg-scale N                 the scale of skip-layer guidance(SLG), only for DiT model, automatically retrieve the default value according to --model (0.0 = disabled)
      --image-slg-skip-layer              the layers to skip when processing SLG, may be specified multiple times.
                                         (default: 7;8;9)
     --image-slg-start N                  the phase to enable SLG (default: 0.01)
     --image-slg-end N                    the phase to disable SLG (default: 0.20)
diff --git a/llama-box/param.hpp b/llama-box/param.hpp
index cfd55ad..1781476 100644
--- a/llama-box/param.hpp
+++ b/llama-box/param.hpp
@@ -256,7 +256,6 @@ static void llama_box_params_print_usage(int, char **argv, const llama_box_params &params_
     opts.push_back({ "server", " --lora-scaled FILE SCALE", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     opts.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", llm_params.lora_init_without_apply ? "enabled" : "disabled" });
     opts.push_back({ "server", "-s, --seed N", "RNG seed (default: %d, use random seed for %d)", llm_params.sampling.seed, LLAMA_DEFAULT_SEED });
-    opts.push_back({ "server", "-mg, --main-gpu N", "the GPU to use for the model (default: %d)", llm_params.main_gpu });
     opts.push_back({ "server", "-fa, --flash-attn", "enable Flash Attention (default: %s)", llm_params.flash_attn ? "enabled" : "disabled" });
     opts.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", llm_params.endpoint_metrics ? "enabled" : "disabled" });
     opts.push_back({ "server", " --infill", "enable infill endpoint (default: %s)", params_.endpoint_infill ? "enabled" : "disabled" });
@@ -265,17 +264,22 @@
     opts.push_back({ "server", " --rerank", "enable reranking endpoint (default: %s)", llm_params.reranking ? "enabled" : "disabled" });
     opts.push_back({ "server", " --slots", "enable slots monitoring endpoint (default: %s)", llm_params.endpoint_slots ? "enabled" : "disabled" });
     opts.push_back({ "server", " --rpc SERVERS", "comma separated list of RPC servers" });
+    opts.push_back({ "server", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each device, comma-separated list of proportions, e.g. 3,1\n"
+                                                            "for image models, indicates which devices to offload to" });
+    opts.push_back({ "server", "-ngl, --gpu-layers, --n-gpu-layers N", "number of layers to store in VRAM\n"
+                                                                       "'-ngl 0' means no offloading" });
     opts.push_back({ "server", " --no-warmup", "skip warming up the model with an empty run" });
+    opts.push_back({ "server", " --warmup", "enable warming up the model with an empty run, which is used to occupy the (V)RAM before serving" });
 
     // server // completion //
     opts.push_back({ "server/completion" });
     opts.push_back({ "server/completion", "-dev, --device ", "comma-separated list of devices to use for offloading (none = don't offload)\n"
                                                              "use --list-devices to see a list of available devices" });
-    opts.push_back({ "server/completion", "-ngl, --gpu-layers, --n-gpu-layers N", "number of layers to store in VRAM" });
     opts.push_back({ "server/completion", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n"
                                                                           " - none: use one GPU only\n"
                                                                           " - layer (default): split layers and KV across GPUs\n"
                                                                           " - row: split rows across GPUs, store intermediate results and KV in --main-gpu" });
-    opts.push_back({ "server/completion", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
3,1" }); + opts.push_back({ "server/completion", "-mg, --main-gpu N", "the device to use for the model\n" + "work with --split-mode none|row', or indicate the device to offload projector model specified by '--mmproj' (default: %d)", llm_params.main_gpu }); opts.push_back({ "server/completion", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); opts.push_back({ "server/completion", " --chat-template JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" @@ -664,18 +668,6 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) { - if (i == argc) { - missing("--main-gpu"); - } - char *arg = argv[i++]; - params_.llm_params.main_gpu = std::stoi(std::string(arg)); - if (params_.llm_params.main_gpu < 0 || params_.llm_params.main_gpu >= int32_t(llama_max_devices())) { - invalid("--main-gpu"); - } - continue; - } - if (!strcmp(flag, "-fa") || !strcmp(flag, "--flash-attn")) { params_.llm_params.flash_attn = true; continue; @@ -720,14 +712,25 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - // server // completion// - - if (!strcmp(flag, "-devd") || !strcmp(flag, "--device-draft")) { + if (!strcmp(flag, "-ts") || !strcmp(flag, "--tensor-split")) { if (i == argc) { - missing("--device-draft"); + missing("--tensor-split"); + } + char *arg = argv[i++]; + const std::regex regex{R"([,/]+)"}; + std::string arg_s{arg}; + std::sregex_token_iterator it{arg_s.begin(), arg_s.end(), regex, -1}; + std::vector split_arg{it, {}}; + if (split_arg.size() >= llama_max_devices()) { + invalid("--tensor-split"); + } + for (size_t j = 0; j < llama_max_devices(); ++j) { + if (j < split_arg.size()) { + params_.llm_params.tensor_split[j] = std::stof(split_arg[j]); + } else { + params_.llm_params.tensor_split[j] = 0.0f; + } } - char *arg = argv[i++]; - params_.llm_params.speculative.devices = parse_device_list(arg); continue; } @@ -740,6 +743,27 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } + if (!strcmp(flag, "--no-warmup")) { + params_.llm_params.warmup = false; + continue; + } + + if (!strcmp(flag, "--warmup")) { + params_.llm_params.warmup = true; + continue; + } + + // server // completion// + + if (!strcmp(flag, "-devd") || !strcmp(flag, "--device-draft")) { + if (i == argc) { + missing("--device-draft"); + } + char *arg = argv[i++]; + params_.llm_params.speculative.devices = parse_device_list(arg); + continue; + } + if (!strcmp(flag, "-sm") || !strcmp(flag, "--split-mode")) { if (i == argc) { missing("--split-mode"); @@ -757,24 +781,14 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - if (!strcmp(flag, "-ts") || !strcmp(flag, "--tensor-split")) { + if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) { if (i == argc) { - missing("--tensor-split"); - } - char *arg = argv[i++]; - const std::regex regex{R"([,/]+)"}; - std::string arg_s{arg}; - std::sregex_token_iterator it{arg_s.begin(), arg_s.end(), regex, -1}; - std::vector split_arg{it, {}}; - if (split_arg.size() >= llama_max_devices()) { - invalid("--tensor-split"); + missing("--main-gpu"); } - for (size_t j = 0; j < llama_max_devices(); ++j) { - if (j < split_arg.size()) { - params_.llm_params.tensor_split[j] = 
-                } else {
-                    params_.llm_params.tensor_split[j] = 0.0f;
-                }
+            char *arg = argv[i++];
+            params_.llm_params.main_gpu = std::stoi(std::string(arg));
+            if (params_.llm_params.main_gpu < 0 || params_.llm_params.main_gpu >= int32_t(llama_max_devices())) {
+                invalid("--main-gpu");
            }
            continue;
        }
@@ -1560,11 +1574,6 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
            continue;
        }
 
-        if (!strcmp(flag, "--no-warmup")) {
-            params_.llm_params.warmup = false;
-            continue;
-        }
-
        if (!strcmp(flag, "--spm-infill")) {
            params_.llm_params.spm_infill = true;
            continue;
        }
@@ -1755,7 +1764,7 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
            }
            char *arg = argv[i++];
            params_.sd_params.sampling.guidance = std::stof(std::string(arg));
-            if (params_.sd_params.sampling.guidance < 0.0f) {
+            if (params_.sd_params.sampling.guidance < 1.0f) {
                invalid("--image-guidance");
            }
            continue;
        }
@@ -2111,9 +2120,10 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
        params_.sd_params.warmup = params_.llm_params.warmup;
        params_.sd_params.flash_attn = params_.llm_params.flash_attn;
        params_.sd_params.n_threads = params_.llm_params.cpuparams.n_threads;
-        params_.sd_params.main_gpu = params_.llm_params.main_gpu;
        params_.sd_params.lora_init_without_apply = params_.llm_params.lora_init_without_apply;
        params_.sd_params.lora_adapters = params_.llm_params.lora_adapters;
+        params_.sd_params.rpc_servers = params_.llm_params.rpc_servers;
+        params_.sd_params.tensor_split = params_.llm_params.tensor_split;
    }
 
    return true;
diff --git a/llama-box/stablediffusion.hpp b/llama-box/stablediffusion.hpp
index 9becf95..4788398 100644
--- a/llama-box/stablediffusion.hpp
+++ b/llama-box/stablediffusion.hpp
@@ -59,9 +59,10 @@ struct stablediffusion_params {
    bool warmup = true;
    bool flash_attn = false;
    int n_threads = 1;
-    int main_gpu = 0;
    bool lora_init_without_apply = false;
    std::vector lora_adapters = {};
+    std::string rpc_servers = "";
+    float *tensor_split = nullptr;
 };
 
 struct stablediffusion_sampling_stream {
@@ -157,7 +158,7 @@ int stablediffusion_context::get_default_sampling_steps() {
            return 20;
        case VERSION_SDXL: // including Turbo
        case VERSION_SDXL_REFINER:
-            return 40;
+            return 25;
        case VERSION_SD3: // including Turbo
        case VERSION_FLUX: // including Schnell
        default:
@@ -377,6 +378,7 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
    rng_type_t rng_type = CUDA_RNG;
    bool vae_decode_only = false;
    bool free_params_immediately = false;
+    bool tae_preview_only = false;
 
    sd_ctx_t *sd_ctx = new_sd_ctx(
        params.model.c_str(),
@@ -402,7 +404,9 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
        !params.control_model_offload,
        !params.vae_model_offload,
        params.flash_attn,
-        params.main_gpu);
+        tae_preview_only,
+        params.rpc_servers.c_str(),
+        params.tensor_split);
    if (sd_ctx == nullptr) {
        LOG_ERR("%s: failed to create stable diffusion context\n", __func__);
        return nullptr;
@@ -410,7 +414,11 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
    upscaler_ctx_t *upscaler_ctx = nullptr;
    if (!params.upscale_model.empty()) {
-        upscaler_ctx = new_upscaler_ctx(params.upscale_model.c_str(), params.n_threads, params.main_gpu);
+        upscaler_ctx = new_upscaler_ctx(
+            params.upscale_model.c_str(),
+            params.n_threads,
+            params.rpc_servers.c_str(),
+            params.tensor_split);
        if (upscaler_ctx == nullptr) {
            LOG_ERR("%s: failed to create upscaler context\n", __func__);
            free_sd_ctx(sd_ctx);
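
The two image-model `--tensor-split` examples in the README hunk above imply a simple selection rule: devices with a non-zero proportion are eligible, the diffusion part goes to the last eligible device, the text encoder goes to the first, and the VAE goes to the second when three or more devices are eligible (otherwise it shares the first). The snippet below is only a sketch of that inferred mapping; `component_device` is a hypothetical helper, and the authoritative assignment is whatever `new_sd_ctx` in stable-diffusion.cpp does with `rpc_servers` and `tensor_split`, which may differ.

```cpp
// A minimal sketch of the device selection implied by the README examples above.
// component_device() is a hypothetical helper for illustration only.
#include <cstdio>
#include <vector>

enum sd_component { SD_TEXT_ENCODER, SD_VAE, SD_DIFFUSION };

// Devices are indexed as llama-box lists them: RPC servers first, then GPUs.
// A non-zero tensor-split entry marks a device as eligible for offloading.
static int component_device(const std::vector<float> &tensor_split, sd_component c) {
    std::vector<int> eligible;
    for (int i = 0; i < (int) tensor_split.size(); ++i) {
        if (tensor_split[i] > 0.0f) {
            eligible.push_back(i);
        }
    }
    if (eligible.empty()) {
        return 0; // no split given: everything stays on the first device
    }
    switch (c) {
        case SD_TEXT_ENCODER: return eligible.front();
        case SD_VAE:          return eligible.size() >= 3 ? eligible[1] : eligible.front();
        case SD_DIFFUSION:    return eligible.back(); // heaviest component gets the last eligible device
    }
    return eligible.front();
}

int main() {
    // --tensor-split 1,1,1,0 -> text encoder: device 0 (RPC), VAE: device 1 (1st GPU), diffusion: device 2 (2nd GPU)
    // --tensor-split 0,0,1,1 -> text encoder and VAE: device 2 (2nd GPU), diffusion: device 3 (3rd GPU)
    for (const std::vector<float> &split : {std::vector<float>{1, 1, 1, 0}, std::vector<float>{0, 0, 1, 1}}) {
        printf("text encoder -> %d, vae -> %d, diffusion -> %d\n",
               component_device(split, SD_TEXT_ENCODER),
               component_device(split, SD_VAE),
               component_device(split, SD_DIFFUSION));
    }
    return 0;
}
```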