From e552151b53950e1b1316740b5414a9a49d9c94fb Mon Sep 17 00:00:00 2001
From: thxCode
Date: Mon, 13 Jan 2025 23:41:15 +0800
Subject: [PATCH] feat: support offloading sd to multi devs

Signed-off-by: thxCode
---
 README.md                     | 37 +++++++++----
 llama-box/param.hpp           | 98 +++++++++++++++++++----------
 llama-box/stablediffusion.hpp | 16 ++++--
 3 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 6adae32..81514ca 100644
--- a/README.md
+++ b/README.md
@@ -44,23 +44,36 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp).
   see our [Reranker Collection](https://huggingface.co/collections/gpustack/reranker-6721a234527f6fcd90deedc4).
 - Support speculative decoding: draft model or n-gram lookup.
 - Support RPC server mode, which can serve as a remote inference backend.
-- Split offloading layers across multiple devices, including remote RPC server.
+- For non-image models, split offloading layers across multiple devices, including remote RPC servers.
   ```shell
   $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,2,3
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,2,3
   $ # Same as --tensor-split 1,2,3,0.
   $ # The remote RPC server will handle 1/6 of the model, the 1st GPU will handle 1/3 of the model, and the 2nd GPU will handle 1/2 of the model.
   $ # Nothing to do with the 3rd GPU.

   $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
   $ # The 2nd GPU will handle 1/2 of the model, and the 3rd GPU will handle 1/2 of the model.
   $ # Nothing to do with the remote RPC server and the 1st GPU.
   ```
+- For image models, split offloading of different components across multiple devices, including remote RPC servers.
+  ```shell
+  $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
+  $ llama-box -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 1,1,1
+  $ # Same as --tensor-split 1,1,1,0.
+  $ # The remote RPC server will handle the text encoder part, the 1st GPU will handle the VAE part, and the 2nd GPU will handle the diffusion part.
+  $ # Nothing to do with the 3rd GPU.
+
+  $ # Assume that there is 1 remote RPC server and 3 available GPUs, launch box as below.
+  $ llama-box -np 4 --host 0.0.0.0 -m ... --rpc remote-ip:remote-port --tensor-split 0,0,1,1
+  $ # The 2nd GPU will handle the text encoder and VAE parts, and the 3rd GPU will handle the diffusion part.
+  $ # Nothing to do with the remote RPC server and the 1st GPU.
+  ```
 - Support injecting `X-Request-ID` http header for tracking requests.
   ```shell
   $ # Launch box.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ...
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ...
   $ # Inject X-Request-ID: trace-id to track the request.
   $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-ID: trace-id" -d '{"model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}'
@@ -69,7 +82,7 @@
 - Support `X-Request-Tokens-Per-Second` http header for limiting the number of tokens per second.
   ```shell
   $ # Launch box with -tps -1.
-  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --tokens-per-second -1
+  $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ... --tokens-per-second -1
   $ # For level 1 users, inject X-Request-Tokens-Per-Second: 10 to limit the number of tokens per second to 10.
   $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-Tokens-Per-Second: 10" -d '{"stream": true, "model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}'
@@ -330,7 +343,6 @@ server:
       --lora-init-without-apply          load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
   -s, --seed N                           RNG seed (default: -1, use random seed for -1)
-  -mg, --main-gpu N                      the GPU to use for the model (default: 0)
   -fa, --flash-attn                      enable Flash Attention (default: disabled)
       --metrics                          enable prometheus compatible metrics endpoint (default: disabled)
       --infill                           enable infill endpoint (default: disabled)
@@ -339,20 +351,25 @@
       --rerank                           enable reranking endpoint (default: disabled)
       --slots                            enable slots monitoring endpoint (default: disabled)
       --rpc SERVERS                      comma separated list of RPC servers
+  -ts, --tensor-split SPLIT              fraction of the model to offload to each device, comma-separated list of proportions, e.g. 3,1
+                                         for image models, indicates which devices to offload to
+  -ngl, --gpu-layers, --n-gpu-layers N
+                                         number of layers to store in VRAM
+                                         '-ngl 0' means no offloading
       --no-warmup                        skip warming up the model with an empty run
+      --warmup                           enable warming up the model with an empty run, which is used to occupy the (V)RAM before serving
 
 server/completion:
 
   -dev, --device                         comma-separated list of devices to use for offloading (none = don't offload)
                                          use --list-devices to see a list of available devices
-  -ngl, --gpu-layers, --n-gpu-layers N
-                                         number of layers to store in VRAM
   -sm, --split-mode SPLIT_MODE           how to split the model across multiple GPUs, one of:
                                            - none: use one GPU only
                                            - layer (default): split layers and KV across GPUs
                                            - row: split rows across GPUs, store intermediate results and KV in --main-gpu
-  -ts, --tensor-split SPLIT              fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
+  -mg, --main-gpu N                      the device to use for the model
+                                         works with '--split-mode none|row', or indicates the device to offload the projector model specified by '--mmproj' (default: 0)
       --override-kv KEY=TYPE:VALUE       advanced option to override model metadata by key. may be specified multiple times.
                                          types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
@@ -500,7 +517,7 @@ server/images:
       --image-sampling-steps, --image-sample-steps N
                                          number of sampling steps, automatically retrieve the default value according to --model, and +2 when requesting high definition generation
       --image-cfg-scale N                the scale of classifier-free guidance(CFG), automatically retrieve the default value according to --model (1.0 = disabled)
-     --image-slg-scale N                 the scale of skip-layer guidance(SLG), only for DiT model (0.0 = disabled, default: 0.0)
+     --image-slg-scale N                 the scale of skip-layer guidance(SLG), only for DiT model, automatically retrieve the default value according to --model (0.0 = disabled)
      --image-slg-skip-layer              the layers to skip when processing SLG, may be specified multiple times.
                                         (default: 7;8;9)
     --image-slg-start N                  the phase to enable SLG (default: 0.01)
     --image-slg-end N                    the phase to disable SLG (default: 0.20)
diff --git a/llama-box/param.hpp b/llama-box/param.hpp
index cfd55ad..1781476 100644
--- a/llama-box/param.hpp
+++ b/llama-box/param.hpp
@@ -256,7 +256,6 @@ static void llama_box_params_print_usage(int, char **argv, const llama_box_params &params_
     opts.push_back({ "server", " --lora-scaled FILE SCALE", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     opts.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", llm_params.lora_init_without_apply ? "enabled" : "disabled" });
     opts.push_back({ "server", "-s, --seed N", "RNG seed (default: %d, use random seed for %d)", llm_params.sampling.seed, LLAMA_DEFAULT_SEED });
-    opts.push_back({ "server", "-mg, --main-gpu N", "the GPU to use for the model (default: %d)", llm_params.main_gpu });
     opts.push_back({ "server", "-fa, --flash-attn", "enable Flash Attention (default: %s)", llm_params.flash_attn ? "enabled" : "disabled" });
     opts.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", llm_params.endpoint_metrics ? "enabled" : "disabled" });
     opts.push_back({ "server", " --infill", "enable infill endpoint (default: %s)", params_.endpoint_infill ? "enabled" : "disabled" });
@@ -265,17 +264,22 @@
     opts.push_back({ "server", " --rerank", "enable reranking endpoint (default: %s)", llm_params.reranking ? "enabled" : "disabled" });
     opts.push_back({ "server", " --slots", "enable slots monitoring endpoint (default: %s)", llm_params.endpoint_slots ? "enabled" : "disabled" });
     opts.push_back({ "server", " --rpc SERVERS", "comma separated list of RPC servers" });
+    opts.push_back({ "server", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each device, comma-separated list of proportions, e.g. 3,1\n"
+                                                            "for image models, indicates which devices to offload to" });
+    opts.push_back({ "server", "-ngl, --gpu-layers, --n-gpu-layers N", "number of layers to store in VRAM\n"
+                                                                       "'-ngl 0' means no offloading" });
     opts.push_back({ "server", " --no-warmup", "skip warming up the model with an empty run" });
+    opts.push_back({ "server", " --warmup", "enable warming up the model with an empty run, which is used to occupy the (V)RAM before serving" });
 
     // server // completion //
     opts.push_back({ "server/completion" });
     opts.push_back({ "server/completion", "-dev, --device ", "comma-separated list of devices to use for offloading (none = don't offload)\n"
                                                              "use --list-devices to see a list of available devices" });
-    opts.push_back({ "server/completion", "-ngl, --gpu-layers, --n-gpu-layers N", "number of layers to store in VRAM" });
     opts.push_back({ "server/completion", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n"
                                                                           " - none: use one GPU only\n"
                                                                           " - layer (default): split layers and KV across GPUs\n"
                                                                           " - row: split rows across GPUs, store intermediate results and KV in --main-gpu" });
-    opts.push_back({ "server/completion", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
3,1" }); + opts.push_back({ "server/completion", "-mg, --main-gpu N", "the device to use for the model\n" + "work with --split-mode none|row', or indicate the device to offload projector model specified by '--mmproj' (default: %d)", llm_params.main_gpu }); opts.push_back({ "server/completion", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); opts.push_back({ "server/completion", " --chat-template JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" @@ -664,18 +668,6 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) { - if (i == argc) { - missing("--main-gpu"); - } - char *arg = argv[i++]; - params_.llm_params.main_gpu = std::stoi(std::string(arg)); - if (params_.llm_params.main_gpu < 0 || params_.llm_params.main_gpu >= int32_t(llama_max_devices())) { - invalid("--main-gpu"); - } - continue; - } - if (!strcmp(flag, "-fa") || !strcmp(flag, "--flash-attn")) { params_.llm_params.flash_attn = true; continue; @@ -720,14 +712,25 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - // server // completion// - - if (!strcmp(flag, "-devd") || !strcmp(flag, "--device-draft")) { + if (!strcmp(flag, "-ts") || !strcmp(flag, "--tensor-split")) { if (i == argc) { - missing("--device-draft"); + missing("--tensor-split"); + } + char *arg = argv[i++]; + const std::regex regex{R"([,/]+)"}; + std::string arg_s{arg}; + std::sregex_token_iterator it{arg_s.begin(), arg_s.end(), regex, -1}; + std::vector split_arg{it, {}}; + if (split_arg.size() >= llama_max_devices()) { + invalid("--tensor-split"); + } + for (size_t j = 0; j < llama_max_devices(); ++j) { + if (j < split_arg.size()) { + params_.llm_params.tensor_split[j] = std::stof(split_arg[j]); + } else { + params_.llm_params.tensor_split[j] = 0.0f; + } } - char *arg = argv[i++]; - params_.llm_params.speculative.devices = parse_device_list(arg); continue; } @@ -740,6 +743,27 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } + if (!strcmp(flag, "--no-warmup")) { + params_.llm_params.warmup = false; + continue; + } + + if (!strcmp(flag, "--warmup")) { + params_.llm_params.warmup = true; + continue; + } + + // server // completion// + + if (!strcmp(flag, "-devd") || !strcmp(flag, "--device-draft")) { + if (i == argc) { + missing("--device-draft"); + } + char *arg = argv[i++]; + params_.llm_params.speculative.devices = parse_device_list(arg); + continue; + } + if (!strcmp(flag, "-sm") || !strcmp(flag, "--split-mode")) { if (i == argc) { missing("--split-mode"); @@ -757,24 +781,14 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params ¶ continue; } - if (!strcmp(flag, "-ts") || !strcmp(flag, "--tensor-split")) { + if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) { if (i == argc) { - missing("--tensor-split"); - } - char *arg = argv[i++]; - const std::regex regex{R"([,/]+)"}; - std::string arg_s{arg}; - std::sregex_token_iterator it{arg_s.begin(), arg_s.end(), regex, -1}; - std::vector split_arg{it, {}}; - if (split_arg.size() >= llama_max_devices()) { - invalid("--tensor-split"); + missing("--main-gpu"); } - for (size_t j = 0; j < llama_max_devices(); ++j) { - if (j < split_arg.size()) { - params_.llm_params.tensor_split[j] = 
-                } else {
-                    params_.llm_params.tensor_split[j] = 0.0f;
-                }
+            char *arg = argv[i++];
+            params_.llm_params.main_gpu = std::stoi(std::string(arg));
+            if (params_.llm_params.main_gpu < 0 || params_.llm_params.main_gpu >= int32_t(llama_max_devices())) {
+                invalid("--main-gpu");
            }
            continue;
        }
@@ -1560,11 +1574,6 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
            continue;
        }
 
-        if (!strcmp(flag, "--no-warmup")) {
-            params_.llm_params.warmup = false;
-            continue;
-        }
-
        if (!strcmp(flag, "--spm-infill")) {
            params_.llm_params.spm_infill = true;
            continue;
        }
@@ -1755,7 +1764,7 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
            }
            char *arg = argv[i++];
            params_.sd_params.sampling.guidance = std::stof(std::string(arg));
-            if (params_.sd_params.sampling.guidance < 0.0f) {
+            if (params_.sd_params.sampling.guidance < 1.0f) {
                invalid("--image-guidance");
            }
            continue;
        }
@@ -2111,9 +2120,10 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &params_)
        params_.sd_params.warmup = params_.llm_params.warmup;
        params_.sd_params.flash_attn = params_.llm_params.flash_attn;
        params_.sd_params.n_threads = params_.llm_params.cpuparams.n_threads;
-        params_.sd_params.main_gpu = params_.llm_params.main_gpu;
        params_.sd_params.lora_init_without_apply = params_.llm_params.lora_init_without_apply;
        params_.sd_params.lora_adapters = params_.llm_params.lora_adapters;
+        params_.sd_params.rpc_servers = params_.llm_params.rpc_servers;
+        params_.sd_params.tensor_split = params_.llm_params.tensor_split;
    }
 
    return true;
diff --git a/llama-box/stablediffusion.hpp b/llama-box/stablediffusion.hpp
index 9becf95..4788398 100644
--- a/llama-box/stablediffusion.hpp
+++ b/llama-box/stablediffusion.hpp
@@ -59,9 +59,10 @@ struct stablediffusion_params {
    bool warmup = true;
    bool flash_attn = false;
    int n_threads = 1;
-    int main_gpu = 0;
    bool lora_init_without_apply = false;
    std::vector lora_adapters = {};
+    std::string rpc_servers = "";
+    float *tensor_split = nullptr;
 };
 
 struct stablediffusion_sampling_stream {
@@ -157,7 +158,7 @@ int stablediffusion_context::get_default_sampling_steps() {
            return 20;
        case VERSION_SDXL: // including Turbo
        case VERSION_SDXL_REFINER:
-            return 40;
+            return 25;
        case VERSION_SD3: // including Turbo
        case VERSION_FLUX: // including Schnell
        default:
@@ -377,6 +378,7 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
    rng_type_t rng_type = CUDA_RNG;
    bool vae_decode_only = false;
    bool free_params_immediately = false;
+    bool tae_preview_only = false;
 
    sd_ctx_t *sd_ctx = new_sd_ctx(
        params.model.c_str(),
@@ -402,7 +404,9 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
        !params.control_model_offload,
        !params.vae_model_offload,
        params.flash_attn,
-        params.main_gpu);
+        tae_preview_only,
+        params.rpc_servers.c_str(),
+        params.tensor_split);
    if (sd_ctx == nullptr) {
        LOG_ERR("%s: failed to create stable diffusion context\n", __func__);
        return nullptr;
@@ -410,7 +414,11 @@ stablediffusion_context *common_sd_init_from_params(stablediffusion_params params)
    upscaler_ctx_t *upscaler_ctx = nullptr;
    if (!params.upscale_model.empty()) {
-        upscaler_ctx = new_upscaler_ctx(params.upscale_model.c_str(), params.n_threads, params.main_gpu);
+        upscaler_ctx = new_upscaler_ctx(
+            params.upscale_model.c_str(),
+            params.n_threads,
+            params.rpc_servers.c_str(),
+            params.tensor_split);
        if (upscaler_ctx == nullptr) {
            LOG_ERR("%s: failed to create upscaler context\n", __func__);
            free_sd_ctx(sd_ctx);
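
The two image-model `--tensor-split` examples in the README hunk above imply a simple selection rule: devices with a non-zero proportion are eligible, the diffusion part goes to the last eligible device, the text encoder goes to the first, and the VAE goes to the second when three or more devices are eligible (otherwise it shares the first). The snippet below is only a sketch of that inferred mapping; `component_device` is a hypothetical helper, and the authoritative assignment is whatever `new_sd_ctx` in stable-diffusion.cpp does with `rpc_servers` and `tensor_split`, which may differ.

```cpp
// A minimal sketch of the device selection implied by the README examples above.
// component_device() is a hypothetical helper for illustration only.
#include <cstdio>
#include <vector>

enum sd_component { SD_TEXT_ENCODER, SD_VAE, SD_DIFFUSION };

// Devices are indexed as llama-box lists them: RPC servers first, then GPUs.
// A non-zero tensor-split entry marks a device as eligible for offloading.
static int component_device(const std::vector<float> &tensor_split, sd_component c) {
    std::vector<int> eligible;
    for (int i = 0; i < (int) tensor_split.size(); ++i) {
        if (tensor_split[i] > 0.0f) {
            eligible.push_back(i);
        }
    }
    if (eligible.empty()) {
        return 0; // no split given: everything stays on the first device
    }
    switch (c) {
        case SD_TEXT_ENCODER: return eligible.front();
        case SD_VAE:          return eligible.size() >= 3 ? eligible[1] : eligible.front();
        case SD_DIFFUSION:    return eligible.back(); // heaviest component gets the last eligible device
    }
    return eligible.front();
}

int main() {
    // --tensor-split 1,1,1,0 -> text encoder: device 0 (RPC), VAE: device 1 (1st GPU), diffusion: device 2 (2nd GPU)
    // --tensor-split 0,0,1,1 -> text encoder and VAE: device 2 (2nd GPU), diffusion: device 3 (3rd GPU)
    for (const std::vector<float> &split : {std::vector<float>{1, 1, 1, 0}, std::vector<float>{0, 0, 1, 1}}) {
        printf("text encoder -> %d, vae -> %d, diffusion -> %d\n",
               component_device(split, SD_TEXT_ENCODER),
               component_device(split, SD_VAE),
               component_device(split, SD_DIFFUSION));
    }
    return 0;
}
```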