huggingface · vrdn-23 · Apr 12, 2025
diff --git a/README.md b/README.md
@@ -215,6 +215,11 @@ Options:
           Unused for gRPC servers
 
           [env: AUTO_TRUNCATE=]
+
+      --warmup-model
+          Force a warm-up (dummy request) before server start-up; otherwise padded models skip it
+
+          [env: WARMUP_MODEL=]
 
       --default-prompt-name <DEFAULT_PROMPT_NAME>
           The name of the prompt that should be used by default for encoding. If not set, no prompt will be applied.

diff --git a/docs/openapi.json b/docs/openapi.json
@@ -1058,6 +1058,7 @@
           "max_batch_tokens",
           "max_client_batch_size",
           "auto_truncate",
+          "warmup_model",
           "tokenization_workers",
           "version"
         ],
@@ -1129,6 +1130,9 @@
             "type": "string",
             "description": "Router Info",
             "example": "0.5.0"
+          },
+          "warmup_model": {
+            "type": "boolean"
           }
         }
       },

diff --git a/docs/source/en/cli_arguments.md b/docs/source/en/cli_arguments.md
@@ -106,6 +106,11 @@ Options:
           Unused for gRPC servers
 
           [env: AUTO_TRUNCATE=]
+
+      --warmup-model
+          Force a warm-up (dummy request) before server start-up; otherwise padded models skip it
+
+          [env: WARMUP_MODEL=]
 
       --default-prompt-name <DEFAULT_PROMPT_NAME>
           The name of the prompt that should be used by default for encoding. If not set, no prompt will be applied.

diff --git a/router/src/lib.rs b/router/src/lib.rs
@@ -52,6 +52,7 @@ pub async fn run(
     max_batch_requests: Option<usize>,
     max_client_batch_size: usize,
     auto_truncate: bool,
+    warmup_model: bool,
     default_prompt: Option<String>,
     default_prompt_name: Option<String>,
     hf_token: Option<String>,
@@ -248,7 +249,7 @@ pub async fn run(
         .await
         .context("Model backend is not healthy")?;
 
-    if !backend.padded_model {
+    if !backend.padded_model || warmup_model {
         tracing::info!("Warming up model");
         backend
             .warmup(max_input_length, max_batch_tokens, max_batch_requests)
@@ -288,6 +289,7 @@ pub async fn run(
         max_batch_requests,
         max_client_batch_size,
         auto_truncate,
+        warmup_model,
         version: env!("CARGO_PKG_VERSION"),
         sha: option_env!("VERGEN_GIT_SHA"),
         docker_label: option_env!("DOCKER_LABEL"),
@@ -510,6 +512,7 @@ pub struct Info {
     #[cfg_attr(feature = "http", schema(example = "32"))]
     pub max_client_batch_size: usize,
     pub auto_truncate: bool,
+    pub warmup_model: bool,
     #[cfg_attr(feature = "http", schema(example = "4"))]
     pub tokenization_workers: usize,
     /// Router Info

diff --git a/router/src/main.rs b/router/src/main.rs
@@ -79,6 +79,11 @@ struct Args {
     #[clap(long, env)]
     auto_truncate: bool,
 
+    /// Force a warm-up (dummy request) before server start-up; otherwise padded models skip it
+    ///
+    #[clap(long, env)]
+    warmup_model: bool,
+
     /// The name of the prompt that should be used by default for encoding. If not set, no prompt
     /// will be applied.
     ///
@@ -216,6 +221,7 @@ async fn main() -> Result<()> {
         args.max_batch_requests,
         args.max_client_batch_size,
         args.auto_truncate,
+        args.warmup_model,
         args.default_prompt,
         args.default_prompt_name,
         token,