From c25381f3416558f0e4904b8aa23b182928e9d1f6 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sat, 11 Jan 2025 01:00:49 +0800 Subject: [PATCH 1/4] Refactor async engine & turbomind IO (#2968) * refactor * async interface * update perf metrics & adaptive tokens per tick * wait-free * refactor gateway * optimize throughput * add cancel cb * simplify async engine * simplify async engine * fix end session * faster synchronization * fix async engine * refactor async engine * fix semaphore * refactor inference API * remove turbomind sync interface * fix msvc build * fix msvc build * fix msvc build * add extra outputs * skip stop tokens * exit gracefully * cancel all tasks atexit * refactor profiler * fix id2step for api server * save csv * fix interactive * fix lint * fix generate_token_len * fix async_end * update pipeline ut * fix ignore eos * minor * refactor profile pipeline api * fix stop ids * fix duplication * control output range of logits & last hidden states * fix lint & typo * fix blank response * export batch & num prompts --- .../interface/pipeline/test_pipeline_func.py | 2 +- autotest/utils/pipeline_chat.py | 4 +- benchmark/profile_pipeline_api.py | 116 +-- benchmark/profile_throughput.py | 257 +++-- lmdeploy/messages.py | 9 +- lmdeploy/profiler.py | 170 ++++ lmdeploy/serve/async_engine.py | 714 ++++++++----- lmdeploy/serve/openai/api_server.py | 19 +- lmdeploy/turbomind/chat.py | 49 +- lmdeploy/turbomind/turbomind.py | 615 +++++------- src/turbomind/CMakeLists.txt | 1 + src/turbomind/engine/CMakeLists.txt | 7 + src/turbomind/engine/gateway.cc | 40 + src/turbomind/engine/gateway.h | 61 ++ src/turbomind/engine/model_request.cc | 174 ++++ src/turbomind/engine/model_request.h | 59 ++ src/turbomind/engine/request.h | 148 +++ src/turbomind/engine/request_queue.cc | 93 ++ src/turbomind/engine/request_queue.h | 46 + src/turbomind/engine/signal_buffer.h | 61 ++ src/turbomind/kernels/gpt_kernels.cu | 57 ++ src/turbomind/kernels/gpt_kernels.h | 15 + .../kernels/sampling_penalty_kernels.cu | 77 ++ .../kernels/sampling_penalty_kernels.h | 8 + .../kernels/sampling_topp_kernels.cu | 9 +- .../sampling_layers/LogitsProcessorLayer.cc | 2 +- src/turbomind/models/llama/CMakeLists.txt | 1 + src/turbomind/models/llama/LlamaBatch.cc | 945 ++++++++---------- src/turbomind/models/llama/LlamaBatch.h | 67 +- src/turbomind/models/llama/LlamaV2.cc | 23 +- src/turbomind/models/llama/LlamaV2.h | 5 - src/turbomind/models/llama/Request.h | 115 --- src/turbomind/python/bind.cpp | 266 +++-- .../triton_backend/llama/CMakeLists.txt | 1 - .../triton_backend/llama/LlamaTritonModel.cc | 95 +- .../triton_backend/llama/LlamaTritonModel.h | 33 +- .../llama/LlamaTritonModelInstance.cc | 216 ---- .../llama/LlamaTritonModelInstance.h | 80 -- .../transformer_triton_backend.hpp | 14 +- src/turbomind/utils/Tensor.h | 25 + src/turbomind/utils/cuda_utils.h | 3 +- 41 files changed, 2683 insertions(+), 2019 deletions(-) create mode 100644 lmdeploy/profiler.py create mode 100644 src/turbomind/engine/CMakeLists.txt create mode 100644 src/turbomind/engine/gateway.cc create mode 100644 src/turbomind/engine/gateway.h create mode 100644 src/turbomind/engine/model_request.cc create mode 100644 src/turbomind/engine/model_request.h create mode 100644 src/turbomind/engine/request.h create mode 100644 src/turbomind/engine/request_queue.cc create mode 100644 src/turbomind/engine/request_queue.h create mode 100644 src/turbomind/engine/signal_buffer.h delete mode 100644 src/turbomind/models/llama/Request.h delete mode 100644 
src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc delete mode 100644 src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 87a0719bcb..0696684890 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -408,7 +408,7 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): result &= response[i].finish_reason == 'length' - result &= response[i].session_id == i + result &= response[i].index == i save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 8f03e4e406..5dcb358319 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -235,7 +235,7 @@ def assert_pipeline_single_stream_return(output, logprobs_num: int = 0): def assert_pipeline_batch_stream_return(output, size: int = 1): for i in range(size): - output_list = [item for item in output if item.session_id == i] + output_list = [item for item in output if item.index == i] result, msg = assert_pipeline_single_stream_return(output_list) if not result: return result, msg @@ -249,7 +249,7 @@ def assert_pipeline_single_element(output, result = True result &= output.generate_token_len > 0 result &= output.input_token_len > 0 - result &= output.session_id >= 0 + result &= output.index >= 0 if is_last: result &= len(output.text) >= 0 result &= output.finish_reason in ['stop', 'length'] diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 764f78399c..334be7fa14 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -1,11 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import csv import json import os import random -import time -from collections import OrderedDict from typing import List, Tuple from tqdm import tqdm @@ -14,6 +11,10 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter +from lmdeploy.profiler import Profiler, Session +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') def sample_requests(dataset_path: str, num_requests: int, @@ -66,91 +67,70 @@ def __init__(self, model_path: str, engine_config, csv: str): self.csv = csv - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): + def process_request(self, requests, profiler: Profiler, temperature, top_p, + top_k, stream_output): - stats = OrderedDict( - (session_id, None) for session_id in range(len(requests))) prompts = [prompt for prompt, _, _ in requests] gen_configs = [ GenerationConfig(temperature=temperature, top_p=top_p, top_k=top_k, ignore_eos=True, + do_sample=True, max_new_tokens=output_len) for _, _, output_len in requests ] - start = time.perf_counter() + sess: List[Session] = [] + for _, input_len, output_len in requests: + sess.append(profiler.new_session(input_len, output_len)) + + def _to_status(finish_reason): + if finish_reason == 'length': + return Session.SUCCESS + else: + return Session.FAIL + + profiler.start() + + for s in sess: + s.tick(0) + if stream_output: pbar = tqdm(total=len(requests)) for output in self.pipe.stream_infer(prompts, gen_configs, do_preprocess=False): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) + sess[index].tick(n_token) if finish_reason is not None: + sess[index].finish(_to_status(finish_reason)) pbar.update(1) + pbar.close() else: for output in self.pipe(prompts, gen_configs, do_preprocess=False, use_tqdm=True): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) - - elapsed_time = time.perf_counter() - start - - completion_tokens = 0 - for session_id, (n_token, finish_reason) in stats.items(): - assert finish_reason == 'length', \ - f'unexpected finish_reason of session_id={session_id}, ' \ - f'prompt={requests[session_id][0]}' - assert n_token - 1 <= requests[session_id][-1] <= n_token, \ - f'request to generate {requests[session_id][-1]} tokens, ' \ - f'but got {n_token} tokens' - completion_tokens += n_token - - prompt_tokens = 0 - for _, input_len, _ in requests: - prompt_tokens += input_len - - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = (prompt_tokens + - completion_tokens) / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - - print( - f'number of prompts: {len(requests)}\n' - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') 
as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', - 'throughput(out tok/s)', 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + sess[index].tick(n_token) + sess[index].finish(_to_status(finish_reason)) + + profiler.finish() + + # report first failure + for i, s in enumerate(sess): + if s.status != Session.SUCCESS or s.ns[-1] < s.req_output_len: + logger.error( + f'Request {i} failed with {s.ns[-1]}/{s.req_output_len} tokens generated' # noqa: E501 + ) + logger.error(f'Prompt: {prompts[i]}') + logger.warning('Got failed requests, metrics may be invalid') + break def parse_args(): @@ -252,13 +232,25 @@ def main(): requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + profiler = Profiler(args.stream_output, [50, 75, 95, 99]) + engine.process_request(requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, - concurrency=args.concurrency, stream_output=args.stream_output) + hyperparams = [('Concurrency', args.concurrency), + ('Stream output', str(args.stream_output).lower())] + + profiler.compute_metrics() + profiler.summarize(title='Profile Pipeline API', hyperparams=hyperparams) + + if args.csv: + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) + if __name__ == '__main__': main() diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 291b1be9b8..2e4d2a3b8c 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -1,20 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import asyncio -import csv import json import os import random -import time from queue import Queue from typing import List, Tuple, Union -import numpy as np from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) +from lmdeploy.profiler import Profiler, Session from lmdeploy.pytorch.engine import EngineInstance from lmdeploy.tokenizer import DetokenizeState, Tokenizer from lmdeploy.utils import get_logger @@ -71,7 +69,7 @@ class Engine: def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, - TurbomindEngineConfig], csv: str): + TurbomindEngineConfig]): if isinstance(engine_config, TurbomindEngineConfig): from lmdeploy.turbomind import TurboMind tm_model = TurboMind.from_pretrained(model_path, @@ -83,166 +81,104 @@ def __init__(self, model_path: str, self.tm_model = tm_model self.tokenizer = tm_model.tokenizer - self.csv = csv self.pbar = None - async def _inference(self, req_queue: Queue, res_queue: Queue, - session_id: int, temperature: float, top_p: float, - top_k: int, stream_output: bool): + async def _inference(self, req_queue: Queue, session_id: int, + temperature: float, top_p: float, top_k: int, + stream_output: bool, skip_tokenize: bool, + skip_detokenize: bool): model_inst = self.tm_model.create_instance() - stats = [] - # get each generated token's latency - per_token_latency_stats = [] - for prompt, input_seqlen, output_seqlen in iter( - req_queue.get_nowait, [None, None, None]): - _per_token_latency_stats = [0] * (output_seqlen + 1) - prev = time.perf_counter() - n_prev_token = 0 - - input_ids = self.tokenizer(prompt).input_ids + sess: Session = None + for prompt, _, output_seqlen, 
cancel_after, sess in iter( + req_queue.get_nowait, None): + + sess.tick(0) + + if skip_tokenize: + input_ids = prompt + else: + input_ids = self.tokenizer(prompt).input_ids + state = DetokenizeState(len(input_ids)) - async for outputs in model_inst.async_stream_infer( - session_id, - input_ids=input_ids, - gen_config=GenerationConfig(max_new_tokens=output_seqlen, - temperature=temperature, - top_p=top_p, - top_k=top_k, - ignore_eos=True), - sequence_start=True, - sequence_end=True, - stream_output=stream_output): - res, n_token = input_ids + outputs.token_ids, outputs.num_token - _, state = self.tokenizer.detokenize_incrementally(res, state) - now = time.perf_counter() - if n_prev_token != n_token: - _per_token_latency_stats[n_prev_token] = np.round( - now - prev, 3) - n_prev_token = n_token - prev = now + prev_len = 0 + token_ids = input_ids.copy() + + generator = model_inst.async_stream_infer( + session_id, + input_ids=input_ids, + gen_config=GenerationConfig(max_new_tokens=output_seqlen, + temperature=temperature, + top_p=top_p, + top_k=top_k, + ignore_eos=True), + sequence_start=True, + sequence_end=True, + stream_output=stream_output) + try: + async for outputs in generator: + n_token = outputs.num_token + if n_token > prev_len: + token_ids += outputs.token_ids[prev_len - n_token:] + if not skip_detokenize: + _, state = self.tokenizer.detokenize_incrementally( + token_ids, state) + sess.tick(n_token) + prev_len = n_token + if n_token > cancel_after: + break + sess.finish(Session.SUCCESS) + finally: + await generator.aclose() + # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): await model_inst.async_end(session_id) - assert output_seqlen <= n_token <= output_seqlen + 1, \ - f'Error. session_id({session_id}) request {output_seqlen} ' \ - f'tokens, but generate {n_token} tokens.\n' \ - f'prompt: {prompt}' - - first_token_latency = _per_token_latency_stats[0] - completion_tokens = n_token - total_tokens = n_token + input_seqlen - stats.append([ - first_token_latency, completion_tokens, output_seqlen, - total_tokens - ]) - # skip the first token latency - per_token_latency_stats.append(_per_token_latency_stats[1:]) + self.pbar.update(1) - res_queue.put_nowait((session_id, stats, per_token_latency_stats)) - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): - res_queue = Queue() + def process_request(self, requests, profiler: Profiler, concurrency, + temperature, top_p, top_k, stream_output, + skip_tokenize, skip_detokenize, cancel_rate): req_queue = Queue() - self.pbar = tqdm(total=len(requests)) - # feed request to q - for req in requests: + for prompt, input_len, output_len in requests: + cancel_after = output_len + 1 + if cancel_rate > 0: + if random.random() < cancel_rate: + cancel_after = random.randint(0, cancel_after) + sess = profiler.new_session(input_len, output_len) + req = [prompt, input_len, output_len, cancel_after, sess] + if skip_tokenize: + req[0] = self.tokenizer.encode(prompt) req_queue.put(req) for i in range(concurrency): - req_queue.put([None, None, None]) - - start = time.time() - - event_loop = asyncio.new_event_loop() - asyncio.set_event_loop(event_loop) + req_queue.put(None) # start threads tasks = [] for i in range(concurrency): - task = self._inference(req_queue, res_queue, i, temperature, top_p, - top_k, stream_output) + task = self._inference(req_queue, i, temperature, top_p, top_k, + stream_output, skip_tokenize, + skip_detokenize) tasks.append(task) async def _gather_tasks(tasks): 
return await asyncio.gather(*tasks) - event_loop.run_until_complete(_gather_tasks(tasks)) - - elapsed_time = time.time() - start - - stats = [] - per_token_latency_stats = [] - while not res_queue.empty(): - session_id, _stats, _per_token_latency_stats = res_queue.get() - stats.append(np.array(_stats)) - per_token_latency_stats += [ - item for sublist in _per_token_latency_stats - for item in sublist - ] - stats = np.concatenate(stats).reshape(-1, 4) - - first_token_latency_min = np.min(stats[:, 0], axis=0) - first_token_latency_max = np.max(stats[:, 0], axis=0) - first_token_latency_ave = np.mean(stats[:, 0], axis=0) - completion_tokens = np.sum(stats[:, 1], axis=0) - total_tokens = np.sum(stats[:, 3], axis=0) - prompt_tokens = total_tokens - completion_tokens - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = total_tokens / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - per_token_latency_stats.sort() - percentiles = [ - np.round( - per_token_latency_stats[int(percent * - len(per_token_latency_stats))], 3) - for percent in [0.5, 0.75, 0.95, 0.99] - ] - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - if stream_output: - print(f'first token latency(s)(min, max, ave): ' - f'{first_token_latency_min:.3f}, ' - f'{first_token_latency_max:.3f}, ' - f'{first_token_latency_ave:.3f}') - print(f'per-token latency(s) percentile(50, 75, 95, 99): ' - f'{percentiles}\n') - print( - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', 'FTL(ave)(s)', - 'FTL(min)(s)', 'FTL(max)(s)', '50%(s)', '75%(s)', '95%(s)', - '99%(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{first_token_latency_ave:.3f}' if stream_output else '-', - f'{first_token_latency_min:.3f}' if stream_output else '-', - f'{first_token_latency_max:.3f}' if stream_output else '-', - f'{percentiles[0]:.3f}' if stream_output else '-', - f'{percentiles[1]:.3f}' if stream_output else '-', - f'{percentiles[2]:.3f}' if stream_output else '-', - f'{percentiles[3]:.3f}' if stream_output else '-', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + self.pbar = tqdm(total=len(requests)) + + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + + profiler.start() + + asyncio.run(_gather_tasks(tasks)) + + profiler.finish() + + self.pbar.close() def parse_args(): @@ -266,6 +202,20 @@ def parse_args(): type=int, help='Number of prompts to process', default=5000) + parser.add_argument('--no-stream-output', + action='store_true', + help='Use stream output') + parser.add_argument('--skip-tokenize', + action='store_true', + help='Pre-tokenize input prompts before starting') + parser.add_argument('--skip-detokenize', + action='store_true', + help='Skip detokenizing output tokens') + parser.add_argument('--cancel-rate', + type=float, + help='Possibility of a request being canceled', + default=0) + 
parser.add_argument('--use-uvloop', action='store_true') parser.add_argument('--csv', type=str, help='Where to save the result.', @@ -340,19 +290,42 @@ def main(): dtype=args.dtype, ) - engine = Engine(args.model_path, engine_config, csv=args.csv) + if args.use_uvloop: + import uvloop + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + + engine = Engine(args.model_path, engine_config) requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + stream_output = not args.no_stream_output + + profiler = Profiler(stream_output, [50, 75, 95, 99]) + engine.process_request( requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, concurrency=args.concurrency if args.concurrency < args.num_prompts else args.num_prompts, - stream_output=True) + stream_output=not args.no_stream_output, + skip_tokenize=args.skip_tokenize, + skip_detokenize=args.skip_detokenize, + cancel_rate=args.cancel_rate) + + hyperparams = [('Concurrency', args.concurrency), + ('Cancel rate', args.cancel_rate), + ('Stream output', str(stream_output).lower()), + ('Skip tokenize', str(args.skip_tokenize).lower()), + ('Skip detokenize', str(args.skip_detokenize).lower())] + profiler.compute_metrics() + profiler.summarize(title='Profile Throughput', hyperparams=hyperparams) + if args.csv: + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) if __name__ == '__main__': diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 2336d10752..d4e6571b79 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -97,6 +97,8 @@ class GenerationConfig: logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None + output_logits: Literal['all', 'generation'] = None + output_last_hidden_state: Literal['all', 'generation'] = None def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): """convert stop_words/bad_sords to ids and append the ids to @@ -124,7 +126,7 @@ def __post_init__(self): """Check input validation.""" assert type( self.n) == int and self.n > 0, 'n is not a positive integer' - assert self.top_p > 0 and self.top_p <= 1 # (0, 1] + assert self.top_p >= 0 and self.top_p <= 1 # [0, 1] assert self.top_k >= 0, 'top_k can not be a negative integer' assert self.temperature >= 0 and self.temperature <= 2 # [0,2] assert 0 <= self.min_p <= 1, \ @@ -338,10 +340,11 @@ class Response: text: str generate_token_len: int input_token_len: int - session_id: int finish_reason: Optional[Literal['stop', 'length']] = None token_ids: List[int] = field(default_factory=list) logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None index: int = 0 @@ -361,6 +364,8 @@ class EngineOutput: token_ids: List[int] num_token: int logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None @dataclass diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py new file mode 100644 index 0000000000..c1bf6b3875 --- /dev/null +++ b/lmdeploy/profiler.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import csv +import time +from typing import List + +import numpy as np + + +class Session: + + UNKNOWN = 0 + SUCCESS = 1 + FAIL = 2 + + def __init__(self, input_len, req_output_len): + self.ts = [] + self.ns = [] + self.input_len = input_len + self.req_output_len = req_output_len + self.status = Session.UNKNOWN + + def tick(self, n_token): + self.ts.append(time.perf_counter()) + self.ns.append(n_token) + + def finish(self, status): + self.status = status + + +class Profiler: + + def __init__(self, stream_output: bool, percentages: List[int]): + self.sessions: List[Session] = [] + self.stream_output = stream_output + self.percentages = percentages + + def new_session(self, *args, **kwargs): + sess = Session(*args, **kwargs) + self.sessions.append(sess) + return sess + + def start(self): + self.t_start = time.perf_counter() + + def finish(self): + self.elapsed_time = time.perf_counter() - self.t_start + + def compute_metrics(self): + self.ttfts: List[float] = [] + self.tpots: List[float] = [] + self.e2es: List[float] = [] + self.itls: List[float] = [] + self.tpts: List[int] = [] + self.total_output = 0 + self.total_input = 0 + self.success = 0 + + for sess in self.sessions: + if sess.status != Session.SUCCESS: + continue + ns = sess.ns + ts = sess.ts + if ns[-1] < sess.req_output_len: + continue + self.success += 1 + self.total_output += ns[-1] + self.total_input += sess.input_len + self.e2es.append(ts[-1] - ts[0]) + self.ttfts.append(ts[1] - ts[0]) + if ns[-1] > ns[1]: + self.tpots.append((ts[-1] - ts[1]) / (ns[-1] - ns[1])) + else: # no-stream-output + self.tpots.append((ts[-1] - ts[0]) / (ns[-1] - ns[0])) + t_dif = np.subtract(ts[1:], ts[:-1]) + n_dif = np.subtract(ns[1:], ns[:-1]) + self.itls.extend(t_dif[1:]) + self.tpts.extend(n_dif) + + self.output_throughput = self.total_output / self.elapsed_time + self.input_throughput = self.total_input / self.elapsed_time + + qs = self.percentages + + self.e2es = self.e2es or [float('inf')] + self.tpots = self.tpots or [float('inf')] + self.ttfts = self.ttfts or [float('inf')] + self.itls = self.itls or [float('inf')] + self.tpts = self.tpts or [0] + + self.tpot_mean = np.mean(self.tpots) + self.tpot_stat = tuple(np.percentile(self.tpots, qs)) + self.e2e_mean = np.mean(self.e2es) + self.e2e_stat = tuple(np.percentile(self.e2es, qs)) + + if self.stream_output: + self.ttft_mean = np.mean(self.ttfts) + self.ttft_stat = tuple(np.percentile(self.ttfts, qs)) + self.itls_mean = np.mean(self.itls) + self.itls_stat = tuple(np.percentile(self.itls, qs)) + self.tpts_mean = np.mean(self.tpts) + self.tpts_stat = tuple(np.percentile(self.tpts, qs).astype(int)) + + self.rps = self.success / self.elapsed_time + + def summarize(self, + title: str, + hyperparams: List = None, + header=40, + digits=10): + + width = header + digits * (1 + len(self.percentages)) + + def tab_row(name, *items): + + def fmt(x): + return '{:>{d}.3f}'.format(x, d=digits) if isinstance( + x, float) else '{:>{d}}'.format(x, d=digits) + + print('{:<{p}}{}'.format(name, + ''.join([fmt(x) for x in items]), + p=header)) + + print('\n{s:{c}^{n}}'.format(s=f' {title} ', n=width, c='=')) + tab_row('Benchmark duration', self.elapsed_time) + tab_row('Total requests', len(self.sessions)) + tab_row('Successful requests', self.success) + if hyperparams: + for k, v in hyperparams: + tab_row(k, v) + tab_row('Total input tokens', self.total_input) + tab_row('Total generated tokens', self.total_output) + tab_row('Input throughput (tok/s)', self.input_throughput) + tab_row('Output throughput (tok/s)', 
self.output_throughput) + tab_row('Request throughput (req/s)', self.rps) + print('-' * width) + tab_row('', 'mean', *(f'P{q}' for q in self.percentages)) + tab_row('End-to-end Latency', self.e2e_mean, *self.e2e_stat) + if self.stream_output: + tab_row('Time to First Token (TTFT)', self.ttft_mean, + *self.ttft_stat) + tab_row('Time per Output Token (TPOT)', self.tpot_mean, + *self.tpot_stat) + if self.stream_output: + tab_row('Inter-token Latency (ITL)', self.itls_mean, + *self.itls_stat) + tab_row('Tokens per Tick', self.tpts_mean, *self.tpts_stat) + print('=' * width) + + def save_csv(self, csv_file: str, hyperparams): + """Export legacy metrics to CSV.""" + with open(csv_file, 'w') as csvfile: + writer = csv.writer(csvfile) + keys, vals = zip(*hyperparams) + writer.writerow([ + *keys, + 'RPS', + 'RPM', + 'FTL(ave)(s)', + 'throughput(out tok/s)', + 'throughput(total tok/s)', + ]) + ttft_mean = f'{self.ttft_mean:.3f}' if self.stream_output else '-' + writer.writerow([ + *vals, + f'{self.rps:.3f}', + f'{(self.rps * 60):.3f}', + ttft_mean, + f'{self.output_throughput:.3f}', + f'{(self.input_throughput + self.output_throughput):.3f}', + ]) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index dfcf01a69d..d7366c654b 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -1,22 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import atexit +import concurrent.futures import dataclasses import json import os import random import re -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, closing from copy import deepcopy +from functools import partial from itertools import count -from queue import Empty, Queue +from queue import Queue from threading import Thread -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import (Any, AsyncIterator, Dict, Iterator, List, Literal, + Optional, Tuple, Union) + +import tqdm from lmdeploy.logger import RequestLogger from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig) from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model -from lmdeploy.serve.utils import LogitsMixin, _get_event_loop +from lmdeploy.serve.utils import LogitsMixin from lmdeploy.tokenizer import DetokenizeState from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_logger @@ -50,6 +56,37 @@ class GenOut: finish_reason: Optional[Literal['stop', 'length', 'error']] = None token_ids: List[int] = None logprobs: List[Dict[int, float]] = None + logits: Any = None + last_hidden_state: Any = None + + +def _gen_out_to_response(out: GenOut, index) -> Response: + return Response(text=out.response, + generate_token_len=out.generate_token_len, + input_token_len=out.input_token_len, + finish_reason=out.finish_reason, + token_ids=out.token_ids, + logprobs=out.logprobs, + last_hidden_state=out.last_hidden_state, + logits=out.logits, + index=index) + + +def _append_response(dst: Response, src: Response): + """dst += src.""" + if not dst: + return src + dst.text += src.text + dst.generate_token_len = src.generate_token_len + dst.input_token_len = src.input_token_len + dst.finish_reason = src.finish_reason + dst.index = src.index + if src.token_ids: + dst.token_ids += src.token_ids + if src.logprobs: + dst.logprobs = dst.logprobs or [] + dst.logprobs += src.logprobs + return dst class Session: @@ -63,14 +100,17 @@ class Session: _engine (Any): engine for internal use. 
history (List[Any, str]): chat history. """ - _ids = count(0) - def __init__(self): - self._id: int = next(self._ids) + def __init__(self, + session_id: int, + engine: Any, + gen_config: GenerationConfig = None): + self._id: int = session_id + self._engine = engine self._step: int = 0 self._prompt: Any = None self._response: Response = None - self._engine: Any = None + self._gen_config = gen_config self.history: List[Tuple[Any, str]] = [] def _merge_response(self, resp: Response, step: Union[Response, GenOut]): @@ -89,8 +129,8 @@ def response(self) -> Response: def close(self): """release engine storage for this session.""" if self._engine: - inst = self._engine.create_instance() - inst.end(self._id) + self._engine._run(coro=self._engine.end_session(self._id)).result() + self._engine = None def __repr__(self) -> str: res = '' @@ -100,6 +140,89 @@ def __repr__(self) -> str: res += f'USER:\n{user}\nASSISTANT:\n{assistant}\n' return res + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __call__( + self, + prompt: str, + gen_config: Optional[GenerationConfig] = None, + stream_response: bool = True, + do_preprocess: bool = True) -> Union[Response, Iterator[Response]]: + self._engine.chat(prompt=prompt, + gen_config=gen_config or self._gen_config, + stream_response=stream_response, + do_preprocess=do_preprocess, + session=self) + if stream_response: + return self.generator + else: + return self.response + + +class _EventLoopThread: + + def __init__(self, daemon=False): + fut = concurrent.futures.Future() + self.thread = Thread(target=partial(self._thread_entry, fut), + daemon=daemon) + self.thread.start() + self.loop: asyncio.AbstractEventLoop = fut.result() + self.closed = False + if daemon: + atexit.register(self.close) + + def _thread_entry(self, fut): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + fut.set_result(loop) + try: + loop.run_forever() + except BaseException as e: + logger.error(f'[internal_thread] {type(e).__name__} {e}') + finally: + try: + self._cancel_all_tasks() + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) + loop.close() + + def _cancel_all_tasks(self): + """Modified from asyncio/runners.py.""" + to_cancel = asyncio.all_tasks(self.loop) + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + async def _gather(): + await asyncio.gather(*to_cancel, return_exceptions=True) + + self.loop.run_until_complete(_gather()) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + self.loop.call_exception_handler({ + 'message': + 'unhandled exception during worker thread shutdown', + 'exception': task.exception(), + 'task': task, + }) + + def close(self): + if self.closed: + return + self.closed = True + self.loop.call_soon_threadsafe(self.loop.stop) + self.thread.join() + class AsyncEngine(LogitsMixin): """Async inference engine. Maintaining a bunch of tm_model instances. 
@@ -179,13 +302,26 @@ def __init__(self, self.instance_num = self.backend_config.max_batch_size self.tokenizer = self.engine.tokenizer self.id2step = {} - self.id2generator = {} - self.running_session_ids = set() - self.gens_set = set() - for i in range(self.instance_num): - self.gens_set.add(self.engine.create_instance()) + self.id2inst = {} + self.free_insts: asyncio.Queue = None + self.instances = [ + self.engine.create_instance() for _ in range(self.instance_num) + ] self._session_id = count(0) self.request_logger = RequestLogger(max_log_len) + self.internal_thread = _EventLoopThread(daemon=True) + self.limiter: asyncio.Semaphore = None + + def close(self): + self.internal_thread.close() + + def _get_free_insts(self): + if self.free_insts is None: + # `asyncio.Queue` must be created in an async context + self.free_insts = asyncio.Queue() + for inst in self.instances: + self.free_insts.put_nowait(inst) + return self.free_insts def _build_turbomind( self, @@ -246,45 +382,117 @@ def __call__(self, async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_cancel(session_id) - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) + generator = self.id2inst.get(session_id) + if generator: + await generator.async_cancel(session_id) + # else it's not running at all async def end_session(self, session_id: int): - """Clear a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_end(session_id) - self.id2step[str(session_id)] = 0 - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) - - @asynccontextmanager - async def safe_run(self, session_id: Optional[int] = None): - """A context manager to make sure server's safe running.""" + """For ending a session that is not running.""" + inst = self.id2inst.get(session_id) + if inst: + await inst._active.wait() + assert session_id not in self.id2inst + inst = await self._get_free_insts().get() try: - yield + await inst.async_end(session_id) + self.id2step[session_id] = 0 except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - # TODO: find out why await would block the coroutine here - _get_event_loop().create_task(self.stop_session(session_id)) - raise e - if str(session_id) in self.id2generator: - self.gens_set.add(self.id2generator[str(session_id)]) - self.running_session_ids.discard(session_id) - - async def get_generator(self, stop: bool, session_id: int): - """Only return the model instance if it is available.""" - if stop: - return self.engine.create_instance() - # waiting no generator is available or the same session_id is running - while self.gens_set == set() or session_id in self.running_session_ids: - await asyncio.sleep(0.1) - generator = self.gens_set.pop() - self.id2generator[str(session_id)] = generator - self.running_session_ids.add(session_id) - return generator + logger.error(f'[end_session] exception caught: {e}') + finally: + self._get_free_insts().put_nowait(inst) + + def _get_limiter(self): + if not self.limiter: + self.limiter = asyncio.Semaphore(self.instance_num) + return self.limiter + + async def _async_infer(self, requests: AsyncIterator[Dict], + **kwargs) -> AsyncIterator[AsyncIterator[Response]]: + async for req in requests: + gen = self.generate(**req, **kwargs) + yield gen + + def _infer(self, + requests: Iterator[Dict], + 
multiplex: bool, + pbar=None, + loop=None) -> Iterator[Iterator[Response]]: + + async def _sync_resp(g, que: Queue, idx: int, sem: asyncio.Semaphore): + async for out in g: + que.put(_gen_out_to_response(out, idx)) + sem.release() + if not multiplex: + que.put(None) # sentinel of inner generator + if pbar: + pbar.update(1) + + que = Queue() + + async def _infer(): + sem = self._get_limiter() + tasks = [] + for idx, req in enumerate(requests): + await sem.acquire() + gen = self.generate(**req) + dst = que if multiplex else Queue() + if not multiplex: + que.put(iter(dst.get, None)) + # create a task to send the responses + task = asyncio.create_task(_sync_resp(gen, dst, idx, sem)) + tasks.append(task) + if not multiplex: # sentinel of outer generator + que.put(None) + await asyncio.gather(*tasks) + if multiplex: + que.put(None) # sentinel of inner generator + + loop = loop or self.internal_thread.loop + # submit the coroutine to async world + asyncio.run_coroutine_threadsafe( + _infer(), loop).add_done_callback(lambda x: x.result()) + + return iter(que.get, None) + + @staticmethod + def _is_single(prompts): + return isinstance(prompts, str) or isinstance(prompts[0], Dict) + + def infer(self, + prompts: Union[List[str], str, List[Dict], List[List[Dict]]], + gen_config: Optional[Union[GenerationConfig, + List[GenerationConfig]]] = None, + do_preprocess: bool = True, + adapter_name: Optional[str] = None, + stream_response: bool = False, + multiplex: bool = False, + pbar: Optional[tqdm.tqdm] = None, + **kwargs): + + prompts = [prompts] if AsyncEngine._is_single(prompts) else prompts + assert isinstance(prompts, List), 'prompts should be a list' + gen_config = gen_config or GenerationConfig() + if not isinstance(gen_config, List): + gen_config = [gen_config] * len(prompts) + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa + + def requests(): + for prompt, gen_cfg in zip(prompts, gen_config): + r = dict(messages=prompt, + gen_config=gen_cfg, + do_preprocess=do_preprocess, + adapter_name=adapter_name, + stream_response=stream_response, + **kwargs) + r.setdefault('sequence_start', True) + r.setdefault('sequence_end', True) + if 'session_id' not in r: + r['session_id'] = next(self._session_id) + yield r + + return self._infer(requests(), multiplex, pbar) def batch_infer(self, prompts: Union[List[str], str, List[Dict], @@ -310,59 +518,26 @@ def batch_infer(self, Pick one from adapters. Default to None, using the base model. use_tqdm (bool): Whether use the progress bar. 
Default to False """ - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - prompt_num = len(prompts) - session_ids = [next(self._session_id) for _ in range(prompt_num)] - outputs = [ - Response('', 0, 0, session_ids[i], index=i) - for i in range(prompt_num) - ] - generators = [] - if use_tqdm: - import tqdm - pbar = tqdm.tqdm(total=len(prompts)) - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs[i].text += out.response - outputs[i].generate_token_len = out.generate_token_len - outputs[i].input_token_len = out.input_token_len - outputs[i].finish_reason = out.finish_reason - if out.token_ids: - outputs[i].token_ids.extend(out.token_ids) - if out.logprobs: - if outputs[i].logprobs is None: - outputs[i].logprobs = [] - outputs[i].logprobs.extend(out.logprobs) - if use_tqdm and out.finish_reason is not None: - pbar.update(1) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - - _get_event_loop().run_until_complete(gather()) - outputs = outputs[0] if need_list_wrap else outputs + is_single = AsyncEngine._is_single(prompts) + outputs = [] + pbar = tqdm.tqdm( + total=1 if is_single else len(prompts)) if use_tqdm else None + try: + for g in self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response=False, + pbar=pbar, + **kwargs): + res = None + for out in g: + res = _append_response(res, out) + outputs.append(res) + finally: + if pbar: pbar.close() # noqa + if is_single: + return outputs[0] return outputs def stream_infer( @@ -372,6 +547,7 @@ def stream_infer( List[GenerationConfig]]] = None, do_preprocess: bool = True, adapter_name: Optional[str] = None, + stream_response: bool = True, **kwargs): """Inference a batch of prompts with stream mode. @@ -387,62 +563,13 @@ def stream_infer( adapter_name (str): the adapter name of slora for pytorch backend. Pick one from adapters. Default to None, using the base model. 
""" - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - session_ids = [next(self._session_id) for _ in range(len(prompts))] - outputs = Queue() - generators = [] - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs.put( - Response(out.response, - out.generate_token_len, - out.input_token_len, - session_ids[i], - out.finish_reason, - out.token_ids, - out.logprobs, - index=i)) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - outputs.put(None) - - loop = _get_event_loop() - proc = Thread(target=lambda: loop.run_until_complete(gather())) - proc.start() - - while True: - try: - out = outputs.get(timeout=0.001) - if out is None: - break - yield out - except Empty: - pass - - proc.join() + return self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response, + multiplex=True, + **kwargs) async def _get_prompt_input(self, prompt: str, @@ -466,6 +593,34 @@ async def _get_prompt_input(self, input_ids = self.tokenizer.encode(prompt, add_bos=sequence_start) return {'prompt': prompt, 'input_ids': input_ids} + @asynccontextmanager + async def model_inst(self, session_id: int): + """A context manager to make sure server's safe running.""" + assert session_id not in self.id2inst + free_insts = self._get_free_insts() + inst = await free_insts.get() + inst._active = asyncio.Event() + self.id2inst[session_id] = inst + try: + yield inst + finally: + self.id2inst.pop(session_id) + inst._active.set() + free_insts.put_nowait(inst) + + @asynccontextmanager + async def safe_run(self, inst, session_id, **kwargs): + generator = inst.async_stream_infer(session_id, **kwargs) + try: + yield generator + except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa + logger.error( + f'[safe_run] exception caught: {type(e).__name__} {e}') + # TODO: remove session_id from async cancel + await inst.async_cancel(session_id) + finally: + await generator.aclose() + async def generate( self, messages, @@ -478,6 +633,8 @@ async def generate( step: int = 0, do_preprocess: bool = True, adapter_name: Optional[str] = None, + skip_stop_tokens: bool = True, + rewind_stop_tokens: bool = False, **kwargs): """Generate responses. @@ -493,10 +650,10 @@ async def generate( do_preprocess (bool): whether pre-process the messages. Default to True, which means chat_template will be applied. 
""" - if str(session_id) not in self.id2step: - self.id2step[str(session_id)] = 0 + if session_id not in self.id2step: + self.id2step[session_id] = 0 if step != 0: - self.id2step[str(session_id)] = step + self.id2step[session_id] = step if gen_config is None: gen_config = GenerationConfig() else: @@ -539,7 +696,7 @@ async def generate( gen_config=gen_config, adapter_name=adapter_name) logger.info(f'session_id={session_id}, ' - f'history_tokens={self.id2step[str(session_id)]}, ' + f'history_tokens={self.id2step[session_id]}, ' f'input_tokens={len(input_ids)}, ' f'max_new_tokens={gen_config.max_new_tokens}, ' f'seq_start={sequence_start}, seq_end={sequence_end}, ' @@ -548,94 +705,132 @@ async def generate( if gen_config.max_new_tokens is None: # for interactive endpoint, will try maximum possible token num gen_config.max_new_tokens = max( - 128, self.session_len - self.id2step[str(session_id)] - - len(input_ids)) - elif self.id2step[str(session_id)] + len( + 128, + self.session_len - self.id2step[session_id] - len(input_ids)) + elif self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: gen_config.max_new_tokens = max( - self.session_len - self.id2step[str(session_id)] - - len(input_ids), 128) + self.session_len - self.id2step[session_id] - len(input_ids), + 128) logger.error( f'Truncate max_new_tokens to {gen_config.max_new_tokens}') - if self.id2step[str(session_id)] + len( + if self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: logger.error(f'run out of tokens. session_id={session_id}.') - yield GenOut('', self.id2step[str(session_id)], len(input_ids), 0, + yield GenOut('', self.id2step[session_id], len(input_ids), 0, 'length') if sequence_end is True and sequence_start is False: await self.end_session(session_id) - else: - - def is_error(status): - return status not in [ - ResponseType.SUCCESS, ResponseType.FINISH - ] - - generator = await self.get_generator(False, session_id) - async with self.safe_run(session_id): - state = DetokenizeState(len(input_ids)) - start_ids_offset = state.ids_offset - response = '' - async for outputs in generator.async_stream_infer( - session_id=session_id, - **prompt_input, - gen_config=gen_config, - adapter_name=adapter_name, - stream_output=stream_response, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=self.id2step[str(session_id)]): + return + + def is_error(status): + return status not in [ResponseType.SUCCESS, ResponseType.FINISH] + + # used to skip / rewind stop words in interactive mode + stop_ids = [] + if skip_stop_tokens and not gen_config.ignore_eos: + stop_ids = gen_config.stop_token_ids or [] + if self.tokenizer.eos_token_id not in stop_ids: + stop_ids.append(self.tokenizer.eos_token_id) + + async with self.model_inst(session_id) as inst: + token_ids = input_ids.copy() + history_len = self.id2step[session_id] + input_len = len(input_ids) + output_len, gen_len = 0, 0 + state = DetokenizeState(len(input_ids)) + start_ids_offset = state.ids_offset + response = '' + async with self.safe_run(inst, + session_id=session_id, + **prompt_input, + gen_config=gen_config, + adapter_name=adapter_name, + stream_output=stream_response, + sequence_start=sequence_start, + sequence_end=sequence_end, + step=history_len) as gen: + prev_len = 0 + hit_stop_token = 0 + async for outputs in gen: # decode res if is_error(outputs.status): - tokens = 0 break - res, tokens = input_ids + outputs.token_ids, outputs.num_token # noqa - if len(res) <= state.ids_offset: + + 
output_len = outputs.num_token + + if hit_stop_token or prev_len == output_len: continue + # This assumes the engine will stop when stop token is hit + if output_len and outputs.token_ids[-1] in stop_ids: + hit_stop_token = 1 + # one token and it's been skipped + if output_len == prev_len + 1: + continue + + mask = slice(prev_len - output_len, + output_len - hit_stop_token) + + token_ids += outputs.token_ids[mask] + gen_len = len(token_ids) - input_len + + prev_len = output_len + ids_offset = state.ids_offset response, state = self.tokenizer.detokenize_incrementally( - res, + token_ids, state, skip_special_tokens=gen_config.skip_special_tokens) + res = token_ids[ids_offset:] + + out = GenOut(response, history_len, input_len, gen_len, + finish_reason, res) - res = res[ids_offset:] - logprobs = None - if outputs.logprobs: + if outputs.logprobs is not None: log_offset = ids_offset - start_ids_offset - logprobs = outputs.logprobs[log_offset:] + out.logprobs = outputs.logprobs[log_offset:] + if outputs.last_hidden_state is not None: + out.last_hidden_state = outputs.last_hidden_state + if hit_stop_token: + out.last_hidden_state = \ + out.last_hidden_state[:-hit_stop_token] + if outputs.logits is not None: + out.logits = outputs.logits + if hit_stop_token: + out.logits = out.logits[:-hit_stop_token] + + yield out + # end of generator loop - # response, history token len, - # input token len, gen token len - yield GenOut(response, self.id2step[str(session_id)], - len(input_ids), tokens, finish_reason, res, - logprobs) if not is_error(outputs.status): finish_reason = 'length' \ - if tokens >= gen_config.max_new_tokens else 'stop' + if gen_len >= gen_config.max_new_tokens else 'stop' # utf-8 char at the end means it's a potential unfinished # byte sequence if not response.endswith('�'): - # avaid returning the last response twice + # avoid returning the last response twice response = '' - yield GenOut(response, self.id2step[str(session_id)], - len(input_ids), tokens, finish_reason) + yield GenOut(response, self.id2step[session_id], + len(input_ids), gen_len, finish_reason) else: - yield GenOut( - response='internal error happened', - history_token_len=self.id2step[str(session_id)], - input_token_len=len(input_ids), - generate_token_len=0, - finish_reason='error', - token_ids=[]) - # update step - self.id2step[str(session_id)] += len(input_ids) + tokens - if sequence_end: - self.id2step[str(session_id)] = 0 - # manually end pytorch session - # TODO modify pytorch or turbomind api - if self.backend == 'pytorch' and sequence_end: - await self.end_session(session_id) + yield GenOut(response='internal error happened', + history_token_len=self.id2step[session_id], + input_token_len=len(input_ids), + generate_token_len=0, + finish_reason='error', + token_ids=[]) + # update step + if sequence_end: + self.id2step[session_id] = 0 + if self.backend == 'pytorch': + # manually end pytorch session + await inst.async_end(session_id) + else: + if rewind_stop_tokens: + # rewind the step to the token before the stop token + output_len = gen_len + self.id2step[session_id] += input_len + output_len def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. 
@@ -684,12 +879,28 @@ def parse_tool_response(self, text, tools, **kwargs): for call_info in call_info_list] return text, call_info_list + def _run(self, fn=None, coro=None, loop=None): + assert (fn or coro) and not (fn and coro) + loop = loop or self.internal_thread.loop + if fn: + + async def _coro(): + return fn() + + coro = _coro() + return asyncio.run_coroutine_threadsafe(coro, loop) + + def session(self, gen_config: GenerationConfig = None): + return Session(self._run(fn=lambda: next(self._session_id)).result(), + engine=self, + gen_config=gen_config) + def chat(self, prompt: str, session=None, gen_config: Optional[GenerationConfig] = None, - do_preprocess: bool = True, - **kwargs) -> Session: + stream_response=False, + **kwargs) -> Union[Session, Iterator]: """Chat. Args: @@ -702,8 +913,7 @@ def chat(self, **kwargs (dict): ad hoc parametrization of `gen_config """ if session is None: - session = Session() - session._engine = self.engine + session = self.session() # sync & init session._prompt = prompt @@ -711,25 +921,35 @@ def chat(self, sequence_start = session._step == 0 - async def _work(): - resp = Response('', -1, -1, session._id) - async for output in self.generate(prompt, - session_id=session._id, - gen_config=gen_config, - stream_response=False, - sequence_start=sequence_start, - sequence_end=False, - step=session._step, - do_preprocess=do_preprocess, - **kwargs): - resp = session._merge_response(resp, output) - return resp - - from lmdeploy.pytorch.engine.request import _run_until_complete - resp = _run_until_complete(_work()) - - session._response = resp - session._step += resp.generate_token_len + resp.input_token_len - session.history.append((session._prompt, resp.text)) + generator = self.infer(prompt, + gen_config, + sequence_start=sequence_start, + sequence_end=False, + session_id=session._id, + stream_response=stream_response, + multiplex=True) + + def _gen(): + resp = None + try: + for out in generator: + resp = _append_response(resp, out) + yield out + except: # noqa + self._run(coro=self.stop_session(session._id)).result() + raise + else: + session._response = resp + session._step += resp.generate_token_len + resp.input_token_len + session.history.append((session._prompt, resp.text)) + + if stream_response: + session.generator = _gen() + else: + # run the generator until finish + with closing(_gen()) as gen: + for _ in gen: + pass + session.generator = None return session diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index b23ef3018d..c37f7572a1 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -340,8 +340,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ -596,8 +595,7 @@ async def completions_v1(request: CompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ 
-865,11 +863,22 @@ async def chat_interactive_v1(request: GenerateRequest, request.session_id = VariableInterface.session_id async_engine = VariableInterface.async_engine - sequence_start = async_engine.id2step.get(str(request.session_id), 0) == 0 + sequence_start = async_engine.id2step.get(request.session_id, 0) == 0 sequence_end = not request.interactive_mode if isinstance(request.stop, str): request.stop = [request.stop] + end_session = sequence_end and not sequence_start \ + and request.prompt == '' and request.request_output_len == 0 + if end_session: + await async_engine.end_session(request.session_id) + return JSONResponse( + dict(text='', + tokens=0, + input_tokens=0, + history_tokens=0, + finish_reason=None)) + random_seed = request.seed if request.seed else None gen_config = GenerationConfig( diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index e106beae17..6985e3dc27 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import asyncio import os import random @@ -28,6 +29,28 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) +async def async_infer(generator, session_id, input_ids, gen_config, + sequence_start, step, stream_output, tokenizer, state): + token_ids = input_ids.copy() + prev_len = 0 + async for output in generator.async_stream_infer( + session_id=session_id, + input_ids=input_ids, + gen_config=gen_config, + sequence_start=sequence_start, + sequence_end=False, + step=step, + stream_output=stream_output): + tokens = output.num_token + if tokens > prev_len: + token_ids += output.token_ids[prev_len - tokens:] + response, state = tokenizer.detokenize_incrementally(token_ids, + state=state) + prev_len = tokens + print(response, end='', flush=True) + return tokens + + def main(model_path: str, session_id: int = 1, top_k: float = 40, @@ -130,6 +153,9 @@ def main(model_path: str, repetition_penalty=repetition_penalty, stop_token_ids=stop_words) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -138,7 +164,7 @@ def main(model_path: str, if prompt == 'exit': exit(0) elif prompt == 'end': - generator.end(session_id) + loop.run_until_complete(generator.async_end(session_id)) nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -149,10 +175,8 @@ def main(model_path: str, if model.capability == 'chat': sequence_start = (nth_round == 1) - sequence_end = False else: sequence_start = True - sequence_end = True step = 0 if step + len( @@ -163,20 +187,11 @@ def main(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) - for outputs in generator.stream_infer( - session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stream_output=stream_output): - - res, tokens = input_ids + outputs.token_ids, outputs.num_token - # decode res - response, state = tokenizer.detokenize_incrementally( - res, state=state) - print(response, end='', flush=True) + + coro = async_infer(generator, session_id, input_ids, gen_config, + sequence_start, step, stream_output, tokenizer, + state) + tokens = loop.run_until_complete(coro) # update step step += len(input_ids) + tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index a1b2fff944..4a15dd6841 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -4,10 +4,12 @@ 
import json import os.path as osp import sys +from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict +from functools import partial from itertools import repeat -from queue import LifoQueue, Queue +from queue import Queue from typing import Dict, Iterable, List import numpy as np @@ -317,6 +319,93 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, self.config, cuda_stream_id) +def _get_logits(outputs, offset: int): + logits = outputs['logits'] + + def _func(out: EngineOutput, step: int): + out.logits = logits[:step - offset - 1, :] + + return _func + + +def _get_last_hidden_state(outputs, offset: int): + last_hidden_state = outputs['last_hidden_state'] + print(f'last_hidden_state.shape = {last_hidden_state.shape}') + + def _func(out: EngineOutput, step: int): + out.last_hidden_state = last_hidden_state[:step - offset - 1, :] + + return _func + + +def _get_logprobs_impl(logprob_vals: torch.Tensor, + logprob_idxs: torch.Tensor, + logprob_nums: torch.Tensor, + output_ids: List[int], + logprobs: int, + out_logprobs: List[Dict[int, float]] = None): + length = len(output_ids) + offset = len(out_logprobs) + if length == offset: + return out_logprobs + for (pos, idx, val, n) in zip(range(offset, + length), logprob_idxs[offset:length], + logprob_vals[offset:length], + logprob_nums[offset:length]): + topn = min(n.item(), logprobs) + tok_res = {idx[i].item(): val[i].item() for i in range(topn)} + token_id = output_ids[pos] + if token_id not in tok_res: + print(token_id, tok_res) + valid_n = n.item() + tok_res[token_id] = \ + val[:valid_n][idx[:valid_n] == token_id].item() + ids = list(tok_res.keys()) + for k in ids: + if tok_res[k] == float('-inf'): + tok_res.pop(k) + out_logprobs.append(tok_res) + return out_logprobs + + +def _get_logprobs(outputs, output_logprobs: int): + logprob_vals = outputs['logprob_vals'] + logprob_idxs = outputs['logprob_indexes'] + logprob_nums = outputs['logprob_nums'] + + logprobs = [] + + def _func(out: EngineOutput, step: int): + _get_logprobs_impl(logprob_vals, logprob_idxs, logprob_nums, + out.token_ids, output_logprobs, logprobs) + out.logprobs = logprobs + + return _func + + +class StreamingSemaphore: + + def __init__(self): + self.loop = asyncio.get_running_loop() + self.fut = None + self.val = 0 + + async def acquire(self): + if self.val: + self.val = 0 + return + self.fut = self.loop.create_future() + await self.fut + self.fut = None + self.val = 0 + + def release(self): + if not self.val: + self.val = 1 + if self.fut: + self.fut.set_result(None) + + class TurboMindInstance: """Instance of TurboMind. 
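
Note on the `StreamingSemaphore` added above: it is a single-permit wake-up latch rather than a counting semaphore. The engine's signaling thread hops onto the asyncio loop with `call_soon_threadsafe(release)`, and a burst of signals collapses into a single wake-up, so the awaiting coroutine re-reads the latest shared state instead of handling one item per signal. Below is a minimal usage sketch, illustration only and not part of the patch; it assumes the `StreamingSemaphore` class above is in scope, and `worker`/`state` are made-up stand-ins for the engine thread and its shared state.

import asyncio
import threading
import time

async def main():
    sem = StreamingSemaphore()  # must be created on the running event loop
    state = {'step': 0, 'done': False}  # stand-in for the engine's shared state

    def worker():
        # Stand-in for the engine: update the shared state, then signal the
        # loop the same way async_signal_cb does.
        for i in range(1, 4):
            time.sleep(0.1)
            state['step'] = i
            sem.loop.call_soon_threadsafe(sem.release)
        state['done'] = True
        sem.loop.call_soon_threadsafe(sem.release)

    threading.Thread(target=worker, daemon=True).start()

    while not state['done']:
        await sem.acquire()  # one wake-up per burst of signals
        print('latest step:', state['step'])

asyncio.run(main())
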
@@ -343,116 +432,30 @@ def __init__(self, # create model instances self.model_inst = self._create_model_instance(0) - self.que = Queue() - self.executor: ThreadPoolExecutor = None - self.future = None self.config = config + self.lock = None def _create_model_instance(self, device_id): - rank = self.node_id * self.gpu_count + device_id - model_inst = self.tm_model.model_comm.create_model_instance( - device_id, rank, self.cuda_stream_id, self.nccl_params) + model_inst = self.tm_model.model_comm.create_model_instance(device_id) return model_inst - def _forward_callback(self, result, ctx): - self.que.put((False, result)) - - def _forward_thread(self, inputs): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - self.que.put((-1, None)) - return - self.que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - - def _async_forward_callback(self, result, ctx, que: LifoQueue): - que.put((False, result)) - - def _async_forward_thread(self, inputs, que: LifoQueue): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - que.put((-1, None)) - return - que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - - def _get_logprobs(self, - logprob_vals: torch.Tensor, - logprob_indexes: torch.Tensor, - logprob_nums: torch.Tensor, - output_ids: torch.Tensor, - logprobs: int = None, - length: int = None, - out_logprobs: List[Dict[int, float]] = None, - session_id: int = None): - if logprobs is None: - return None - if out_logprobs is None: - out_logprobs = [] - if len(output_ids) <= len(out_logprobs): - return out_logprobs - offset = len(out_logprobs) - for (token_id, idx, val, n) in zip(output_ids[offset:length], - logprob_indexes[offset:length], - logprob_vals[offset:length], - logprob_nums[offset:length]): - topn = min(n.item(), logprobs) - tok_res = {idx[i].item(): val[i].item() for i in range(topn)} - if token_id.item() not in tok_res: - valid_n = n.item() - tok_res[token_id.item()] = \ - val[:valid_n][idx[:valid_n] == token_id].item() - ids = list(tok_res.keys()) - for k in ids: - if tok_res[k] == float('-inf'): - tok_res.pop(k) - out_logprobs.append(tok_res) - return out_logprobs + def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], + gen_config: GenerationConfig, + input_len: int): + + def _get_offset(type): + return input_len - 1 if type == 'generation' else 0 - def end(self, session_id: int): - """End the given session.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - end_generator = self.tm_model.create_instance() - for outputs in end_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_end(self, session_id: int): - """End the given session.""" - self.end(session_id) - - def cancel(self, session_id: int): - """Stop current streaming inference.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - stop_generator = self.tm_model.create_instance() - for outputs in stop_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=False, - stop=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_cancel(self, session_id: int): - """End the given session.""" - self.cancel(session_id) + fs = [] + if gen_config.output_logits: 
+ offset = _get_offset(gen_config.output_logits) + fs.append(_get_logits(outputs, offset)) + if gen_config.output_last_hidden_state: + offset = _get_offset(gen_config.output_last_hidden_state) + fs.append(_get_last_hidden_state(outputs, offset)) + if gen_config.logprobs: + fs.append(_get_logprobs(outputs, gen_config.logprobs)) + return fs def prepare_embeddings(self, input_embeddings=None, @@ -506,61 +509,17 @@ def prepare_embeddings(self, return input_embeddings, input_embedding_ranges def prepare_inputs(self, - session_id, input_ids, gen_config: GenerationConfig, input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False): + input_embedding_ranges=None): """Convert inputs format.""" - if len(input_ids) == 0: - input_ids = [[]] - if isinstance(input_ids[0], int): - input_ids = [input_ids] + assert isinstance(input_ids, Sequence) - batch_size = len(input_ids) + input_ids = torch.IntTensor(input_ids) + input_len = len(input_ids) - def _broadcast_np(data, dtype, shape=(batch_size, )): - if isinstance(data, Iterable): - assert len(data) == batch_size - return data - - return np.full(shape, data, dtype=dtype) - - input_ids = [torch.IntTensor(ids) for ids in input_ids] - input_lengths = torch.IntTensor([len(ids) for ids in input_ids]) - input_ids = pad_sequence(input_ids, - batch_first=True, - padding_value=self.eos_id) - - if isinstance(session_id, int): - session_id = [session_id] - assert len(session_id) == batch_size - - step = _broadcast_np(step, np.int32) - - inputs = dict( - input_ids=input_ids, - input_lengths=input_lengths, - request_output_len=np.full(input_lengths.shape, - gen_config.max_new_tokens, - dtype=np.uint32), - runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32), - runtime_top_p=_broadcast_np(gen_config.top_p, np.float32), - runtime_min_p=_broadcast_np(gen_config.min_p, np.float32), - temperature=_broadcast_np(gen_config.temperature, np.float32), - repetition_penalty=_broadcast_np(gen_config.repetition_penalty, - np.float32), - step=step, - - # session input - START=_broadcast_np((1 if sequence_start else 0), np.int32), - END=_broadcast_np((1 if sequence_end else 0), np.int32), - CORRID=np.array(session_id, dtype=np.uint64), - STOP=_broadcast_np((1 if stop else 0), np.int32)) + inputs = dict(input_ids=input_ids, ) input_embeddings, input_embedding_ranges = self.prepare_embeddings( input_embeddings, input_embedding_ranges) @@ -568,17 +527,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): inputs['input_embeddings'] = input_embeddings inputs['input_embedding_ranges'] = input_embedding_ranges - if gen_config.min_new_tokens is not None: - inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens, - np.int32) - - if gen_config.logprobs is not None and gen_config.logprobs > 0: - if gen_config.logprobs > MAX_LOGPROBS: - gen_config.logprobs = MAX_LOGPROBS - logger.warning('logprobs shoudd be in range [1, 1024]' - f'update logprobs={gen_config.logprobs}') - inputs['logprobs'] = _broadcast_np(gen_config.logprobs, np.int32) - bad_words = [] if gen_config.bad_token_ids is not None: bad_words.extend(gen_config.bad_token_ids) @@ -597,10 +545,24 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): if bad_words is not None: inputs['bad_words_list'] = bad_words - if gen_config.random_seed is not None: - inputs['random_seed'] = _broadcast_np(gen_config.random_seed, - np.uint64) - return inputs, input_lengths + return inputs, input_len + + async def async_cancel(self, session_id: int = None): 
+ self.model_inst.cancel() + + def async_end_cb(self, fut: asyncio.Future, status: int): + """executing on engine's signaling thread.""" + logger.info(f'[async_end_cb] session ended, status = {status}') + fut.get_loop().call_soon_threadsafe(fut.set_result, status) + + async def async_end(self, session_id): + fut = asyncio.get_running_loop().create_future() + self.model_inst.end(partial(self.async_end_cb, fut), session_id) + await fut + + def async_signal_cb(self, s: StreamingSemaphore): + """executing on engine's signaling thread.""" + s.loop.call_soon_threadsafe(s.release) async def async_stream_infer(self, session_id, @@ -610,7 +572,6 @@ async def async_stream_infer(self, sequence_start: bool = True, sequence_end: bool = False, step=0, - stop=False, gen_config: GenerationConfig = None, stream_output=False, **kwargs): @@ -630,219 +591,119 @@ async def async_stream_infer(self, stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ - # start forward thread - que = LifoQueue() - from functools import partial - _forward_callback = partial(self._async_forward_callback, que=que) - _forward_thread = partial(self._async_forward_thread, que=que) - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(_forward_callback) - - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, - gen_config=gen_config) + logger.info(f'[async_stream_infer] session {session_id} start') + gen_cfg = self._get_generation_config(gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - _forward_thread(tm_inputs) - - seq_start = input_lengths + input_lengths.new_tensor(step) - - out_logprobs = None - prev_len = 0 - # generator - while True: - while que.qsize() == 0: # let other requests in - await asyncio.sleep(0.002) - - finish, tm_outputs = que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break - - outputs = _tm_dict_to_torch_dict(tm_outputs) - - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_) - else: - outputs = EngineOutput(status, output.tolist(), len_) - if outputs.num_token < prev_len and not finish: - 
continue - else: - prev_len = outputs.num_token - - if out_logprobs: - output_token_len = len(outputs.token_ids) - outputs.logprobs = out_logprobs[:output_token_len] - - yield outputs - - if finish: - self.future.result() - self.executor.shutdown() - break - - if stream_output and not stop: - logger.info(f'UN-register stream callback for {session_id}') - self.model_inst.unregister_callback() - - def stream_infer(self, - session_id, - input_ids, - input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False, - gen_config: GenerationConfig = None, - stream_output=False, - **kwargs): - """Perform model inference. - - Args: - session_id (int): the id of a session - input_ids (numpy.ndarray): the token ids of a prompt - input_embeddings (List[numpy.ndarray]): embeddings features - input_embedding_ranges (List[Tuple[int,int]]): the begin/end - offsets of input_embeddings to input_ids - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - step (int): the offset of the k/v cache - stop (bool): indicator for cancelling the session - gen_config (GenerationConfig): generation config - stream_output (bool): indicator for stream output - kwargs (dict): kwargs for backward compatibility - """ - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(self._forward_callback) - - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, + inputs, input_len = self.prepare_inputs( input_ids=input_ids, input_embeddings=input_embeddings, input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, gen_config=gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - # start forward thread - self.que = Queue() - self._forward_thread(tm_inputs) - - seq_start = input_lengths + input_lengths.new_tensor(step) - out_logprobs = None - - # generator - while True: - while self.que.qsize() > 1: - self.que.get() - - finish, tm_outputs = self.que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break - - outputs = _tm_dict_to_torch_dict(tm_outputs) - - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1, out_logprobs) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_, - out_logprobs) - else: - outputs = 
EngineOutput(status, output.tolist(), len_,
-                                           out_logprobs)
-
-            if out_logprobs:
-                output_token_len = len(outputs.token_ids)
-                outputs.logprobs = out_logprobs[:output_token_len]
-
-            yield outputs
-
-            if finish:
-                self.future.result()
-                self.executor.shutdown()
-                while self.que.qsize() > 0:
-                    self.que.get()
-                break
-
-        if stream_output and not stop:
-            logger.info(f'UN-register stream callback for {session_id}')
-            self.model_inst.unregister_callback()
+        session = _tm.SessionParam(id=session_id,
+                                   step=step,
+                                   start=sequence_start,
+                                   end=sequence_end)
+
+        inputs = _np_dict_to_tm_dict(inputs)
+
+        sem = StreamingSemaphore()
+        signal_cb = partial(self.async_signal_cb, sem)
+
+        outputs, shared_state = self.model_inst.forward(
+            inputs, session, gen_cfg, stream_output, signal_cb)
+
+        outputs = _tm_dict_to_torch_dict(outputs)
+
+        extra_fs = self._get_extra_output_processors(outputs, gen_config,
+                                                     input_len)
+
+        output_ids_buf = outputs['output_ids']
+
+        finish = False
+        state = None
+
+        output_ids = []
+        output_len = 0
+        prev_len = step + input_len
+        try:
+            while True:
+                await sem.acquire()
+                state = shared_state.consume()
+
+                status, seq_len = state.status, state.seq_len
+
+                if status in [7, 8]:  # finish / canceled
+                    finish, status = True, 0
+                elif status:
+                    yield self._get_error_output()
+                    break
+
+                if seq_len == prev_len and not finish:
+                    continue
+
+                output_ids += output_ids_buf[prev_len:seq_len].tolist()
+                output_len += seq_len - prev_len
+                status = ResponseType.FINISH if finish else ResponseType.SUCCESS  # noqa
+                output = EngineOutput(status, output_ids, output_len)
+
+                for f in extra_fs:
+                    f(output, seq_len)
+
+                prev_len = seq_len
+
+                yield output
+
+                if finish:
+                    break
+
+        except (GeneratorExit, asyncio.CancelledError) as e:
+            logger.info(f'[async_stream_infer] {type(e).__name__}')
+            self.model_inst.cancel()
+        except Exception as e:
+            logger.error(f'[async_stream_infer] {type(e).__name__} {e}')
+            self.model_inst.cancel()
+            yield self._get_error_output()
+        finally:
+            # Contract: `cb` won't be called again if status is non-zero
+            # wait for status to be set as `finish` or `error`
+            while not state or state.status == 0:
+                await sem.acquire()
+                state = shared_state.consume()
+            logger.info(f'[async_stream_infer] session {session_id} done')
+
+    def _get_error_output(self):
+        return EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR,
+                            token_ids=[],
+                            num_token=0)
+
+    def _get_generation_config(self, cfg: GenerationConfig):
+        c = _tm.GenerationConfig()
+        c.max_new_tokens = cfg.max_new_tokens
+        c.top_k = cfg.top_k
+        c.top_p = cfg.top_p
+        c.min_p = cfg.min_p
+        c.temperature = cfg.temperature
+        c.repetition_penalty = cfg.repetition_penalty
+        if cfg.min_new_tokens:
+            c.min_new_tokens = cfg.min_new_tokens
+        output_type = dict(all=1, generation=2)
+        if cfg.output_last_hidden_state:
+            c.output_last_hidden_state = output_type[
+                cfg.output_last_hidden_state]
+        if cfg.output_logits:
+            c.output_logits = output_type[cfg.output_logits]
+        if cfg.logprobs:
+            if cfg.logprobs > MAX_LOGPROBS:
+                cfg.logprobs = MAX_LOGPROBS
+            logger.warning(
+                f'logprobs should be in range [1, {MAX_LOGPROBS}], '
+                f'update logprobs={cfg.logprobs}')
+            c.output_logprobs = cfg.logprobs
+        if cfg.random_seed is not None:
+            c.random_seed = cfg.random_seed
+        # print (c)
+        return c
 
     def decode(self, input_ids,
diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt
index aec443a1aa..62adb94e5a 100644
--- a/src/turbomind/CMakeLists.txt
+++ b/src/turbomind/CMakeLists.txt
@@ -16,6 +16,7 @@ add_subdirectory(utils)
add_subdirectory(kernels) add_subdirectory(layers) add_subdirectory(models) +add_subdirectory(engine) if(BUILD_PYT) add_subdirectory(th_op) endif() diff --git a/src/turbomind/engine/CMakeLists.txt b/src/turbomind/engine/CMakeLists.txt new file mode 100644 index 0000000000..1d68116cf6 --- /dev/null +++ b/src/turbomind/engine/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +cmake_minimum_required(VERSION 3.8) + +add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) +set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/engine/gateway.cc b/src/turbomind/engine/gateway.cc new file mode 100644 index 0000000000..e949ec7cd3 --- /dev/null +++ b/src/turbomind/engine/gateway.cc @@ -0,0 +1,40 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request_queue.h" + +namespace turbomind { + +Gateway::Gateway(std::function()> ctx_factory): request_queue_{this}, ctx_factory_{ctx_factory} +{ + signal_thread_ = std::thread(&Gateway::signal_thread_entry, this); +} + +void Gateway::shutdown() +{ + request_queue_.close(); + signal_buffer_.close(); + + signal_thread_.join(); +} + +void Gateway::signal_thread_entry() noexcept +{ + while (true) { + bool abort{}; + std::vector signals = signal_buffer_.take_all(abort); + if (abort) { + break; + } + else { + auto ctx = ctx_factory_(); + for (const auto& s : signals) { + s(); + } + } + } +} + +} // namespace turbomind diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h new file mode 100644 index 0000000000..d939c0bcc2 --- /dev/null +++ b/src/turbomind/engine/gateway.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/signal_buffer.h" + +namespace turbomind { + +class Gateway { +public: + Gateway(std::function()> ctx_factory); + + void shutdown(); + + void push(std::vector> reqs) + { + return request_queue_.push(std::move(reqs)); + } + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) + { + return request_queue_.pop(infer_reqs, kill_reqs, max_infer_num, blocking, abort); + } + + void cancel(std::shared_ptr req) + { + return request_queue_.cancel(std::move(req)); + } + + void kill(std::shared_ptr req) + { + return request_queue_.kill(std::move(req)); + } + + void notify(std::vector signals) + { + return signal_buffer_.push(std::move(signals)); + } + +private: + void signal_thread_entry() noexcept; + +private: + RequestQueue request_queue_; + SignalBuffer signal_buffer_; + + std::function()> ctx_factory_; + + std::thread signal_thread_; +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc new file mode 100644 index 0000000000..6ba355e896 --- /dev/null +++ b/src/turbomind/engine/model_request.cc @@ -0,0 +1,174 @@ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/engine/model_request.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/constant.h" +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +static ManagedTensor create(DataType dtype, MemoryType where, const std::vector& size, int64_t& byte_size) +{ + byte_size = std::accumulate(size.begin(), size.end(), Tensor::getTypeSize(dtype), std::multiplies<>{}); + void* data{}; + + if (where == MEMORY_GPU) { + check_cuda_error(cudaMallocAsync(&data, byte_size, nullptr)); + } + else { + data = std::malloc(byte_size); + } + + ManagedTensor ret; + ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; + ret.data_holder.reset((void*)nullptr, [data, where](auto) { + // std::cerr << "turbomind tensor deallocate" << std::endl; + if (where == MEMORY_GPU) { + /// TODO: guard device id + check_cuda_error(cudaFreeAsync(data, nullptr)); + } + else { + std::free(data); + } + }); + return ret; +} + +template +static T get(const std::unordered_map& m, const std::string& key, T fallback = {}) +{ + auto it = m.find(key); + if (it != m.end()) { + return it->second->getVal(); + } + return fallback; +} + +ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim): + gateway_{gateway}, + data_type_{data_type}, + session_len_{session_len}, + vocab_size_{vocab_size}, + hidden_dim_{hidden_dim} +{ +} + +void ModelRequest::Cancel() +{ + // request is finished if lock failed + if (auto r = request_.lock()) { + gateway_->cancel(std::move(r)); + } +} + +void ModelRequest::End(std::function cb, uint64_t session_id) +{ + auto r = std::make_shared(); + + r->id = r->session.id = session_id; + r->session.kill_flag = true; + + r->end_cb = std::move(cb); + + gateway_->kill(std::move(r)); +} + +auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam +{ + inputs_ = std::make_shared(); + outputs_ = std::make_shared(); + + auto add = [](auto& dest, auto key, auto dtype, auto where, auto shape, auto&&... 
dims) { + std::vector shape_; + if constexpr (std::is_integral_v) { + shape_ = {shape, dims...}; + } + else { + shape_ = {shape.cbegin(), shape.cend()}; + } + int64_t byte_size{}; + auto it = dest->emplace(key, create(dtype, where, shape_, byte_size)).first; + return std::make_pair(it->second->data, byte_size); + }; + + auto& inputs = *param.tensors; + + FT_CHECK(inputs.at("input_ids")->shape.size() == 1); + + const int input_len = inputs.at("input_ids")->shape[0]; + const int output_len = param.gen_cfg.max_new_tokens; + + // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` + // is used instead + const int max_seq_len = session_len_ + 1; + const int max_out_len = std::min(output_len, session_len_) + 1; + // This does not include histroy length in interactive mode + const int max_in_out_len = std::min(input_len + output_len, session_len_) + 1; + + for (auto& [k, v] : *param.tensors) { + inputs_->emplace(k, v); + } + + add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, max_seq_len); + add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); + + if (param.gen_cfg.output_logits) { + const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len : max_out_len; + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, len, vocab_size_); + } + + if (param.gen_cfg.output_last_hidden_state) { + const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len : max_out_len; + add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); + } + + if (param.gen_cfg.output_logprobs) { + add(outputs_, "logprob_vals", TYPE_FP32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_out_len); + } + + auto r = std::make_shared(); + + for (const auto& [k, v] : *inputs_) { + r->inputs.insert(k, *v); + } + for (const auto& [k, v] : *outputs_) { + r->outputs.insert(k, *v); + } + + auto state = std::make_shared(); + + if (param.session.start_flag) { + session_id_ = param.session.id; + } + + r->id = param.session.id; + r->session = param.session; + r->gen_cfg = param.gen_cfg; + r->stream_output = param.stream_output; + r->forward_cb = std::move(cb); + r->state = state; + + r->output_ids = *outputs_->at("output_ids"); + r->sequence_length = *outputs_->at("sequence_length"); + + // Keep a weak reference for canceling the request + request_ = r; + + gateway_->push({std::move(r)}); + + return OutputParam{outputs_, state}; +} + +} // namespace turbomind diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h new file mode 100644 index 0000000000..aea889e856 --- /dev/null +++ b/src/turbomind/engine/model_request.h @@ -0,0 +1,59 @@ + + +#pragma once + +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/utils/Tensor.h" + +namespace turbomind { + +class ModelRequest { +public: + virtual ~ModelRequest() = default; + + ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim); + + // Cancel running request + void Cancel(); + + // Reset the channel to uninitailized state, calls `notify` when done + void End(std::function cb, uint64_t session_id); + + using TensorMap_ = std::unordered_map; + + struct InputParam { + std::shared_ptr tensors; + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + }; + + struct OutputParam { + std::shared_ptr 
tensors; + std::shared_ptr state; + }; + + OutputParam Forward(InputParam param, std::function cb); + +protected: + Gateway* const gateway_; + + const DataType data_type_; + + const int session_len_; + const int hidden_dim_; + const int vocab_size_; + + uint64_t session_id_; + + std::weak_ptr request_; + + std::shared_ptr inputs_; // owned by caller + std::shared_ptr outputs_; // owned by `this` +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h new file mode 100644 index 0000000000..6bf706c9b8 --- /dev/null +++ b/src/turbomind/engine/request.h @@ -0,0 +1,148 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "src/turbomind/utils/Tensor.h" + +namespace turbomind { + +struct GenerationConfig { + int max_new_tokens = 0; + int min_new_tokens = 0; + + int top_k = 1; + float top_p = 0.f; + float min_p = 0.f; + float temperature = 1.f; + + float repetition_penalty = 1.f; + + uint64_t random_seed = 0; + + int output_logprobs = 0; + + enum OutType + { + kNone = 0, + kAll = 1, + kGeneration = 2 + }; + int output_last_hidden_state = 0; + int output_logits = 0; +}; + +inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) +{ + os << "GenerationConfig { "; + os << "max_new_tokens=" << c.max_new_tokens; + os << ", min_new_tokens=" << c.min_new_tokens; + os << ", top_p=" << c.top_p; + os << ", top_k=" << c.top_k; + os << ", min_p=" << c.min_p; + os << ", temperature=" << c.temperature; + os << ", repetition_penalty=" << c.repetition_penalty; + os << ", random_seed=" << c.random_seed; + os << ", output_logprobs=" << c.output_logprobs; + os << ", output_hidden_states=" << c.output_last_hidden_state; + os << ", output_logits=" << c.output_logits; + os << " }"; + return os; +} + +struct SessionParam { + uint64_t id; + + int step; + + bool start_flag; + bool end_flag; + bool kill_flag; +}; + +struct RequestState { + int status; + int seq_len; +}; + +struct AtomicRequestState { + + std::atomic data_; + + static_assert(std::atomic::is_always_lock_free); + + ~AtomicRequestState() + { + auto data = exchange(nullptr); + } + + std::unique_ptr exchange(RequestState* data) + { + return std::unique_ptr{data_.exchange(data, std::memory_order_acq_rel)}; + } +}; + +struct Request { + uint64_t id; // sequence id + uint64_t unique_id; // monotonic increasing + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + + // reference to IO tensors + TensorMap inputs; + TensorMap outputs; + // fast path for accessing common output buffers + Tensor output_ids; + Tensor sequence_length; + + std::function end_cb; + + std::atomic cancel_flag; + bool is_canceled{}; + + std::function forward_cb; + + std::shared_ptr state; + + int ec; // set when disabling conflicting requests + + enum + { + kOk = 0, + kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set + kConflict = 2, // Concurrent requests to the same sequence + kBusy = 3, // Sequence is already running + kInactive = 4, // Sequence to `stop` is not active + kFail = 5, // Can't find sequence for `stop` request or internal error during inference + kTooLong = 6, // history + prompt > session_len, + kFinish = 7, + kCancel = 8, + }; +}; + +inline void UpdateState(Request& r, int status, int seq_len) +{ + try { + auto new_state = new RequestState{status, seq_len}; + auto old_state = r.state->exchange(new_state); + if (!old_state && r.forward_cb) { + r.forward_cb(); + } + 
} + catch (const std::exception& e) { + TM_LOG_ERROR("Error invoking callback for (%lu): %s", r.id, e.what()); + } + catch (...) { + TM_LOG_ERROR("Unknown error invoking callback for (%lu)", r.id); + } +} + +} // namespace turbomind diff --git a/src/turbomind/engine/request_queue.cc b/src/turbomind/engine/request_queue.cc new file mode 100644 index 0000000000..8c0b52b5bf --- /dev/null +++ b/src/turbomind/engine/request_queue.cc @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/gateway.h" + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +void RequestQueue::push(std::vector> reqs) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + for (auto& r : reqs) { + queue_.push(std::move(r)); + } + } + cv_.notify_one(); +} + +void RequestQueue::cancel(std::shared_ptr r) +{ + // -1 canceled + // 0 queued + // 1 active + if (r->cancel_flag.exchange(-1, std::memory_order_acq_rel) != 0) { + // request is picked up by engine + return; + } + else { + // not picked by engine yet, skip directly + gateway_->notify({[r = std::move(r)] { // + UpdateState(*r, Request::kCancel, 0); + }}); + } +} + +void RequestQueue::kill(std::shared_ptr r) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + kill_.push_back(std::move(r)); + } + cv_.notify_one(); +} + +void RequestQueue::pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) +{ + std::unique_lock lock(mutex_); + + if (blocking) { + cv_.wait(lock, [this] { return !queue_.empty() || !kill_.empty() || closed_; }); + if (closed_) { + abort = true; + return; + } + } + + infer_reqs.clear(); + while (!queue_.empty() && infer_reqs.size() <= max_infer_num) { + auto& r = queue_.front(); + if (r->cancel_flag.exchange(1, std::memory_order_acq_rel) == 0) { + infer_reqs.push_back(std::move(r)); + } + else { + // Canceled requests are simply ignored + } + queue_.pop(); + } + + kill_reqs = std::move(kill_); +} + +void RequestQueue::close() +{ + { + std::lock_guard lock(mutex_); + closed_ = true; + } + cv_.notify_all(); +} + +} // namespace turbomind diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h new file mode 100644 index 0000000000..c029f38f4b --- /dev/null +++ b/src/turbomind/engine/request_queue.h @@ -0,0 +1,46 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +class Gateway; + +class RequestQueue { +public: + RequestQueue(Gateway* gateway): gateway_{gateway} {} + + void push(std::vector> reqs); + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort); + + void cancel(std::shared_ptr r); + + void kill(std::shared_ptr r); + + void close(); + +private: + Gateway* gateway_; + + std::queue> queue_; + + std::vector> kill_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool closed_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/signal_buffer.h b/src/turbomind/engine/signal_buffer.h new file mode 100644 index 0000000000..cb09be7909 --- /dev/null +++ b/src/turbomind/engine/signal_buffer.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +namespace turbomind { + +using Signal = std::function; + +class SignalBuffer { +public: + void push(std::vector signals) + { + if (signals.empty()) { + return; + } + { + std::lock_guard lock{mutex_}; + signals_.insert(signals_.end(), std::move_iterator{signals.begin()}, std::move_iterator{signals.end()}); + } + cv_.notify_one(); + } + + void close() + { + { + std::lock_guard lock{mutex_}; + aborted_ = true; + } + cv_.notify_all(); + } + + std::vector take_all(bool& abort) + { + std::vector signals; + { + std::unique_lock lock{mutex_}; + cv_.wait(lock, [&] { return !signals_.empty() || aborted_; }); + if (aborted_) { + abort = true; + } + else { + signals.swap(signals_); + } + } + return signals; + } + +private: + std::vector signals_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool aborted_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 4f47631fa5..d611cfab43 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -269,4 +269,61 @@ void invokeTransposeAxis01( template void invokeTransposeAxis01( int* out, int* in, const int* in_skipping_dim1, const int dim0, const int dim1, cudaStream_t stream); +template +__global__ void transpose_2d_kernel(T* __restrict__ dst, const T* __restrict__ src, int rows, int cols, bool swap_xy) +{ + __shared__ T smem[TILE_DIM][TILE_DIM + 1]; + + const int block_idx_x = swap_xy ? blockIdx.y : blockIdx.x; + const int block_idx_y = swap_xy ? blockIdx.x : blockIdx.y; + + { + const int j = block_idx_x * TILE_DIM + threadIdx.x; + const int i = block_idx_y * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < rows && j < cols) { + smem[threadIdx.y + y][threadIdx.x] = src[(i + y) * cols + j]; + } + } + } + + __syncthreads(); + + { + const int j = block_idx_y * TILE_DIM + threadIdx.x; + const int i = block_idx_x * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < cols && j < rows) { + dst[(i + y) * rows + j] = smem[threadIdx.x][threadIdx.y + y]; + } + } + } +} + +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + constexpr int TILE_DIM = 32; // warp size + constexpr int BLOCK_ROWS = 8; + + const dim3 block(TILE_DIM, BLOCK_ROWS); + + dim3 grid((cols + TILE_DIM - 1) / TILE_DIM, // + (rows + TILE_DIM - 1) / TILE_DIM); + bool swap_xy = false; + + if (grid.y > 65535) { // max dim for grid.y + std::swap(grid.x, grid.y); + swap_xy = true; + } + + transpose_2d_kernel<<>>(dst, src, rows, cols, swap_xy); +} + +template void invokeTranspose2D_(uint32_t*, const uint32_t*, int, int, cudaStream_t); + } // namespace turbomind diff --git a/src/turbomind/kernels/gpt_kernels.h b/src/turbomind/kernels/gpt_kernels.h index 4e1dc49be8..a351473332 100644 --- a/src/turbomind/kernels/gpt_kernels.h +++ b/src/turbomind/kernels/gpt_kernels.h @@ -238,4 +238,19 @@ void invokeSumLengthDimension(float* out_buf, const size_t hidden_dim, cudaStream_t stream = 0); +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st); + +template +void invokeTranspose2D(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + if constexpr (sizeof(T) == 4) { + // FT_CHECK(0); + invokeTranspose2D_((uint32_t*)dst, (const uint32_t*)src, rows, cols, st); + } + else { + FT_CHECK(0); + } +} + } // namespace turbomind diff --git 
a/src/turbomind/kernels/sampling_penalty_kernels.cu b/src/turbomind/kernels/sampling_penalty_kernels.cu index 1d4cfe24b0..cf360580b9 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.cu +++ b/src/turbomind/kernels/sampling_penalty_kernels.cu @@ -17,6 +17,8 @@ #include #include +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h" namespace turbomind { @@ -221,6 +223,81 @@ template void invokeBatchApplyTemperaturePenalty(half* logits, const int vocab_size_padd, cudaStream_t stream); #endif + +template +__global__ void batchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded) +{ + const int vi = blockIdx.x * blockDim.x + threadIdx.x; + const int bi = blockIdx.y; + + __shared__ float shared_scale; + + if (threadIdx.x == 0) { + shared_scale = fdividef(1.f, temperatures[bi] + 1e-6f); + } + + __syncthreads(); + + const float scale = shared_scale; + + logits += (size_t)bi * vocab_size_padded; + + const int step = gridDim.x * blockDim.x * vec_size; + + for (int i = vi * vec_size; i < vocab_size_padded; i += step) { + Array vec; + Load(vec, logits + i); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + if (i + c < vocab_size) { + vec[c] *= scale; + } + else { + vec[c] = -FLT_MAX; + } + } + Store(logits + i, vec); + } +} + +void invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded, + cudaStream_t stream) +{ + + auto invoke = [&](auto vec_size) { + constexpr int threads = 256; + const int blocks_per_tok = (vocab_size_padded + threads * vec_size - 1) / (threads * vec_size); + const dim3 blocks(blocks_per_tok, batch_size); + batchApplyTemperaturePenalty_v2<<>>( // + logits, + bias, + temperatures, + batch_size, + vocab_size, + vocab_size_padded); + }; + + if (vocab_size_padded % 4 == 0) { + invoke(std::integral_constant{}); + } + else if (vocab_size_padded % 2 == 0) { + invoke(std::integral_constant{}); + } + else { + invoke(std::integral_constant{}); + } +} + template __global__ void applyRepetitionPenalty(T* logits, const float penalty, diff --git a/src/turbomind/kernels/sampling_penalty_kernels.h b/src/turbomind/kernels/sampling_penalty_kernels.h index e12698cdf7..1f26b7d352 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.h +++ b/src/turbomind/kernels/sampling_penalty_kernels.h @@ -69,6 +69,14 @@ void invokeBatchApplyTemperaturePenalty(T* logits, const int vocab_size_padd, cudaStream_t stream); +void invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padd, + cudaStream_t stream); + template void invokeMinLengthPenalty(T* logits, const int* min_lengths, diff --git a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index 04ea0577d1..4d4cff464c 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -22,6 +22,7 @@ #include "3rdparty/cub/cub.cuh" #endif +#include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/utils/constant.h" @@ -216,9 +217,9 @@ void 
invokeTopPSort(TopPSortParams& params, cudaStream_t stream) size_t topp_id_val_buf_size = sizeof(int) * params.batch_size * params.vocab_size_padded; size_t begin_offset_buf_size = sizeof(int) * params.batch_size; size_t end_offset_buf_size = sizeof(int) * params.batch_size; - topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256) * 256; - begin_offset_buf_size = div_up(begin_offset_buf_size, 256) * 256; - end_offset_buf_size = div_up(end_offset_buf_size, 256) * 256; + topp_id_val_buf_size = cdiv(topp_id_val_buf_size, 256) * 256; + begin_offset_buf_size = cdiv(begin_offset_buf_size, 256) * 256; + end_offset_buf_size = cdiv(end_offset_buf_size, 256) * 256; if (params.workspace == nullptr) { size_t cub_temp_storage_size; @@ -236,7 +237,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) 0, // begin_bit sizeof(T) * 8, // end_bit = sizeof(KeyT) * 8 stream)); // cudaStream_t - cub_temp_storage_size = div_up(cub_temp_storage_size, 256) * 256; + cub_temp_storage_size = cdiv(cub_temp_storage_size, 256) * 256; params.workspace_size = topp_id_val_buf_size + begin_offset_buf_size + end_offset_buf_size + cub_temp_storage_size; return; diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc index b588d8b6f5..c458998031 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc @@ -178,7 +178,7 @@ void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* inpu // temperature { if (!ALL_OF(temperature_.begin(), batch_size, float, 1.f)) { - invokeBatchApplyTemperaturePenalty( + invokeBatchApplyTemperaturePenalty_v2( logits, (T*)nullptr, temperature_buf_, batch_size, args_.vocab_size, args_.vocab_size_padded, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 3c714bd234..6c297e3d56 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(Llama STATIC set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart + engine gemm2 rms_norm cublasMMWrapper diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index ea321d06a0..e37af1bb76 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1,19 +1,43 @@ // Copyright (c) OpenMMLab. All rights reserved. 
-#include "src/turbomind/models/llama/LlamaBatch.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/macro.h" + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" -#include "src/turbomind/macro.h" + #include "src/turbomind/models/llama/BlockManager.h" +#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/copy.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/constant.h" @@ -21,20 +45,6 @@ #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/nccl_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace turbomind { @@ -84,150 +94,92 @@ void DropEmbeddings(const Sequence& seq) } template -void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs) +void LlamaBatch::DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs) { - std::unordered_map occurrence; - - auto count_occurrence = [&occurrence](const Requests& rs) { - for (const auto& r : rs) { - ++occurrence[r->id]; - } - }; - - auto reject = [](const char* type, std::shared_ptr& req, int ec) { - TM_LOG_WARNING( - "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec); - req->signal.set_value(ec); - req.reset(); - }; - - auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) { - for (auto& r : rs) { - if (r) { - int ec = 0; + NvtxScope _("disable conflict"); - const int input_length = r->inputs.getVal("input_lengths", 0); - const auto get_offset = [&](int token_count) { - return std::max(0, std::min(token_count, r->inputs.getVal("step", token_count))); - }; + std::pmr::monotonic_buffer_resource mbr; + std::pmr::unordered_map occur(&mbr); - if (occurrence[r->id] != 1) { - ec = Request::kConflict; - } - else if (r->start_flag && r->stop_flag) { - ec = Request::kInvalid; - } - else if (input_length > session_len_) { - ec = Request::kTooLong; - } - else if (!r->start_flag) { - if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) { - ec = Request::kInvalid; - } - else if (get_offset(seq->tokens.size()) + input_length > session_len_) { - ec = Request::kTooLong; - } - } - - if (ec) { - reject(type, r, ec); - } - } + auto count = [&occur](const auto& reqs) { + for (const auto& r : reqs) { + ++occur[r->id]; } }; - auto drop_invalid = [](Requests& rs) { - int count = 0; - for (int i = 0; i < rs.size(); ++i) { - if (rs[i]) { - rs[count++] = std::move(rs[i]); + auto validate = [&occur](auto& reqs, const char* type) { + for (const auto& r : reqs) { + if (occur[r->id] > 1) { + TM_LOG_ERROR("Skip conflicting %s request for ID %lu", type, r->id); + r->ec 
= Request::kConflict; } } - rs.resize(count); }; - count_occurrence(stop_reqs); - count_occurrence(infer_reqs); - - if (!stop_reqs.empty()) { - handle_conflict_or_invalid(stop_reqs, "stop"); - - // invalidate stop-only requests for inactive sequences - for (auto& r : stop_reqs) { - if (r && r->end_flag == false) { - int ec = Request::kInactive; - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - break; - } - } - if (ec) { - reject("stop", r, ec); - } - } + for (int i = 0; i < state_->size; ++i) { + if (state_->requests[i]) { + ++occur[state_->requests[i]->id]; } - - drop_invalid(stop_reqs); } - if (!infer_reqs.empty()) { - handle_conflict_or_invalid(infer_reqs, "infer"); + count(kill_reqs); + count(infer_reqs); - // invalidate requests for busy sequences - for (auto& r : infer_reqs) { - if (r) { - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - reject("infer", r, Request::kBusy); - break; - } - } - } - } + validate(kill_reqs, "kill"); + validate(infer_reqs, "infer"); +} - drop_invalid(infer_reqs); +template +void LlamaBatch::BroadcastCancelFlags() +{ + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->cancel_flag.load(std::memory_order_acquire) == -1) { + r->is_canceled = true; + } } } -template -auto LlamaBatch::ProcessStopRequests(const Requests& requests) -> std::vector +template +void LlamaBatch::ProcessCancelRequests(std::vector& signals) { - NvtxScope scope("stop_request"); - std::vector signals; - int count = 0; - for (const auto& r : requests) { - int ec = Request::kFail; - // find matching active sequence - for (int i = 0; i < state_->size; ++i) { - // stop & optionally erase active sequence - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - signals.push_back(Interrupt(i, true, r->end_flag)); - ++count; - break; - } - } - // mismatch, try erase inactive sequence, in this case there is no active request to interrupt - if (ec && r->end_flag) { - if (sequence_manager_->Erase(r->id)) { - ec = 0; - } + int count = 0; + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->is_canceled) { + ++count; + signals.push_back(Interrupt(i, true)); + // Interrupt should reset r + FT_CHECK(!r); } - signals.push_back([=] { - if (rank_ == 0) { - r->signal.set_value(ec); - } - }); } if (count) { check_cuda_error(cudaStreamSynchronize(stream_)); } - return signals; +} + +template +void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) +{ + for (auto& r : kill_reqs) { + if (r) { + int ec = r->ec; + if (!ec) { + if (!sequence_manager_->Erase(r->id)) { + ec = Request::kInvalid; + } + } + signals.push_back([=] { + if (r->end_cb) { + r->end_cb(ec); + } + }); + } + } } template -void LlamaBatch::ProcessInferRequests(const Requests& requests) +void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { NvtxScope scope("infer_request"); auto& state = *incoming_; @@ -238,58 +190,90 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) std::vector existing_idx; int idx = 0; - for (const auto& r : requests) { - FT_CHECK(!state.requests[idx]); + for (const auto& r : reqs) { if (rank_ == 0) { TM_LOG_INFO("[ProcessInferRequests] Request for %ld received.", (long)r->id); } - state.requests[idx] = r; + if (r->ec) { + signals.push_back([r] { UpdateState(*r, r->ec, 0); }); + continue; + } - // get sequence for the 
request - state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); - FT_CHECK(state.sequences[idx]); + const int input_length = r->inputs.at("input_ids").shape[0]; - auto& seq = *state.sequences[idx]; + if (input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; + } - if (int step = r->inputs.getVal("step", -1); step >= 0) { - if (step <= seq.tokens.size()) { - seq.tokens.resize(step); - seq.cache_len = std::min(seq.cache_len, step); - DropEmbeddings(seq); + auto ptr = r->session.start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + if (!ptr) { + signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); + continue; + } + + const int step = [&] { + int s = r->session.step; + if (s < 0) { + s = ptr->tokens.size(); } - else if (rank_ == 0) { - TM_LOG_WARNING( - "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id); + else if (s > ptr->tokens.size()) { + if (rank_ == 0) { + TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); + } + s = ptr->tokens.size(); } + return s; + }(); + + if (step + input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; } - const int input_length = r->inputs.getVal("input_lengths"); - const int* input_ids = r->inputs.getPtr("input_ids"); + FT_CHECK(!state.requests[idx]); + + state.requests[idx] = r; + state.sequences[idx] = ptr; + + auto& seq = *state.sequences[idx]; + + if (step < seq.tokens.size()) { + // resize sequence tokens to match step + seq.tokens.resize(step); + seq.cache_len = std::min(seq.cache_len, step); + DropEmbeddings(seq); + } + + const int* input_ids = r->inputs.getPtr("input_ids"); { // `output_ids` contains all token ids of the sequences const auto output_ids_base = state.output_ids + session_len_ * idx; - auto output_ids = output_ids_base; + auto d_output_ids = output_ids_base; + auto h_output_ids = r->output_ids.getPtr(); // copy history tokens if (!seq.tokens.empty()) { - output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids); + d_output_ids = Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); + h_output_ids = std::copy_n(seq.tokens.data(), seq.tokens.size(), h_output_ids); } // copy input tokens if (input_length) { - output_ids = Copy(input_ids, input_length, output_ids); + d_output_ids = Copy(input_ids, input_length, d_output_ids); + h_output_ids = std::copy_n(input_ids, input_length, h_output_ids); } // total context length (history + input) - state.h_prompt_length[idx] = output_ids - output_ids_base; - state.h_context_length[idx] = output_ids - output_ids_base; + state.h_prompt_length[idx] = d_output_ids - output_ids_base; + state.h_context_length[idx] = d_output_ids - output_ids_base; state.h_finished[idx] = false; } // copy input tokens to prompt for prefix matching - if (input_length && r->start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length && r->session.start_flag && !r->inputs.isExist("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); @@ -348,8 +332,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int request_output_len = state.requests[idx]->inputs.getVal("request_output_len"); - state.seq_len_limit[idx] = state.h_context_length[idx] + request_output_len; 
+ const int max_new_tokens = state.requests[idx]->gen_cfg.max_new_tokens; + state.seq_len_limit[idx] = state.h_context_length[idx] + max_new_tokens; // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must truncated to session_len - 1 if (state.seq_len_limit[idx] >= session_len_) { @@ -357,17 +341,17 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (rank_ == 0) { const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx]; TM_LOG_WARNING( - "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d", + "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `max_new_tokens` is truncated to %d", (long)seq.id, state.h_context_length[idx], - request_output_len, + max_new_tokens, (int)session_len_, trunc_output_len); } } // compute rope scaling factor - if (r->start_flag) { + if (r->session.start_flag) { seq.rope_theta = model_->attn_param_.rotary_embedding_base; if (model_->attn_param_.use_dynamic_ntk) { auto scaling_factor = model_->attn_param_.rope_scaling_factor; @@ -388,9 +372,9 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } state.h_rope_theta[idx] = seq.rope_theta; - if (r->start_flag) { + if (r->session.start_flag) { // prepare to initialize random state for new sequence - h_random_seed_[idx] = r->inputs.getVal("random_seed", 0); + h_random_seed_[idx] = r->gen_cfg.random_seed; } else { // Recover device states if not a new sequence @@ -799,12 +783,6 @@ void LlamaBatch::AllocatePersistantBuffer(size_t max_batch_size, int cache_bl sampling_params_ = { {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_}, {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_}, - {"min_length", (std::byte*)h_min_length_, nullptr}, - {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr}, - {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr}, - {"runtime_min_p", (std::byte*)h_runtime_min_p_, nullptr}, - {"temperature", (std::byte*)h_temperature_, nullptr}, - {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr}, }; for (auto& s : states_) { @@ -941,19 +919,9 @@ template LlamaBatch::~LlamaBatch() { TM_LOG_DEBUG("~LlamaBatch()"); - shared_state_->request_queue.close(); internal_thread_.join(); - if (output_thread_.joinable()) { - { - std::lock_guard lock{output_mutex_}; - output_stop_token_ = true; - } - output_cv_.notify_one(); - output_thread_.join(); - } - // The dtor maybe called from unknown thread, set device id before CUDA calls check_cuda_error(cudaSetDevice(device_id_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -970,8 +938,10 @@ LlamaBatch::LlamaBatch(const EngineParam& param, std::unique_ptr> model, // ! This is moved std::unique_ptr> ctx, // ! 
This is moved std::shared_ptr state, + std::shared_ptr gateway, int device_id): param_(param), + gateway_(gateway), shared_state_(state), max_batch_size_(param.max_batch_size), max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), @@ -1068,7 +1038,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); Clear(token_ids_buf_, batch_size * session_len_); - invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_); + invokeTranspose2D(token_ids_buf_, state_->output_ids, batch_size, session_len_, stream_); sync_check_cuda_error(); // token_ids_buf_[s, b] @@ -1087,6 +1057,27 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) Copy(h_seq_limit_len_, batch_size, seq_limit_len_); TensorMap inputs; + + auto member_to_tensor = [&](auto getter, auto key, auto dest, auto init) { + int count = 0; + for (int i = 0; i < batch_size; ++i) { + // `std::invoke` + dest[i] = state_->requests[i]->gen_cfg.*getter; + count += dest[i] != init; + } + if (count) { + inputs.insert(key, {MEMORY_CPU, getTensorType(), {(size_t)batch_size}, dest}); + } + }; + + using G = GenerationConfig; + member_to_tensor(&G::top_k, "runtime_top_k", h_runtime_top_k_, 0); + member_to_tensor(&G::top_p, "runtime_top_p", h_runtime_top_p_, 0); + member_to_tensor(&G::min_p, "runtime_min_p", h_runtime_min_p_, 0); + member_to_tensor(&G::temperature, "temperature", h_temperature_, 0.f); + member_to_tensor(&G::repetition_penalty, "repetition_penalty", h_repetition_penalty_, 1.f); + member_to_tensor(&G::min_new_tokens, "min_length", h_min_length_, 0); + for (const auto& [name, h_ptr, d_ptr] : sampling_params_) { // find an exemplar that matches the param name const Tensor* ptr{}; @@ -1173,7 +1164,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) TensorMap outputs; for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i]->gen_cfg.output_logprobs) { outputs.insert( {"sampled_logprobs", {MEMORY_GPU, TYPE_FP32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); outputs.insert( @@ -1187,89 +1178,157 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); } -template -void LlamaBatch::OutputContextLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences) +template +void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last) { - std::vector output_logits; - int num_token = 0; - { - bool is_return_logits = false; - for (int k = 0; k < indices.size(); ++k) { - auto& request = state_->requests[indices[k]]; - auto logits = request->outputs.getPtr("logits", nullptr); - if (logits && sequences[k]->cache_len + lengths[k] <= sequences[k]->tokens.size()) { - logits = nullptr; - } - output_logits.push_back(logits); - num_token += lengths[k]; - if (output_logits.back()) { - is_return_logits = true; + int token_num = 0; + bool found = false; + for (int i = first; i < last; ++i) { + if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { + const auto& s = *state_->sequences[i]; + // Skip when the seq is filling missed cache only + if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { + found = true; } } - if (!is_return_logits) { - return; - } + token_num += h_input_length_buf_[i]; } - { - context_logits_buf_ = (float*)allocator_->reMalloc( - context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, 
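// --- [editorial sketch, not part of the patch] ------------------------------------
// `member_to_tensor` above relies on pointer-to-data-member syntax: `gen_cfg.*getter`
// reads the field selected by a `&GenerationConfig::field` handle passed at the call
// site. The same mechanism in isolation, with a hypothetical `Config` type standing
// in for GenerationConfig:

#include <vector>

struct Config {
    int   top_k{0};
    float temperature{0.f};
};

template<class T>
int GatherIfSet(const std::vector<Config>& cfgs, T Config::*field, T init, T* dest)
{
    int count = 0;
    for (const auto& c : cfgs) {
        *dest = c.*field;            // dereference the member pointer on this element
        count += (*dest++ != init);  // count values that differ from the "unset" default
    }
    return count;                    // caller only builds a tensor when count > 0
}

// usage: int n = GatherIfSet(cfgs, &Config::top_k, 0, top_k_buf);
// --- [end sketch] -------------------------------------------------------------------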
false); - const auto tp = model_->tensor_para_.world_size_; - if (tp > 1) { - NcclGuard guard(model_->tensor_para_, stream_, true); - FT_CHECK(model_->vocab_size_padded_ % tp == 0); - const auto local_vocab_size = model_->vocab_size_padded_ / tp; - local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( - local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, false); - } + if (!found) { + return; } - model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, context_decoder_output, num_token); + context_logits_buf_ = (float*)allocator_->reMalloc( + context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + const auto tp = model_->tensor_para_.world_size_; + + if (tp > 1) { + NcclGuard guard(model_->tensor_para_, stream_, true); + FT_CHECK(model_->vocab_size_padded_ % tp == 0); + const auto local_vocab_size = model_->vocab_size_padded_ / tp; + local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( + local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + } - auto logits = context_logits_buf_; + model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, hidden_states, token_num); - // Only rank-0 writes to output if (rank_ != 0) { return; } - for (int k = 0; k < indices.size(); ++k) { - if (output_logits[k]) { - auto src_ptr = logits; - auto dst_ptr = output_logits[k]; - int num_new_token = 0; - if (sequences[k]->cache_len < sequences[k]->tokens.size()) { - num_new_token = sequences[k]->cache_len + lengths[k] - sequences[k]->tokens.size(); - src_ptr += (lengths[k] - num_new_token) * model_->vocab_size_padded_; - } - else { - num_new_token = lengths[k]; - dst_ptr += (sequences[k]->cache_len - sequences[k]->tokens.size()) * model_->vocab_size_; + OutputLogits(context_logits_buf_, first, last, GenerationConfig::kAll); +} + +template +void LlamaBatch::OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type) +{ + // when `is_all` is true, logits only contains last token of the sequences + const bool is_all = out_type == GenerationConfig::kAll; + + for (int i = first; i < last; ++i) { + + const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const float* src_ptr = logits; + + logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; + + if (state_->requests[i]->gen_cfg.output_logits == out_type) { + + auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); + + const int cache_len = state_->sequences[i]->cache_len; + const int history_len = state_->sequences[i]->tokens.size(); + + // ----------H------I-------P----------- + // C C C C + + // offset to the last token prompt + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + + int diff = (history_len + offset) - cache_len; + + const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + + // TM_LOG_ERROR("%d %d %d %d %d %d %d", + // history_len, + // offset, + // cache_len, + // input_len, + // valid_len, + // std::max(0, diff), + // std::max(0, -diff)); + + if (valid_len <= 0) { + continue; } - if (model_->vocab_size_padded_ == model_->vocab_size_) { - Copy(src_ptr, model_->vocab_size_ * num_new_token, dst_ptr); + + if (is_all) { + // Skip invalid tokens caused by cache miss + src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_; } - else { - for (int tok = 0; tok < num_new_token; tok++) { - Copy(src_ptr, model_->vocab_size_, dst_ptr); - src_ptr += model_->vocab_size_padded_; - dst_ptr += model_->vocab_size_; - } + // Skip previous chunks + dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_; + + check_cuda_error(cudaMemcpy2DAsync(dst_ptr, + sizeof(float) * model_->vocab_size_, + src_ptr, + sizeof(float) * model_->vocab_size_padded_, + sizeof(float) * model_->vocab_size_, + valid_len, + cudaMemcpyDefault, + stream_)); + } + } +} + +template +void LlamaBatch::OutputLastHiddenState(const T* hidden_states, int first, int last) +{ + for (int i = first; i < last; ++i) { + + const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const T* src_ptr = hidden_states; + + hidden_states += input_len * model_->hidden_units_; + + if (auto out_type = state_->requests[i]->gen_cfg.output_last_hidden_state) { + + const bool is_all = out_type == GenerationConfig::kAll; + + T* dst_ptr = state_->requests[i]->outputs.getPtr("last_hidden_state"); + + const int cache_len = state_->sequences[i]->cache_len; + const int history_len = state_->sequences[i]->tokens.size(); + + // offset to the last prompt token + const int offset = is_all ? 
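// --- [editorial sketch, not part of the patch] ------------------------------------
// The bookkeeping in OutputLogits / OutputLastHiddenState is interval arithmetic:
// rows of the current chunk that merely re-fill a missed KV cache carry no new
// output and are skipped on the source side, while rows already produced by earlier
// chunks advance the destination pointer. With hypothetical numbers
// history_len = 8, cache_len = 6 (two history tokens are being recomputed),
// offset = 0 (kAll) and input_len = 10, only 10 - (8 - 6) = 8 rows are copied out.

#include <algorithm>

inline int ValidRows(int input_len, int history_len, int offset, int cache_len)
{
    return input_len - std::max(0, (history_len + offset) - cache_len);
}
// e.g. ValidRows(10, 8, /*offset=*/0, 6) == 8
// --- [end sketch] -------------------------------------------------------------------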
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + + const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + + // TM_LOG_ERROR("%d %d %d %d %d", history_len, offset, cache_len, input_len, valid_len); + + if (valid_len <= 0) { + continue; } + + // Skip invalid tokens caused by cache miss + src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->hidden_units_; + // Skip previous chunks + dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->hidden_units_; + + Copy(src_ptr, valid_len * model_->hidden_units_, dst_ptr); } - logits += model_->vocab_size_padded_ * lengths[k]; } } template -auto LlamaBatch::Finish(GenerationState& g) -> std::vector +void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { NvtxScope scope("Finish"); const int batch_size = state_->active_size; + signals.reserve(batch_size); + if (batch_size - g.partial) { FT_CHECK(g.step >= 0); @@ -1285,13 +1344,22 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector sync_check_cuda_error(); } - Copy(state_->output_ids, batch_size * session_len_, h_output_ids_); + Copy(token_ids_buf_ + (g.step - 1) * (batch_size - g.partial), batch_size - g.partial, h_output_ids_); Copy(finished_buf_, batch_size, state_->h_finished); Copy(sequence_lengths_, batch_size, state_->h_context_length); - Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); - Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); - Copy(sampled_nums_, batch_size, h_sampled_nums_); + bool output_logprobs = false; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (state_->requests[i]->gen_cfg.output_logprobs) { + output_logprobs = true; + break; + } + } + if (output_logprobs) { + Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); + Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); + Copy(sampled_nums_, batch_size, h_sampled_nums_); + } check_cuda_error(cudaStreamSynchronize(stream_)); @@ -1302,13 +1370,14 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } // ! Only rank-0 writes to output - if (rank_ == 0) { + if (rank_ == 0 && output_logprobs) { + NvtxScope scope("logprobs"); // output logprobs, should be set before sequence_length float* sampled_logprobs_ptr = h_sampled_logprobs_; uint32_t* sampled_indexes_ptr = h_sampled_indexes_; uint32_t* sampled_nums_ptr = h_sampled_nums_; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i] && state_->requests[i]->gen_cfg.output_logprobs) { auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); @@ -1330,18 +1399,37 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector // ! 
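// --- [editorial sketch, not part of the patch] ------------------------------------
// Compared with the old path (copying `batch_size * session_len_` ids back to the
// host every step), Finish now fetches only the tokens sampled this iteration --
// one per active, non-partial sequence -- and rank-0 later appends each of them at
// position `count - 1` of the request's output buffer. Restating the index
// expression used above (the [step, batch] layout is taken from the
// `token_ids_buf_[s, b]` comment earlier in this file and is an assumption here):

inline const int* NewestTokens(const int* token_ids_buf, int step, int batch_size, int partial)
{
    return token_ids_buf + (step - 1) * (batch_size - partial);
}
// --- [end sketch] -------------------------------------------------------------------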
Only rank-0 writes to output if (rank_ == 0) { - // set output tokens ids and sequence length - int* output_ptr = h_output_ids_; - for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) { - auto output_ids = state_->requests[i]->outputs.getPtr("output_ids"); - auto output_len = state_->requests[i]->outputs.getPtr("sequence_length"); - const int count = state_->h_context_length[i]; - // TODO: sync history output tokens at when receiving the request and copy the last token here - std::copy(output_ptr, output_ptr + count, output_ids); - *output_len = count; + NvtxScope scope("output_ids"); + if constexpr (0) { + // set output tokens ids and sequence length + int* output_ptr = h_output_ids_; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + if (r->stream_output) { + output_ids[count - 1] = output_ptr[count - 1]; + *output_len = count; + } + else if (state_->h_finished[i]) { + std::copy(output_ptr, output_ptr + count, output_ids); + *output_len = count; + } + } + output_ptr += session_len_; + } + } + else { + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + output_ids[count - 1] = h_output_ids_[i]; + *output_len = count; + } } - output_ptr += session_len_; } } @@ -1362,48 +1450,53 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } } - std::vector signals; { - NvtxScope _("stream_and_completion_signal"); + NvtxScope _("count and sync"); + bool need_sync = false; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i]) { - if (state_->h_finished[i]) { - // Interrupt finished sequences and move the request handle into the signal closure - signals.push_back(Interrupt(i)); - ++g.finished_count; - } - else if (state_->requests[i]->stream_cb) { - // Create signals by copying the request handles for non-finished streaming requests - signals.push_back([this, r = state_->requests[i]] { - if (rank_ == 0) { - try { - r->stream_cb(&r->outputs.get()); - } - catch (const std::bad_function_call& e) { - TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); - } - catch (...) 
{ - TM_LOG_ERROR("Unknown exception invoking stream callback for (%s)", - std::to_string(r->id).c_str()); - } - } - }); + if (state_->h_finished[i]) { + ++g.finished_count; + if (!state_->requests[i]->session.end_flag) { + need_sync = true; } } } - if (g.finished_count) { - // synchronize for interrupted sequences - check_cuda_error(cudaStreamSynchronize(stream_)); + if (need_sync) { + // Release updates on request output buffers to all ranks (`Interrupt` will use it) + shared_state_->barrier->wait(); } } + { + NvtxScope _("stream_and_completion_signal"); + for (int i = 0; i < batch_size - g.partial; ++i) { + auto& r = state_->requests[i]; + if (state_->h_finished[i]) { + // Interrupt finished sequences and move the request handle into the signal closure + signals.push_back(Interrupt(i)); + // Interrupt should reset r + FT_CHECK(!r); + } + else if (r->stream_output && rank_ == 0) { + const auto seq_len = r->sequence_length.getVal(); + // Create signals by copying the request handles for non-finished streaming requests + signals.push_back([this, r, seq_len] { // + UpdateState(*r, Request::kOk, seq_len); + }); + } + } + } + + if (g.finished_count) { + // synchronize for interrupted sequences + check_cuda_error(cudaStreamSynchronize(stream_)); + } + if (g.partial) { const int i = batch_size - 1; // recover full context length of partial state_->h_context_length[i] = g.partial_context_legnth; } - - return signals; } template @@ -1424,7 +1517,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - if (state_->requests[index]->end_flag || force_end) { + if (state_->requests[index]->session.end_flag || force_end) { // Sequence is ending this round or a stop request is issued to end it FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); } @@ -1434,17 +1527,10 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // Update token IDs seq.tokens.resize(output_len); - const auto output_ids_data = [&] { - if (force_stop) { - // `h_output_ids_` is UNDEFINED at `ProcessStopRequests` - return state_->requests[index]->outputs.at("output_ids").getPtr(); - } - else { - // `h_output_ids_` just updated by `Finish`, but `outputs` is NOT synced atm - return h_output_ids_ + index * (size_t)session_len_; - } - }(); - std::copy_n(output_ids_data, output_len, seq.tokens.data()); + + // output_ids is updated & synced in `Finish` + const auto output_ids = state_->requests[index]->output_ids.getPtr(); + std::copy_n(output_ids, output_len, seq.tokens.data()); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); @@ -1457,13 +1543,12 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig state_->sequences[index] = nullptr; - auto ec = std::exchange(state_->errors[index], 0); + auto ec = std::exchange(state_->errors[index], Request::kOk); + const auto len = state_->requests[index]->sequence_length.getVal(); // move the request handle into the signal - return [this, ec, r = std::move(state_->requests[index])] { - if (rank_ == 0) { - r->signal.set_value(ec); - } + return [this, len, force_stop, r = std::move(state_->requests[index])] { // + UpdateState(*r, force_stop ? 
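// --- [editorial sketch, not part of the patch] ------------------------------------
// `Interrupt` and `Finish` hand back deferred "signals": std::function objects that
// own (or share) the request handle and perform the user-visible notification later,
// outside the rank barrier, via the gateway. The ownership pattern in isolation
// (names are hypothetical, not the actual Request/Gateway API):

#include <cstdint>
#include <functional>
#include <memory>

struct Req {
    std::uint64_t id;
};

using Signal = std::function<void()>;

inline Signal MakeCompletionSignal(std::shared_ptr<Req> r, int status)
{
    // moving the handle into the closure keeps the request alive until it is notified
    return [r = std::move(r), status] { /* notify_client(r->id, status); */ };
}

// usage: signals.push_back(MakeCompletionSignal(std::move(req), /*some status code*/ 0));
// --- [end sketch] -------------------------------------------------------------------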
Request::kCancel : Request::kFinish, len); }; } @@ -1476,33 +1561,30 @@ void LlamaBatch::InternalThreadEntry() // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(rank_, model_->vocab_size_padded_, model_->end_id_, max_batch_size_, stream_); - auto& request_queue = shared_state_->request_queue; - auto& infer_requests = shared_state_->infer_requests; - auto& stop_requests = shared_state_->stop_requests; + // auto& request_queue = shared_state_->request_queue; + auto& infer_reqs = shared_state_->infer_reqs; + auto& kill_reqs = shared_state_->kill_reqs; GenerationState g{}; - constexpr int request_interval = 1; - long request_counter = 0; - while (1) { + if (rank_ == 0) { - const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; - const bool is_empty = (free_slot_count == max_batch_size_); - stop_requests.clear(); - infer_requests.clear(); - if (is_empty || request_counter % request_interval == 0) { + { + NvtxScope _("pop"); + const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; + const bool is_empty = (free_slot_count == max_batch_size_); // Block if batch is empty - request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state_->abort); - if (!shared_state_->abort) { - RejectInvalidRequests(stop_requests, infer_requests); - } + gateway_->pop(infer_reqs, kill_reqs, free_slot_count, is_empty, shared_state_->abort); } + // Mark reqs to the same session_id as invalid (which are dangerous to the engine) + DisableConflictRequests(infer_reqs, kill_reqs); } NvtxScope scope("mainloop"); - // wait while rank-0 is dequeueing + // 1. Wait while rank-0 is dequeueing + // 2. Broadcast `ec` from rank-0 shared_state_->barrier->wait(); if (shared_state_->abort) { @@ -1510,90 +1592,58 @@ void LlamaBatch::InternalThreadEntry() return; } - auto signals = ProcessStopRequests(stop_requests); + std::vector signals; + + ProcessKillRequests(kill_reqs, signals); // Shared `priority` field will be assigned by rank-0 - ProcessInferRequests(infer_requests); + ProcessInferRequests(infer_reqs, signals); + + // is_canceled <- cancel_flag.load() + if (rank_ == 0) { + BroadcastCancelFlags(); + } - // Wait while shared `requests` is being used + // 1. Wait while shared `requests` is being used + // 2. 
Broadcast modifcations from rank-0 shared_state_->barrier->wait(); - SendSignals(std::move(signals)); + ProcessCancelRequests(signals); + + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } Initialize(g); if (state_->active_size) { // - (void)Forward(g); - // - if (auto signals = Finish(g); !signals.empty()) { - if (g.finished_count) { - // Finished requests and corresponding output tensors will be released when notified - // wait for all ranks to ensure no rank (except for output thread) will access related - // resources - shared_state_->barrier->wait(); - } - SendSignals(std::move(signals)); + Forward(g); + + Finish(g, signals); + + if (g.finished_count) { + // Finished requests and corresponding output tensors will be released when notified + // wait for all ranks to ensure no rank (except for output thread) will access related + // resources + shared_state_->barrier->wait(); } - } - ++request_counter; + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } + } } + // Unreachable FT_CHECK(0); } -template -void LlamaBatch::SendSignals(std::vector signals) -{ - if (rank_ != 0 || signals.empty()) { - return; - } - { - std::lock_guard lock{output_mutex_}; - output_signals_.insert(output_signals_.end(), // - std::move_iterator{signals.begin()}, - std::move_iterator{signals.end()}); - } - output_cv_.notify_one(); -} - template void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this); - if (rank_ == 0) { - output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this); - } -} - -template -void LlamaBatch::OutputThreadEntry() -{ - while (true) { - std::vector signals; - { - // Wait for signals to come - std::unique_lock lock(output_mutex_); - output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; }); - if (output_stop_token_) { - TM_LOG_INFO("[OutputThreadEntry] stop requested."); - return; - } - signals = std::move(output_signals_); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(1); - } - // invoke stream cbs & signals - for (const auto& s : signals) { - s(); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(0); - } - } } template @@ -1662,20 +1712,11 @@ bool LlamaBatch::Forward(GenerationState& g) const int last = offsets[p + 1]; const int mini_batch_size = last - first; int* input_ids = context_decoder_ids_buf_; - // - std::vector decode_indices{}; - std::vector decode_lengths{}; - - std::vector sequences; BatchedCopy batched_copy; int sum_k = 0; for (int i = first; i < last; ++i) { input_ids = batched_copy.Add(input_d_ptrs[i], h_input_length_buf_[i], input_ids); - dbg(i, h_input_length_buf_[i]); - decode_indices.push_back(i); - decode_lengths.push_back(h_input_length_buf_[i]); - sequences.push_back(state_->sequences[i]); if (h_input_length_buf_[i] > 1) { sum_k += state_->h_context_length[i]; } @@ -1717,20 +1758,10 @@ bool LlamaBatch::Forward(GenerationState& g) dc_batch_size, pf_batch_size, lora_mask_buf_, - sequences.data()); - - // compute logits of inputs if requested - OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); - } - - std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + state_->sequences.data() + first); - // `SequenceManager` needs real-time value of cache length - for (int i = 0; i < active_size; ++i) { - if (state_->requests[i]) { - FT_CHECK(state_->sequences[i]); - state_->sequences[i]->cache_len += state_->sequences[i]->input_length; - } + 
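// --- [editorial sketch, not part of the patch] ------------------------------------
// The BatchedCopy loop above packs the variable-length inputs of a mini-batch back
// to back into one flat buffer (`context_decoder_ids_buf_`). Conceptually it does
// no more than this (the real BatchedCopy batches the device copies):

#include <algorithm>
#include <cstddef>
#include <vector>

inline int* PackInputs(const std::vector<const int*>& srcs, const std::vector<int>& lens, int* dst)
{
    for (std::size_t i = 0; i < srcs.size(); ++i) {
        dst = std::copy_n(srcs[i], lens[i], dst);  // returns one past the end, ready for the next chunk
    }
    return dst;  // final position == start + sum(lens)
}
// --- [end sketch] -------------------------------------------------------------------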
ComputeAndOutputLogits(context_decoder_output_buf_, first, last); + OutputLastHiddenState(context_decoder_output_buf_, first, last); } if (active_size > g.partial) { @@ -1738,12 +1769,10 @@ bool LlamaBatch::Forward(GenerationState& g) AnomalyHandler::instance().FixLogits(logits_buf_, active_size - g.partial, 1); - // count_and_fix(logits_buf_, (active_size - g.partial) * model_->vocab_size_padded_, "logits", 1); + OutputLogits(logits_buf_, 0, active_size - g.partial, GenerationConfig::kGeneration); FT_CHECK(g.step >= 0); - // TM_LOG_INFO("dyn decode bsz %d, partial %d", active_size, g.partial); - if (!g.skip_init_sampling) { InitializeSampling(g); } @@ -1767,6 +1796,15 @@ bool LlamaBatch::Forward(GenerationState& g) active_size - g.partial); } + std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + + // `SequenceManager` needs real-time value of cache length + for (int i = 0; i < active_size; ++i) { + FT_CHECK((bool)state_->requests[i]); + FT_CHECK(state_->sequences[i]); + state_->sequences[i]->cache_len += state_->sequences[i]->input_length; + } + AnomalyHandler::instance().Summarize([&](const int* is_anomaly, int batch_size) { for (int i = 0; i < batch_size; ++i) { if (is_anomaly[i]) { @@ -1800,101 +1838,6 @@ bool LlamaBatch::Forward(GenerationState& g) return true; } -static inline Tensor slice(const Tensor& tensor, int index) -{ - auto shape = tensor.shape; - if (shape.at(0) == 1) { - return tensor; - } - shape[0] = 1; - const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{}); - return tensor.slice(shape, offset); -} - -// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors -static inline TensorMap slice(const std::unordered_map& src, int index) -{ - TensorMap dst; - for (const auto& kv : src) { - dst.insert({kv.first, slice(kv.second, index)}); - } - return dst; -} - -template -void LlamaBatch::Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control) -{ - if (debug_) { - for (const auto& kv : *inputs) { - TM_LOG_INFO("[Submit] INPUT: %s", format(kv).c_str()); - } - for (const auto& kv : *outputs) { - TM_LOG_INFO("[Submit] OUTPUT: %s", format(kv).c_str()); - } - } - - const int batch_size = outputs->at("output_ids").shape[0]; - - std::vector> requests(batch_size); - - // allocates all requests for the batch - for (int i = 0; i < batch_size; ++i) { - requests[i] = std::make_shared(); - } - - for (int i = 0; i < batch_size; ++i) { - auto& r = requests[i]; - - r->inputs = slice(*inputs, i); - r->outputs = slice(*outputs, i); - - r->id = r->inputs.getVal("CORRID", i); - r->start_flag = r->inputs.getVal("START", 1); - r->end_flag = r->inputs.getVal("END", 1); - r->stop_flag = r->inputs.getVal("STOP", 0); - r->stream_cb = control.callback; - } - - // Submits the tasks and wait for finish - std::vector error_codes; - bool has_error = 0; - - TM_LOG_INFO("[forward] Enqueue requests"); - - std::vector ids; - for (const auto& r : requests) { - ids.push_back(r->id); - } - - auto futures = shared_state_->request_queue.enqueue(std::move(requests)); - - FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); - - TM_LOG_INFO("[forward] Wait for requests to complete ..."); - - for (int i = 0; i < futures.size(); ++i) { - auto ec = futures[i].get(); - error_codes.push_back(ec); - if (ec) { - has_error = true; - TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); - } - else { - TM_LOG_INFO("[forward] Request completed for %ld", 
(long)ids[i]); - } - } - - if (has_error) { - std::stringstream ss; - for (int i = 0; i < error_codes.size(); ++i) { - ss << (i ? "" : " ") << error_codes[i]; - } - throw std::runtime_error(ss.str()); - } -} - namespace { template diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index f952da6bae..44a20bebe1 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -2,38 +2,31 @@ #pragma once +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/models/llama/LlamaNcclGuard.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" + #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include - -using ffi_api_lock_ctrl_t = std::function; namespace turbomind { struct SharedState { - std::vector> infer_requests; - std::vector> stop_requests; - RequestQueue request_queue; + std::vector> infer_reqs; + std::vector> kill_reqs; std::shared_ptr barrier; bool abort; std::atomic free_size{std::numeric_limits::max()}; }; -struct Control { - Request::Callback callback; -}; - struct BatchState { int* h_prompt_length; // history + input, ignore generated int* h_context_length; @@ -87,11 +80,11 @@ class LlamaBatch { using Requests = std::vector>; using Signal = std::function; - void RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs); + void DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs); - [[nodiscard]] auto ProcessStopRequests(const Requests& requests) -> std::vector; + void ProcessKillRequests(const Requests& reqs, std::vector& signals); - void ProcessInferRequests(const Requests& requests); + void ProcessInferRequests(const Requests& reqs, std::vector& signals); int AdjustMaxInputCount(GenerationState& g, const std::vector& sequences, @@ -101,36 +94,29 @@ class LlamaBatch { void InitializeSampling(const GenerationState& g); - [[nodiscard]] bool Forward(GenerationState& g); + bool Forward(GenerationState& g); - [[nodiscard]] auto Finish(GenerationState& g) -> std::vector; + void Finish(GenerationState& g, std::vector& signals); [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void OutputContextLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences); + void ComputeAndOutputLogits(T* hidden_states, int first, int last); + + void OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type); + + void OutputLastHiddenState(const T* hidden_states, int first, int last); explicit LlamaBatch(const EngineParam& param, std::unique_ptr> model, std::unique_ptr> ctx, std::shared_ptr state, + std::shared_ptr gateway, int device_id); ~LlamaBatch(); void Start(); - void Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control); - - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - LlamaV2& model() noexcept { return *model_; @@ -144,14 +130,16 @@ class LlamaBatch { void tune(); private: + void BroadcastCancelFlags(); + + void ProcessCancelRequests(std::vector& signals); + 
void InternalThreadEntry(); void OutputThreadEntry(); void CopyState(const std::vector>& desc); - void SendSignals(std::vector signals); - // analogs to `std::copy_n` template U* Copy(const U* src, size_t count, U* dst) @@ -211,6 +199,7 @@ class LlamaBatch { private: const EngineParam param_; + const std::shared_ptr gateway_; const std::shared_ptr shared_state_; const int max_batch_size_; @@ -323,14 +312,6 @@ class LlamaBatch { std::thread internal_thread_; - // async stream callback utils - std::thread output_thread_; - std::mutex output_mutex_; - std::condition_variable output_cv_; - std::vector output_signals_; - bool output_stop_token_{false}; - ffi_api_lock_ctrl_t ffi_lock_; - int* h_output_ids_{}; }; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 05b22deed5..b25a8eac60 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -20,35 +20,30 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/kernels/attention/attention_params.h" -#include "src/turbomind/kernels/decoding_kernels.h" -#include "src/turbomind/kernels/gemm/tuner/params.h" -#include "src/turbomind/kernels/gpt_kernels.h" +#include +#include + #include "src/turbomind/macro.h" + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/unified_decoder.h" + +#include "src/turbomind/kernels/decoding_kernels.h" +#include "src/turbomind/kernels/gpt_kernels.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" -#include "src/turbomind/utils/monotonic.h" -#include -#include -#include -#include -#include -#include -#include namespace turbomind { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index a0d35b887f..8101310812 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,14 +21,9 @@ #pragma once -#include -#include - #include "src/turbomind/layers/DynamicDecodeLayer.h" -#include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_decoder.h" diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h deleted file mode 100644 index 2a715e9c9c..0000000000 --- a/src/turbomind/models/llama/Request.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
- -#pragma once - -#include "src/turbomind/utils/Tensor.h" -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -struct Request { - uint64_t id; // sequence id - uint64_t unique_id; // monotonic increasing - - bool start_flag; - bool end_flag; - bool stop_flag; - - // per rank inputs/outputs - TensorMap inputs; - TensorMap outputs; - - using Callback = std::function*)>; - Callback stream_cb; - - enum - { - kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set - kConflict = 2, // Concurrent requests to the same sequence - kBusy = 3, // Sequence is already running - kInactive = 4, // Sequence to `stop` is not active - kFail = 5, // Can't find sequence for `stop` request or internal error during inference - kTooLong = 6 // history + prompt > session_len - }; - - std::promise signal; -}; - -class RequestQueue { -public: - std::vector> enqueue(std::vector> requests) - { - std::vector> futures; - futures.reserve(requests.size()); - { - std::lock_guard lock(mutex_); - - if (closed_) { - throw std::runtime_error("Queue is closed"); - } - - for (auto& r : requests) { - futures.push_back(r->signal.get_future()); - if (r->stop_flag) { - stop_queue_.push(std::move(r)); - } - else { - infer_queue_.push(std::move(r)); - } - } - } - cv_.notify_one(); - return futures; - } - - void dequeue(std::vector>& stop_requests, - std::vector>& infer_requests, - unsigned max_infer_count, - bool blocking, - bool& abort) - { - std::unique_lock lock(mutex_); - if (blocking) { - cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; }); - if (closed_) { - abort = true; - return; - } - } - - stop_requests.clear(); - while (!stop_queue_.empty()) { - stop_requests.push_back(std::move(stop_queue_.front())); - stop_queue_.pop(); - } - - infer_requests.clear(); - while (!infer_queue_.empty() && infer_requests.size() < max_infer_count) { - infer_requests.push_back(std::move(infer_queue_.front())); - infer_queue_.pop(); - } - } - - void close() - { - { - std::lock_guard lock(mutex_); - closed_ = true; - } - cv_.notify_all(); - } - -private: - std::queue> stop_queue_; - std::queue> infer_queue_; - std::mutex mutex_; - std::condition_variable cv_; - bool closed_{false}; -}; - -} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 71792a4be8..042e858b09 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,6 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. 
#include +#include #include #include @@ -11,6 +12,7 @@ #include #include +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" @@ -22,18 +24,19 @@ namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; +using ft::ManagedTensor; +using ft::Tensor; + // prepare to bind container -using TensorVector = std::vector; -PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(ft::Tensor& tensor) +DLDevice getDLDevice(const ft::Tensor& tensor) { int device_id = 0; if (tensor.where == ft::MEMORY_GPU) { - cudaPointerAttributes ptr_attr; + cudaPointerAttributes ptr_attr{}; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; } @@ -57,12 +60,12 @@ DLDevice getDLDevice(ft::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ManagedTensor& tensor) { - DLDevice device = getDLDevice(tensor); + DLDevice device = getDLDevice(*tensor); DLDataType data_type{0, 0, 1}; - switch (tensor.type) { + switch (tensor->type) { case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; @@ -119,14 +122,26 @@ DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) default: break; } - DLTensor dl_tensor{const_cast(tensor.data), + ManagedTensor* ctx = new ManagedTensor(tensor); + DLTensor dl_tensor{const_cast((*ctx)->data), device, - (int32_t)(tensor.shape.size()), + (int32_t)((*ctx)->shape.size()), data_type, - reinterpret_cast(const_cast(tensor.shape.data())), + reinterpret_cast(const_cast((*ctx)->shape.data())), (int64_t*)(nullptr), 0}; - return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; + return new DLManagedTensor{dl_tensor, ctx, [](DLManagedTensor* dlmt) { // + // auto& x = *(ManagedTensor*)dlmt->manager_ctx; + // std::stringstream ss; + // ss << "("; + // for (const auto& d : x->shape) { + // ss << d << ","; + // } + // ss << ")"; + // std::cerr << "turbomind tensor dtor " << ss.str() << " " << std::endl; + delete (ManagedTensor*)dlmt->manager_ctx; + delete dlmt; + }}; } ft::MemoryType getMemoryType(DLDevice device) @@ -200,7 +215,7 @@ ft::DataType getDataType(DLDataType data_type) } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -209,14 +224,15 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tenso std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); -} - -DLTensor GetDLTensor(py::object obj) -{ - py::capsule cap = obj.attr("__dlpack__")(); - DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); - return dlmt->dl_tensor; + auto ret = std::make_shared(); + ret->tensor = Tensor(where, dtype, std::move(shape), data); + ret->data_holder.reset((void*)nullptr, [tensor](void*) { + // std::cerr << "dlpack tensor dtor" << std::endl; + if (tensor->deleter) { + tensor->deleter(tensor); + } + }); + return ret; } static void 
safe_memcpy(void* dst, const void* src, size_t size) @@ -264,6 +280,26 @@ static void safe_memcpy(void* dst, const void* src, size_t size) } } +namespace { + +struct ScopedGIL { + ScopedGIL(const ScopedGIL&) = delete; + ScopedGIL& operator=(const ScopedGIL&) = delete; + ScopedGIL(ScopedGIL&&) = delete; + ScopedGIL& operator=(ScopedGIL&&) = delete; + ScopedGIL() + { + state = PyGILState_Ensure(); + } + ~ScopedGIL() + { + PyGILState_Release(state); + } + PyGILState_STATE state; +}; + +} // namespace + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -272,7 +308,54 @@ PYBIND11_MODULE(_turbomind, m) .def("__str__", &ft::NcclParam::toString); // custom comm - py::class_>(m, "AbstractCustomComm"); + (void)py::class_>(m, "AbstractCustomComm"); + + py::class_(m, "SessionParam") + .def(py::init([](uint64_t id, int step, bool start, bool end) { + if (!start && end) { + throw std::logic_error("unsupported arguments: start=false, end=true"); + } + ft::SessionParam param{}; + param.id = id; + param.step = step; + param.start_flag = start; + param.end_flag = end; + return param; + }), + "id"_a, + "step"_a, + "start"_a, + "end"_a) + .def_readwrite("id", &ft::SessionParam::id) + .def_readwrite("step", &ft::SessionParam::step) + .def_readwrite("start", &ft::SessionParam::start_flag) + .def_readwrite("end", &ft::SessionParam::end_flag); + + py::class_(m, "GenerationConfig") + .def(py::init()) + .def_readwrite("max_new_tokens", &ft::GenerationConfig::max_new_tokens) + .def_readwrite("min_new_tokens", &ft::GenerationConfig::min_new_tokens) + .def_readwrite("top_p", &ft::GenerationConfig::top_p) + .def_readwrite("top_k", &ft::GenerationConfig::top_k) + .def_readwrite("min_p", &ft::GenerationConfig::min_p) + .def_readwrite("temperature", &ft::GenerationConfig::temperature) + .def_readwrite("repetition_penalty", &ft::GenerationConfig::repetition_penalty) + .def_readwrite("random_seed", &ft::GenerationConfig::random_seed) + .def_readwrite("output_logprobs", &ft::GenerationConfig::output_logprobs) + .def_readwrite("output_last_hidden_state", &ft::GenerationConfig::output_last_hidden_state) + .def_readwrite("output_logits", &ft::GenerationConfig::output_logits) + .def("__repr__", [](const ft::GenerationConfig& c) { + std::ostringstream oss; + oss << c; + return oss.str(); + }); + + py::class_>(m, "RequestState") + .def_readonly("status", &ft::RequestState::status) + .def_readonly("seq_len", &ft::RequestState::seq_len); + + py::class_>(m, "AtomicRequestState") + .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); // data type py::enum_(m, "DataType") @@ -299,45 +382,46 @@ PYBIND11_MODULE(_turbomind, m) .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &ft::Tensor::where) - .def_readonly("type", &ft::Tensor::type) - .def_readonly("shape", &ft::Tensor::shape) - .def_readonly("data", &ft::Tensor::data) - .def(py::init( - [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { - auto data_ptr = reinterpret_cast(data); - return new ft::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_property_readonly("where", [](const ManagedTensor& t) { return t->where; }) + .def_property_readonly("type", [](const ManagedTensor& t) { return t->type; }) + .def_property_readonly("shape", [](const ManagedTensor& t) { return t->shape; }) + .def_property_readonly("data", [](const ManagedTensor& t) { return t->data; }) .def( "view", - [](ft::Tensor* self, ft::DataType new_type) { 
- return new ft::Tensor(self->where, new_type, self->shape, self->data); + [](const ManagedTensor& self, ft::DataType new_type) { + auto x = self; + x->type = new_type; + return std::make_shared(std::move(x)); }, "new_type"_a) .def( "view", - [](ft::Tensor* self, std::vector new_shape) { - return new ft::Tensor(self->where, self->type, new_shape, self->data); + [](const ManagedTensor& self, std::vector new_shape) { + auto x = self; + x->shape = new_shape; + return std::make_shared(std::move(x)); }, "new_shape"_a) .def( "copy_from", - [](ft::Tensor* self, py::object obj) { + [](ManagedTensor& self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); + // take ownership of capsule's payload + cap.set_name("used_dltensor"); switch (self->type) { case ft::TYPE_FP16: case ft::TYPE_FP32: case ft::TYPE_INT32: case ft::TYPE_BF16: { - auto num_element = - std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_element = std::accumulate( + (*src)->shape.begin(), (*src)->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - safe_memcpy(const_cast(self->data), src->data, num_bytes); + safe_memcpy(const_cast(self->data), (*src)->data, num_bytes); break; } default: @@ -347,8 +431,8 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](ft::Tensor* self, long stream) { - DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); + [](ManagedTensor& self, long stream) { + DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(obj, kDlTensorCapsuleName)); @@ -363,7 +447,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](ft::Tensor* self) { + .def("__dlpack_device__", [](const ManagedTensor& self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -374,29 +458,59 @@ PYBIND11_MODULE(_turbomind, m) DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto ret = DLManagedTensorToTritonTensor(dlmt); + // take ownership of capsule's payload + cap.set_name("used_dltensor"); return ret; }, "dl_managed_tensor"_a); // transformer model instance - using ft::AbstractTransformerModelInstance; + using ft::ModelRequest; py::bind_map>(m, "TensorMap"); - py::class_(m, "AbstractTransformerModelInstance") + py::class_(m, "ModelRequest") .def( "forward", - [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { - return model->forward(input_tensors); + [](ModelRequest* model_request, + std::shared_ptr input_tensors, + const ft::SessionParam& session, + const ft::GenerationConfig& gen_cfg, + bool stream_output, + std::function cb) { + ModelRequest::InputParam param{}; + param.tensors = std::move(input_tensors); + param.session = session; + param.gen_cfg = gen_cfg; + param.stream_output = stream_output; + auto ret = model_request->Forward(std::move(param), [cb = std::move(cb)]() { + try { + cb(); + } + catch (const py::error_already_set& e) { + std::cerr << e.what() << std::endl; + } + }); + return std::make_tuple(std::move(ret.tensors), std::move(ret.state)); }, py::call_guard(), - "input_tensors"_a) + "input_tensors"_a, + 
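// --- [editorial sketch, not part of the patch] ------------------------------------
// The `cap.set_name("used_dltensor")` calls above follow the DLPack exchange
// protocol: once a consumer takes ownership of a "dltensor" capsule it renames it to
// "used_dltensor" so the producer's capsule destructor will not run the deleter a
// second time; the consumer then invokes the deleter itself when it is done with the
// memory. In isolation (assuming the standard dlpack header is available):

#include <dlpack/dlpack.h>

inline void ReleaseConsumedTensor(DLManagedTensor* dlmt)
{
    if (dlmt && dlmt->deleter) {
        dlmt->deleter(dlmt);  // frees the producer's resources exactly once
    }
}
// --- [end sketch] -------------------------------------------------------------------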
"session"_a, + "gen_cfg"_a, + "stream_output"_a, + "cb"_a) .def( - "register_callback", - [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { - self->registerCallback(cb, ctx.ptr()); + "cancel", + [](ModelRequest* model_request) { + model_request->Cancel(); // }, - "callback"_a, - "context"_a = nullptr) - .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); + py::call_guard()) + .def( + "end", + [](ModelRequest* model_request, std::function cb, uint64_t session_id) { + model_request->End(std::move(cb), session_id); // + }, + py::call_guard(), + "cb"_a, + "session_id"_a); // transformer model using ft::AbstractTransformerModel; @@ -410,25 +524,19 @@ PYBIND11_MODULE(_turbomind, m) size_t pipeline_para_size, int enable_custom_all_reduce, std::string data_type) -> std::shared_ptr { - auto gil_control = [state = PyGILState_STATE{}](int op) mutable { - if (op) { - state = PyGILState_Ensure(); - } - else { - PyGILState_Release(state); - } + auto gil_factory = [] { // + // erase the type + return std::static_pointer_cast(std::make_shared()); }; if (data_type == "half" || data_type == "fp16" || data_type == "float16" || data_type == "int4") { auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; } else if (data_type == "bf16" || data_type == "bfloat16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); @@ -437,8 +545,7 @@ PYBIND11_MODULE(_turbomind, m) else { #ifdef ENABLE_FP32 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); @@ -466,21 +573,9 @@ PYBIND11_MODULE(_turbomind, m) "world_size"_a) .def( "create_model_instance", - [](AbstractTransformerModel* model, - int deviceId, - int rank, - long stream_id, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) { - cudaStream_t stream = reinterpret_cast(stream_id); - return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm); - }, + [](AbstractTransformerModel* model, int deviceId) { return model->createModelInstance(deviceId); }, py::call_guard(), - "device_id"_a, - "rank"_a, - "stream"_a, - "nccl_params"_a, - "custom_all_reduce_comm"_a = nullptr) + "device_id"_a) .def("create_shared_weights", &AbstractTransformerModel::createSharedWeights, py::call_guard(), @@ -489,8 +584,13 @@ PYBIND11_MODULE(_turbomind, m) .def( "get_params", [](AbstractTransformerModel* model, int deviceId, int rank) { - TensorMap output = model->getParams(deviceId, rank); - return output; + auto output = model->getParams(deviceId, rank); + TensorMap ret; + for (const auto& [k, v] : output) { + // export reference to weight data only (no ownership) + ret.emplace(k, ManagedTensor{v}); + } + return ret; }, 
py::call_guard(), "device_id"_a, diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index 26c580714a..7e193f6677 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -19,7 +19,6 @@ cmake_minimum_required(VERSION 3.8) set(llama_triton_backend_files LlamaTritonModel.cc - LlamaTritonModelInstance.cc ) find_package(CUDAToolkit REQUIRED) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 40c5ac8907..c4c3c00d6c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -24,15 +24,16 @@ #include #include +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" namespace turbomind { @@ -60,56 +61,6 @@ static std::optional get_moe_method() return value; } -std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) -{ - YAML::Node reader; - try { - reader = YAML::Load(config_file); - } - catch (const YAML::Exception& e) { - std::cerr << "Error reading YAML config: " << e.what() << std::endl; - FT_CHECK(false); - } - - const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; - const std::string data_type = ft_instance_hyperparameter["data_type"].as(); - int tensor_para_size = ft_instance_hyperparameter["tensor_para_size"].as(); - std::string model_dir = ft_instance_hyperparameter["model_dir"].as(); - - if (data_type == "half" || data_type == "fp16" || data_type == "float16") { - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); - } - else if (data_type == "bf16" || data_type == "bfloat16") { -#ifdef ENABLE_BF16 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - FT_CHECK(false); -#endif - } - else { -#ifdef ENABLE_FP32 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - FT_CHECK(false); -#endif - } - return nullptr; -} - template std::map> getLoraPattern(std::string pattern, T (*func)(const std::string& s)) { @@ -207,6 +158,9 @@ template LlamaTritonModel::~LlamaTritonModel() { FT_CHECK(weights_.size() == engines_.size()); + + gateway_->shutdown(); + for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources 
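// --- [editorial sketch, not part of the patch] ------------------------------------
// The `gil_factory` / `ffi_ctx_factory` plumbing above uses shared_ptr type erasure:
// a std::shared_ptr<void> remembers its concrete deleter, so engine code that never
// sees pybind11 or Python headers can still create and destroy a GIL guard around
// its callbacks. The trick in isolation (Guard is a stand-in for ScopedGIL):

#include <memory>

struct Guard {
    Guard()  { /* acquire, e.g. PyGILState_Ensure() */ }
    ~Guard() { /* release, e.g. PyGILState_Release() */ }
};

inline std::shared_ptr<void> MakeGuard()
{
    return std::static_pointer_cast<void>(std::make_shared<Guard>());
}

// holding the returned pointer keeps the guard alive; dropping it runs ~Guard()
// --- [end sketch] -------------------------------------------------------------------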
check_cuda_error(cudaSetDevice(device_id)); @@ -216,11 +170,17 @@ LlamaTritonModel::~LlamaTritonModel() } template -LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config): +LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory): + model_param_{}, + attn_param_{}, + moe_param_{}, + lora_param_{}, + engine_param_{}, tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), weights_(getDeviceCount()), @@ -333,6 +293,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, shared_state_ = std::make_shared(); shared_state_->barrier = std::make_shared(tensor_para_size); + gateway_ = std::make_shared(ffi_ctx_factory); + const auto device_count = getDeviceCount(); engines_.resize(device_count); @@ -398,6 +360,7 @@ LlamaTritonModel::createSharedModelInstance(int std::move(model), std::move(ctx), shared_state_, + gateway_, device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang @@ -410,22 +373,17 @@ LlamaTritonModel::createSharedModelInstance(int } template -std::unique_ptr -LlamaTritonModel::createModelInstance(int device_id, - int rank, - cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) +std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); - - allocator->setStream(stream); - - return std::make_unique>(*engines_[device_id], std::move(allocator), device_id); + return std::make_unique(gateway_.get(), + getTensorType(), + engine_param_.session_len, + model_param_.vocab_size, + model_param_.hidden_units); } template @@ -483,7 +441,6 @@ void LlamaTritonModel::createEngine(int { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); - engine->set_ffi_lock(ffi_lock_); engine->tune(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 8f473cd4cd..21b124e5a8 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -20,35 +20,29 @@ #pragma once +#include "src/turbomind/engine/gateway.h" #include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" +#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" #include -#include namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { - LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config = ""); + LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory); ~LlamaTritonModel() override; - std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t 
stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + std::unique_ptr createModelInstance(int deviceId) override; void createSharedWeights(int deviceId, int rank) override; @@ -66,11 +60,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { void handleMissingParams(); - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - std::string toString() override; int getTensorParaSize() override; int getPipelineParaSize() override; @@ -91,6 +80,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t pipeline_para_size_; std::shared_ptr shared_state_; + std::shared_ptr gateway_; + // Weights & engine instances for the ranks std::vector>> weights_; std::vector>> engines_; @@ -100,8 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { std::string model_name_; std::string model_dir_; - - ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc deleted file mode 100644 index 976fc9cc1d..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/macro.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/constant.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -template -void triton_stream_callback(std::unordered_map* outputs, void* ctx) -{ - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); -} - -template -LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id): - device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) -{ -} - -template -std::string format_vector(const std::vector& vec) -{ - std::stringstream ss; - ss << "["; - bool first = true; - for (const auto& x : vec) { - ss << (first ? 
"" : ", ") << x; - first = false; - } - ss << "]"; - return ss.str(); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> inputs) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - // In some cases, this is needed to trigger the creation of CUDA context, or later `cudaMallocAsync` will die - check_cuda_error(cudaSetDevice(device_id_)); - - FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, - "inputs->at(\"input_lengths\").shape.size() == 1"); - - const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, - (int*)inputs->at("request_output_len").data - + inputs->at("request_output_len").shape[0]); - // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; - FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - const size_t max_input_len = inputs->at("input_ids").shape[1]; - const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; - - const size_t vocab_size = instance_->model().vocab_size(); - - allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - - std::unordered_map outputs{ - {"output_ids", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, - {"sequence_length", - Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; - - if (inputs->count("is_return_log_probs") && *((bool*)inputs->at("is_return_log_probs").data)) { - outputs.insert({"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - outputs.insert( - {"cum_log_probs", - Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); - } - - if (inputs->count("logprobs")) { - size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; - h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc( - h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( - h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - - outputs.insert({{"logprob_vals", - Tensor{MEMORY_CPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_vals_}}}); - - outputs.insert({{"logprob_indexes", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_indexes_}}}); - - outputs.insert({{"logprob_nums", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); - } - - if (is_return_logits) { - outputs.insert( - {{"logits", {MEMORY_GPU, 
TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); - } - - try { - Request::Callback callback; - - if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { - triton_stream_callback(outputs, this); - }; - } - - check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - - instance_->Submit(&outputs, inputs.get(), {callback}); - // ! stream synced by the model before returning - } - catch (...) { - h_exception_ = std::current_exception(); - outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); - } - - return std::make_shared>(std::move(outputs)); -} - -template -LlamaTritonModelInstance::~LlamaTritonModelInstance() -{ - freeBuffer(); -} - -template -void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits) -{ - d_output_ids_ = (int*)std::realloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len); - d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width); - - if (is_return_logits) { - d_output_logits_ = (float*)allocator_->reMalloc(d_output_logits_, - sizeof(float) * request_batch_size * max_input_len - * instance_->model().vocab_size(), - false); - } -} - -template -void LlamaTritonModelInstance::freeBuffer() -{ - std::free(d_output_ids_); - std::free(d_sequence_lengths_); - allocator_->free((void**)(&d_output_log_probs_)); - allocator_->free((void**)(&d_cum_log_probs_)); - std::free(h_total_output_lengths_); - std::free(h_logprob_vals_); - std::free(h_logprob_indexes_); - std::free(h_logprob_nums_); -} - -#ifdef ENABLE_FP32 -template struct LlamaTritonModelInstance; -#endif -template struct LlamaTritonModelInstance; -#ifdef ENABLE_BF16 -template struct LlamaTritonModelInstance<__nv_bfloat16>; -#endif - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h deleted file mode 100644 index 2cf69b9fa5..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#pragma once - -#include - -#include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - -namespace turbomind { - -template -struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - - LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance() override; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - -private: - Engine* instance_; - const std::unique_ptr> allocator_; - - void allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits); - void freeBuffer(); - - int device_id_; - - int* d_input_ids_ = nullptr; - int* d_input_lengths_ = nullptr; - int* d_input_bad_words_ = nullptr; - int* d_input_stop_words_ = nullptr; - int* d_request_prompt_lengths_ = nullptr; - T* d_request_prompt_embedding_ = nullptr; - float* d_top_p_decay_ = nullptr; - float* d_top_p_min_ = nullptr; - int* d_top_p_reset_ids_ = nullptr; - - int* d_output_ids_ = nullptr; - int* d_sequence_lengths_ = nullptr; - float* d_output_log_probs_ = nullptr; - float* d_cum_log_probs_ = nullptr; - float* d_output_logits_ = nullptr; - - float* h_logprob_vals_ = nullptr; - uint32_t* h_logprob_indexes_ = nullptr; - uint32_t* h_logprob_nums_ = nullptr; - - uint32_t* h_total_output_lengths_ = nullptr; - std::exception_ptr h_exception_ = nullptr; -}; - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 6d49df4578..7e1e235160 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -22,16 +22,18 @@ #include #include -#include +#include + #ifdef __linux__ #include #endif -#include #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" +#include "src/turbomind/engine/model_request.h" + namespace turbomind { using triton_stream_cb_t = std::function>, void*)>; @@ -62,7 +64,6 @@ struct AbstractTransformerModelInstance { }; struct AbstractTransformerModel { - static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; @@ -72,12 +73,7 @@ struct AbstractTransformerModel { virtual void createCustomComms(std::vector>* custom_all_reduce_comms, int world_size) = 0; - virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + virtual std::unique_ptr createModelInstance(int deviceId) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index b2b8524e09..1f6f737b72 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -530,4 +530,29 @@ class TensorMap { void saveNpy(const std::string& base_folder); }; +struct ManagedTensor { + Tensor tensor; + std::shared_ptr data_holder; + + Tensor* operator->() noexcept + { + return &tensor; + } + + const 
Tensor* operator->() const noexcept + { + return &tensor; + } + + Tensor& operator*() noexcept + { + return tensor; + } + + const Tensor& operator*() const noexcept + { + return tensor; + } +}; + } // namespace turbomind diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 8311e6eb9e..ac1664bdd2 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -306,7 +306,8 @@ inline std::string getDeviceName() return std::string(props.name); } -inline int div_up(int a, int n) +template +inline T div_up(T a, T n) { return (a + n - 1) / n; } From a6d4b4adb11914fa32089428a012146b3c99fe43 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Sat, 11 Jan 2025 12:38:27 +0800 Subject: [PATCH 2/4] fix xcomposer2 when transformers is upgraded greater than 4.46 (#3001) --- lmdeploy/vl/model/xcomposer2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index 3c72d0c29f..312ef9132b 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -156,6 +156,9 @@ def build_model(self): trust_remote_code=True) model.vit.load_model() model.vit.resize_pos() + if hasattr(self.hf_config, 'img_size'): + model.vit.vision_tower.vision_model.embeddings.image_size = \ + self.hf_config.img_size model.vit.vision_tower.vision_model.post_layernorm.to_empty( device='cpu').half() self.vl_model = model From 551e6d532bab2194a59c0c89fdbb687c5f66ff8f Mon Sep 17 00:00:00 2001 From: Wei Tao <1136862851@qq.com> Date: Sun, 12 Jan 2025 17:21:07 +0800 Subject: [PATCH 3/4] [dlinfer]rm rope reshape (#2984) --- lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py index 0f13f3f38c..0fd07cf10c 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py @@ -15,15 +15,12 @@ def apply_rotary_pos_emb( ) -> Tuple[Tensor, Tensor]: query_states = query_states.contiguous() key_states = key_states.contiguous() - bs = query_states.shape[0] query_states_reshaped = query_states.unsqueeze(0) key_states_reshaped = key_states.unsqueeze(0) - cos_reshaped = cos.reshape(1, bs, 1, -1) - sin_reshaped = sin.reshape(1, bs, 1, -1) query_states_reshaped, key_states_reshaped = \ ext_ops.apply_rotary_pos_emb(query_states_reshaped, key_states_reshaped, - cos_reshaped, sin_reshaped, + cos, sin, None, None) if q_embed is None: q_embed = query_states_reshaped.view(query_states.shape) From 086481ed84b59bee3b8e4274e5fc69620040c048 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Sun, 12 Jan 2025 17:33:47 +0800 Subject: [PATCH 4/4] Expose spaces_between_special_tokens (#2991) --- lmdeploy/messages.py | 4 ++++ lmdeploy/serve/async_engine.py | 4 +++- lmdeploy/serve/gradio/vl.py | 4 +++- lmdeploy/serve/openai/api_server.py | 31 ++++++++++++++++++++++------- lmdeploy/serve/openai/protocol.py | 2 ++ 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index d4e6571b79..11626f44a2 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -52,6 +52,9 @@ class GenerationConfig: ignoring the number of tokens in the prompt. skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. 
+ spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. logprobs (int): Number of log probabilities to return per output token. response_format (Dict): Only pytorch backend support formatting response. Examples: @@ -94,6 +97,7 @@ class GenerationConfig: bad_token_ids: List[int] = None min_new_tokens: int = None skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index d7366c654b..2b2d02b38f 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -781,7 +781,9 @@ def is_error(status): response, state = self.tokenizer.detokenize_incrementally( token_ids, state, - skip_special_tokens=gen_config.skip_special_tokens) + skip_special_tokens=gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) res = token_ids[ids_offset:] out = GenOut(response, history_len, input_len, gen_len, diff --git a/lmdeploy/serve/gradio/vl.py b/lmdeploy/serve/gradio/vl.py index bf8ee87e68..26f23613af 100644 --- a/lmdeploy/serve/gradio/vl.py +++ b/lmdeploy/serve/gradio/vl.py @@ -151,7 +151,9 @@ def chat(chatbot, session, max_new_tokens, top_p, top_k, temperature): response, state = engine.tokenizer.detokenize_incrementally( res, state, - skip_special_tokens=gen_config.skip_special_tokens) + skip_special_tokens=gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) # noqa if chatbot[-1][1] is None: chatbot[-1][1] = '' history[-1][1] = '' diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index c37f7572a1..a284250f21 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -149,7 +149,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer, skip_special_tokens: bool = True, offset: int = 0, all_token_ids: List[int] = None, - state: DetokenizeState = None): + state: DetokenizeState = None, + spaces_between_special_tokens: bool = True): """create openai LogProbs for completion. Args: @@ -162,6 +163,9 @@ def _create_completion_logprobs(tokenizer: Tokenizer, offset (int): text offset. all_token_ids (int): the history output token ids. state (DetokenizeState): tokenizer decode state. + spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. """ if logprobs is None or len(logprobs) == 0: return None, None, None, None @@ -183,7 +187,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer, response, _state = tokenizer.detokenize_incrementally( all_token_ids + [top_id], copy.deepcopy(state), - skip_special_tokens=skip_special_tokens) + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens) res[response] = prob if top_id == token_id: out_state = _state @@ -323,6 +328,9 @@ async def chat_completions_v1(request: ChatCompletionRequest, - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. + - spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. 
The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. - min_new_tokens (int): To generate at least numbers of tokens. - min_p (float): Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between @@ -393,7 +401,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, logits_processors=logits_processors, min_new_tokens=request.min_new_tokens, min_p=request.min_p, - random_seed=random_seed) + random_seed=random_seed, + spaces_between_special_tokens=request.spaces_between_special_tokens) tools = None if request.tools and request.tool_choice != 'none': @@ -581,6 +590,9 @@ async def completions_v1(request: CompletionRequest, - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. + - spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. - top_k (int): The number of the highest probability vocabulary tokens to keep for top-k-filtering @@ -623,7 +635,8 @@ async def completions_v1(request: CompletionRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, - random_seed=random_seed) + random_seed=random_seed, + spaces_between_special_tokens=request.spaces_between_special_tokens) generators = [] for i in range(len(request.prompt)): result_generator = VariableInterface.async_engine.generate( @@ -672,7 +685,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: VariableInterface.async_engine.tokenizer, res.token_ids, res.logprobs, gen_config.skip_special_tokens, offset, all_token_ids, - state) + state, gen_config.spaces_between_special_tokens) if request.stream_options and request.stream_options.include_usage: # noqa E501 final_res = res total_tokens = sum([ @@ -724,8 +737,12 @@ async def _inner_call(i, generator): logprobs = None if request.logprobs and len(final_logprobs): logprobs, _, _, _ = _create_completion_logprobs( - VariableInterface.async_engine.tokenizer, final_token_ids, - final_logprobs, gen_config.skip_special_tokens) + VariableInterface.async_engine.tokenizer, + final_token_ids, + final_logprobs, + gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) assert final_res is not None choice_data = CompletionResponseChoice( diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 2b9d39c7b7..a6f945ac13 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -135,6 +135,7 @@ class ChatCompletionRequest(BaseModel): session_id: Optional[int] = -1 ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True + spaces_between_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 seed: Optional[int] = None min_new_tokens: Optional[int] = Field(default=None, examples=[None]) @@ -251,6 +252,7 @@ class CompletionRequest(BaseModel): session_id: Optional[int] = -1 ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True + spaces_between_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 # for opencompass seed: Optional[int] = None
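
For reference, a minimal offline-pipeline sketch of the options touched by this series. It is illustrative only: the model id and prompt are placeholders, and it assumes the public `lmdeploy.pipeline` / `GenerationConfig` entry points together with the `spaces_between_special_tokens` field and the `index` response attribute introduced in these patches.

    # Illustrative sketch: model id and prompt are placeholders.
    from lmdeploy import GenerationConfig, pipeline

    pipe = pipeline('internlm/internlm2_5-7b-chat')  # placeholder model id

    # Keep special tokens in the decoded text, but do not insert spaces around
    # them (the fast-tokenizer behaviour described in the docstring above).
    gen_config = GenerationConfig(
        max_new_tokens=128,
        skip_special_tokens=False,
        spaces_between_special_tokens=False,
    )

    responses = pipe(['Hi, please introduce yourself'], gen_config=gen_config)
    for r in responses:
        # `index` identifies which prompt a response belongs to; this series
        # switches the batch tests from `session_id` to `index`.
        print(r.index, r.finish_reason, r.text)
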
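The same option is exposed through the OpenAI-compatible server via the new `spaces_between_special_tokens` field on `ChatCompletionRequest` and `CompletionRequest`. A hedged client-side sketch follows; it assumes the `openai` Python package (>=1.0) and a server started with `lmdeploy serve api_server <model>` on the default port, with the model name and URL as placeholders.

    # Illustrative sketch: assumes a running lmdeploy api_server; URL and model
    # name are placeholders.
    from openai import OpenAI

    client = OpenAI(base_url='http://0.0.0.0:23333/v1', api_key='none')

    resp = client.chat.completions.create(
        model='placeholder-model',
        messages=[{'role': 'user', 'content': 'Hello'}],
        # The fields added in this patch are not part of the official OpenAI
        # schema, so the client passes them through `extra_body`.
        extra_body={
            'skip_special_tokens': False,
            'spaces_between_special_tokens': False,
        },
    )
    print(resp.choices[0].message.content)
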