From c25381f3416558f0e4904b8aa23b182928e9d1f6 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sat, 11 Jan 2025 01:00:49 +0800 Subject: [PATCH 1/4] Refactor async engine & turbomind IO (#2968) * refactor * async interface * update perf metrics & adaptive tokens per tick * wait-free * refactor gateway * optimize throughput * add cancel cb * simplify async engine * simplify async engine * fix end session * faster synchronization * fix async engine * refactor async engine * fix semaphore * refactor inference API * remove turbomind sync interface * fix msvc build * fix msvc build * fix msvc build * add extra outputs * skip stop tokens * exit gracefully * cancel all tasks atexit * refactor profiler * fix id2step for api server * save csv * fix interactive * fix lint * fix generate_token_len * fix async_end * update pipeline ut * fix ignore eos * minor * refactor profile pipeline api * fix stop ids * fix duplication * control output range of logits & last hidden states * fix lint & typo * fix blank response * export batch & num prompts --- .../interface/pipeline/test_pipeline_func.py | 2 +- autotest/utils/pipeline_chat.py | 4 +- benchmark/profile_pipeline_api.py | 116 +-- benchmark/profile_throughput.py | 257 +++-- lmdeploy/messages.py | 9 +- lmdeploy/profiler.py | 170 ++++ lmdeploy/serve/async_engine.py | 714 ++++++++----- lmdeploy/serve/openai/api_server.py | 19 +- lmdeploy/turbomind/chat.py | 49 +- lmdeploy/turbomind/turbomind.py | 615 +++++------- src/turbomind/CMakeLists.txt | 1 + src/turbomind/engine/CMakeLists.txt | 7 + src/turbomind/engine/gateway.cc | 40 + src/turbomind/engine/gateway.h | 61 ++ src/turbomind/engine/model_request.cc | 174 ++++ src/turbomind/engine/model_request.h | 59 ++ src/turbomind/engine/request.h | 148 +++ src/turbomind/engine/request_queue.cc | 93 ++ src/turbomind/engine/request_queue.h | 46 + src/turbomind/engine/signal_buffer.h | 61 ++ src/turbomind/kernels/gpt_kernels.cu | 57 ++ src/turbomind/kernels/gpt_kernels.h | 15 + .../kernels/sampling_penalty_kernels.cu | 77 ++ .../kernels/sampling_penalty_kernels.h | 8 + .../kernels/sampling_topp_kernels.cu | 9 +- .../sampling_layers/LogitsProcessorLayer.cc | 2 +- src/turbomind/models/llama/CMakeLists.txt | 1 + src/turbomind/models/llama/LlamaBatch.cc | 945 ++++++++---------- src/turbomind/models/llama/LlamaBatch.h | 67 +- src/turbomind/models/llama/LlamaV2.cc | 23 +- src/turbomind/models/llama/LlamaV2.h | 5 - src/turbomind/models/llama/Request.h | 115 --- src/turbomind/python/bind.cpp | 266 +++-- .../triton_backend/llama/CMakeLists.txt | 1 - .../triton_backend/llama/LlamaTritonModel.cc | 95 +- .../triton_backend/llama/LlamaTritonModel.h | 33 +- .../llama/LlamaTritonModelInstance.cc | 216 ---- .../llama/LlamaTritonModelInstance.h | 80 -- .../transformer_triton_backend.hpp | 14 +- src/turbomind/utils/Tensor.h | 25 + src/turbomind/utils/cuda_utils.h | 3 +- 41 files changed, 2683 insertions(+), 2019 deletions(-) create mode 100644 lmdeploy/profiler.py create mode 100644 src/turbomind/engine/CMakeLists.txt create mode 100644 src/turbomind/engine/gateway.cc create mode 100644 src/turbomind/engine/gateway.h create mode 100644 src/turbomind/engine/model_request.cc create mode 100644 src/turbomind/engine/model_request.h create mode 100644 src/turbomind/engine/request.h create mode 100644 src/turbomind/engine/request_queue.cc create mode 100644 src/turbomind/engine/request_queue.h create mode 100644 src/turbomind/engine/signal_buffer.h delete mode 100644 src/turbomind/models/llama/Request.h delete mode 100644 
src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc delete mode 100644 src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 87a0719bcb..0696684890 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -408,7 +408,7 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): result &= response[i].finish_reason == 'length' - result &= response[i].session_id == i + result &= response[i].index == i save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 8f03e4e406..5dcb358319 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -235,7 +235,7 @@ def assert_pipeline_single_stream_return(output, logprobs_num: int = 0): def assert_pipeline_batch_stream_return(output, size: int = 1): for i in range(size): - output_list = [item for item in output if item.session_id == i] + output_list = [item for item in output if item.index == i] result, msg = assert_pipeline_single_stream_return(output_list) if not result: return result, msg @@ -249,7 +249,7 @@ def assert_pipeline_single_element(output, result = True result &= output.generate_token_len > 0 result &= output.input_token_len > 0 - result &= output.session_id >= 0 + result &= output.index >= 0 if is_last: result &= len(output.text) >= 0 result &= output.finish_reason in ['stop', 'length'] diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 764f78399c..334be7fa14 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -1,11 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import csv import json import os import random -import time -from collections import OrderedDict from typing import List, Tuple from tqdm import tqdm @@ -14,6 +11,10 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter +from lmdeploy.profiler import Profiler, Session +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') def sample_requests(dataset_path: str, num_requests: int, @@ -66,91 +67,70 @@ def __init__(self, model_path: str, engine_config, csv: str): self.csv = csv - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): + def process_request(self, requests, profiler: Profiler, temperature, top_p, + top_k, stream_output): - stats = OrderedDict( - (session_id, None) for session_id in range(len(requests))) prompts = [prompt for prompt, _, _ in requests] gen_configs = [ GenerationConfig(temperature=temperature, top_p=top_p, top_k=top_k, ignore_eos=True, + do_sample=True, max_new_tokens=output_len) for _, _, output_len in requests ] - start = time.perf_counter() + sess: List[Session] = [] + for _, input_len, output_len in requests: + sess.append(profiler.new_session(input_len, output_len)) + + def _to_status(finish_reason): + if finish_reason == 'length': + return Session.SUCCESS + else: + return Session.FAIL + + profiler.start() + + for s in sess: + s.tick(0) + if stream_output: pbar = tqdm(total=len(requests)) for output in self.pipe.stream_infer(prompts, gen_configs, do_preprocess=False): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) + sess[index].tick(n_token) if finish_reason is not None: + sess[index].finish(_to_status(finish_reason)) pbar.update(1) + pbar.close() else: for output in self.pipe(prompts, gen_configs, do_preprocess=False, use_tqdm=True): - session_id = output.session_id + index = output.index n_token = output.generate_token_len finish_reason = output.finish_reason - stats[session_id] = (n_token, finish_reason) - - elapsed_time = time.perf_counter() - start - - completion_tokens = 0 - for session_id, (n_token, finish_reason) in stats.items(): - assert finish_reason == 'length', \ - f'unexpected finish_reason of session_id={session_id}, ' \ - f'prompt={requests[session_id][0]}' - assert n_token - 1 <= requests[session_id][-1] <= n_token, \ - f'request to generate {requests[session_id][-1]} tokens, ' \ - f'but got {n_token} tokens' - completion_tokens += n_token - - prompt_tokens = 0 - for _, input_len, _ in requests: - prompt_tokens += input_len - - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = (prompt_tokens + - completion_tokens) / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - - print( - f'number of prompts: {len(requests)}\n' - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') 
as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', - 'throughput(out tok/s)', 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + sess[index].tick(n_token) + sess[index].finish(_to_status(finish_reason)) + + profiler.finish() + + # report first failure + for i, s in enumerate(sess): + if s.status != Session.SUCCESS or s.ns[-1] < s.req_output_len: + logger.error( + f'Request {i} failed with {s.ns[-1]}/{s.req_output_len} tokens generated' # noqa: E501 + ) + logger.error(f'Prompt: {prompts[i]}') + logger.warning('Got failed requests, metrics may be invalid') + break def parse_args(): @@ -252,13 +232,25 @@ def main(): requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + profiler = Profiler(args.stream_output, [50, 75, 95, 99]) + engine.process_request(requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, - concurrency=args.concurrency, stream_output=args.stream_output) + hyperparams = [('Concurrency', args.concurrency), + ('Stream output', str(args.stream_output).lower())] + + profiler.compute_metrics() + profiler.summarize(title='Profile Pipeline API', hyperparams=hyperparams) + + if args.csv: + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) + if __name__ == '__main__': main() diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 291b1be9b8..2e4d2a3b8c 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -1,20 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import asyncio -import csv import json import os import random -import time from queue import Queue from typing import List, Tuple, Union -import numpy as np from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) +from lmdeploy.profiler import Profiler, Session from lmdeploy.pytorch.engine import EngineInstance from lmdeploy.tokenizer import DetokenizeState, Tokenizer from lmdeploy.utils import get_logger @@ -71,7 +69,7 @@ class Engine: def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, - TurbomindEngineConfig], csv: str): + TurbomindEngineConfig]): if isinstance(engine_config, TurbomindEngineConfig): from lmdeploy.turbomind import TurboMind tm_model = TurboMind.from_pretrained(model_path, @@ -83,166 +81,104 @@ def __init__(self, model_path: str, self.tm_model = tm_model self.tokenizer = tm_model.tokenizer - self.csv = csv self.pbar = None - async def _inference(self, req_queue: Queue, res_queue: Queue, - session_id: int, temperature: float, top_p: float, - top_k: int, stream_output: bool): + async def _inference(self, req_queue: Queue, session_id: int, + temperature: float, top_p: float, top_k: int, + stream_output: bool, skip_tokenize: bool, + skip_detokenize: bool): model_inst = self.tm_model.create_instance() - stats = [] - # get each generated token's latency - per_token_latency_stats = [] - for prompt, input_seqlen, output_seqlen in iter( - req_queue.get_nowait, [None, None, None]): - _per_token_latency_stats = [0] * (output_seqlen + 1) - prev = time.perf_counter() - n_prev_token = 0 - - input_ids = self.tokenizer(prompt).input_ids + sess: Session = None + for prompt, _, output_seqlen, 
cancel_after, sess in iter( + req_queue.get_nowait, None): + + sess.tick(0) + + if skip_tokenize: + input_ids = prompt + else: + input_ids = self.tokenizer(prompt).input_ids + state = DetokenizeState(len(input_ids)) - async for outputs in model_inst.async_stream_infer( - session_id, - input_ids=input_ids, - gen_config=GenerationConfig(max_new_tokens=output_seqlen, - temperature=temperature, - top_p=top_p, - top_k=top_k, - ignore_eos=True), - sequence_start=True, - sequence_end=True, - stream_output=stream_output): - res, n_token = input_ids + outputs.token_ids, outputs.num_token - _, state = self.tokenizer.detokenize_incrementally(res, state) - now = time.perf_counter() - if n_prev_token != n_token: - _per_token_latency_stats[n_prev_token] = np.round( - now - prev, 3) - n_prev_token = n_token - prev = now + prev_len = 0 + token_ids = input_ids.copy() + + generator = model_inst.async_stream_infer( + session_id, + input_ids=input_ids, + gen_config=GenerationConfig(max_new_tokens=output_seqlen, + temperature=temperature, + top_p=top_p, + top_k=top_k, + ignore_eos=True), + sequence_start=True, + sequence_end=True, + stream_output=stream_output) + try: + async for outputs in generator: + n_token = outputs.num_token + if n_token > prev_len: + token_ids += outputs.token_ids[prev_len - n_token:] + if not skip_detokenize: + _, state = self.tokenizer.detokenize_incrementally( + token_ids, state) + sess.tick(n_token) + prev_len = n_token + if n_token > cancel_after: + break + sess.finish(Session.SUCCESS) + finally: + await generator.aclose() + # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): await model_inst.async_end(session_id) - assert output_seqlen <= n_token <= output_seqlen + 1, \ - f'Error. session_id({session_id}) request {output_seqlen} ' \ - f'tokens, but generate {n_token} tokens.\n' \ - f'prompt: {prompt}' - - first_token_latency = _per_token_latency_stats[0] - completion_tokens = n_token - total_tokens = n_token + input_seqlen - stats.append([ - first_token_latency, completion_tokens, output_seqlen, - total_tokens - ]) - # skip the first token latency - per_token_latency_stats.append(_per_token_latency_stats[1:]) + self.pbar.update(1) - res_queue.put_nowait((session_id, stats, per_token_latency_stats)) - def process_request(self, requests, concurrency, temperature, top_p, top_k, - stream_output): - res_queue = Queue() + def process_request(self, requests, profiler: Profiler, concurrency, + temperature, top_p, top_k, stream_output, + skip_tokenize, skip_detokenize, cancel_rate): req_queue = Queue() - self.pbar = tqdm(total=len(requests)) - # feed request to q - for req in requests: + for prompt, input_len, output_len in requests: + cancel_after = output_len + 1 + if cancel_rate > 0: + if random.random() < cancel_rate: + cancel_after = random.randint(0, cancel_after) + sess = profiler.new_session(input_len, output_len) + req = [prompt, input_len, output_len, cancel_after, sess] + if skip_tokenize: + req[0] = self.tokenizer.encode(prompt) req_queue.put(req) for i in range(concurrency): - req_queue.put([None, None, None]) - - start = time.time() - - event_loop = asyncio.new_event_loop() - asyncio.set_event_loop(event_loop) + req_queue.put(None) # start threads tasks = [] for i in range(concurrency): - task = self._inference(req_queue, res_queue, i, temperature, top_p, - top_k, stream_output) + task = self._inference(req_queue, i, temperature, top_p, top_k, + stream_output, skip_tokenize, + skip_detokenize) tasks.append(task) async def _gather_tasks(tasks): 
return await asyncio.gather(*tasks) - event_loop.run_until_complete(_gather_tasks(tasks)) - - elapsed_time = time.time() - start - - stats = [] - per_token_latency_stats = [] - while not res_queue.empty(): - session_id, _stats, _per_token_latency_stats = res_queue.get() - stats.append(np.array(_stats)) - per_token_latency_stats += [ - item for sublist in _per_token_latency_stats - for item in sublist - ] - stats = np.concatenate(stats).reshape(-1, 4) - - first_token_latency_min = np.min(stats[:, 0], axis=0) - first_token_latency_max = np.max(stats[:, 0], axis=0) - first_token_latency_ave = np.mean(stats[:, 0], axis=0) - completion_tokens = np.sum(stats[:, 1], axis=0) - total_tokens = np.sum(stats[:, 3], axis=0) - prompt_tokens = total_tokens - completion_tokens - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = total_tokens / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - per_token_latency_stats.sort() - percentiles = [ - np.round( - per_token_latency_stats[int(percent * - len(per_token_latency_stats))], 3) - for percent in [0.5, 0.75, 0.95, 0.99] - ] - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - if stream_output: - print(f'first token latency(s)(min, max, ave): ' - f'{first_token_latency_min:.3f}, ' - f'{first_token_latency_max:.3f}, ' - f'{first_token_latency_ave:.3f}') - print(f'per-token latency(s) percentile(50, 75, 95, 99): ' - f'{percentiles}\n') - print( - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_promts', 'RPS', 'RPM', 'FTL(ave)(s)', - 'FTL(min)(s)', 'FTL(max)(s)', '50%(s)', '75%(s)', '95%(s)', - '99%(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{first_token_latency_ave:.3f}' if stream_output else '-', - f'{first_token_latency_min:.3f}' if stream_output else '-', - f'{first_token_latency_max:.3f}' if stream_output else '-', - f'{percentiles[0]:.3f}' if stream_output else '-', - f'{percentiles[1]:.3f}' if stream_output else '-', - f'{percentiles[2]:.3f}' if stream_output else '-', - f'{percentiles[3]:.3f}' if stream_output else '-', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) + self.pbar = tqdm(total=len(requests)) + + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + + profiler.start() + + asyncio.run(_gather_tasks(tasks)) + + profiler.finish() + + self.pbar.close() def parse_args(): @@ -266,6 +202,20 @@ def parse_args(): type=int, help='Number of prompts to process', default=5000) + parser.add_argument('--no-stream-output', + action='store_true', + help='Use stream output') + parser.add_argument('--skip-tokenize', + action='store_true', + help='Pre-tokenize input prompts before starting') + parser.add_argument('--skip-detokenize', + action='store_true', + help='Skip detokenizing output tokens') + parser.add_argument('--cancel-rate', + type=float, + help='Possibility of a request being canceled', + default=0) + 
parser.add_argument('--use-uvloop', action='store_true') parser.add_argument('--csv', type=str, help='Where to save the result.', @@ -340,19 +290,42 @@ def main(): dtype=args.dtype, ) - engine = Engine(args.model_path, engine_config, csv=args.csv) + if args.use_uvloop: + import uvloop + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + + engine = Engine(args.model_path, engine_config) requests = sample_requests(args.dataset, args.num_prompts, engine.tokenizer) + stream_output = not args.no_stream_output + + profiler = Profiler(stream_output, [50, 75, 95, 99]) + engine.process_request( requests, + profiler, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, concurrency=args.concurrency if args.concurrency < args.num_prompts else args.num_prompts, - stream_output=True) + stream_output=not args.no_stream_output, + skip_tokenize=args.skip_tokenize, + skip_detokenize=args.skip_detokenize, + cancel_rate=args.cancel_rate) + + hyperparams = [('Concurrency', args.concurrency), + ('Cancel rate', args.cancel_rate), + ('Stream output', str(stream_output).lower()), + ('Skip tokenize', str(args.skip_tokenize).lower()), + ('Skip detokenize', str(args.skip_detokenize).lower())] + profiler.compute_metrics() + profiler.summarize(title='Profile Throughput', hyperparams=hyperparams) + if args.csv: + profiler.save_csv(args.csv, (('batch', args.concurrency), + ('num_prompts', args.num_prompts))) if __name__ == '__main__': diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 2336d10752..d4e6571b79 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -97,6 +97,8 @@ class GenerationConfig: logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None + output_logits: Literal['all', 'generation'] = None + output_last_hidden_state: Literal['all', 'generation'] = None def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): """convert stop_words/bad_sords to ids and append the ids to @@ -124,7 +126,7 @@ def __post_init__(self): """Check input validation.""" assert type( self.n) == int and self.n > 0, 'n is not a positive integer' - assert self.top_p > 0 and self.top_p <= 1 # (0, 1] + assert self.top_p >= 0 and self.top_p <= 1 # [0, 1] assert self.top_k >= 0, 'top_k can not be a negative integer' assert self.temperature >= 0 and self.temperature <= 2 # [0,2] assert 0 <= self.min_p <= 1, \ @@ -338,10 +340,11 @@ class Response: text: str generate_token_len: int input_token_len: int - session_id: int finish_reason: Optional[Literal['stop', 'length']] = None token_ids: List[int] = field(default_factory=list) logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None index: int = 0 @@ -361,6 +364,8 @@ class EngineOutput: token_ids: List[int] num_token: int logprobs: List[Dict[int, float]] = None + logits: torch.Tensor = None + last_hidden_state: torch.Tensor = None @dataclass diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py new file mode 100644 index 0000000000..c1bf6b3875 --- /dev/null +++ b/lmdeploy/profiler.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import csv +import time +from typing import List + +import numpy as np + + +class Session: + + UNKNOWN = 0 + SUCCESS = 1 + FAIL = 2 + + def __init__(self, input_len, req_output_len): + self.ts = [] + self.ns = [] + self.input_len = input_len + self.req_output_len = req_output_len + self.status = Session.UNKNOWN + + def tick(self, n_token): + self.ts.append(time.perf_counter()) + self.ns.append(n_token) + + def finish(self, status): + self.status = status + + +class Profiler: + + def __init__(self, stream_output: bool, percentages: List[int]): + self.sessions: List[Session] = [] + self.stream_output = stream_output + self.percentages = percentages + + def new_session(self, *args, **kwargs): + sess = Session(*args, **kwargs) + self.sessions.append(sess) + return sess + + def start(self): + self.t_start = time.perf_counter() + + def finish(self): + self.elapsed_time = time.perf_counter() - self.t_start + + def compute_metrics(self): + self.ttfts: List[float] = [] + self.tpots: List[float] = [] + self.e2es: List[float] = [] + self.itls: List[float] = [] + self.tpts: List[int] = [] + self.total_output = 0 + self.total_input = 0 + self.success = 0 + + for sess in self.sessions: + if sess.status != Session.SUCCESS: + continue + ns = sess.ns + ts = sess.ts + if ns[-1] < sess.req_output_len: + continue + self.success += 1 + self.total_output += ns[-1] + self.total_input += sess.input_len + self.e2es.append(ts[-1] - ts[0]) + self.ttfts.append(ts[1] - ts[0]) + if ns[-1] > ns[1]: + self.tpots.append((ts[-1] - ts[1]) / (ns[-1] - ns[1])) + else: # no-stream-output + self.tpots.append((ts[-1] - ts[0]) / (ns[-1] - ns[0])) + t_dif = np.subtract(ts[1:], ts[:-1]) + n_dif = np.subtract(ns[1:], ns[:-1]) + self.itls.extend(t_dif[1:]) + self.tpts.extend(n_dif) + + self.output_throughput = self.total_output / self.elapsed_time + self.input_throughput = self.total_input / self.elapsed_time + + qs = self.percentages + + self.e2es = self.e2es or [float('inf')] + self.tpots = self.tpots or [float('inf')] + self.ttfts = self.ttfts or [float('inf')] + self.itls = self.itls or [float('inf')] + self.tpts = self.tpts or [0] + + self.tpot_mean = np.mean(self.tpots) + self.tpot_stat = tuple(np.percentile(self.tpots, qs)) + self.e2e_mean = np.mean(self.e2es) + self.e2e_stat = tuple(np.percentile(self.e2es, qs)) + + if self.stream_output: + self.ttft_mean = np.mean(self.ttfts) + self.ttft_stat = tuple(np.percentile(self.ttfts, qs)) + self.itls_mean = np.mean(self.itls) + self.itls_stat = tuple(np.percentile(self.itls, qs)) + self.tpts_mean = np.mean(self.tpts) + self.tpts_stat = tuple(np.percentile(self.tpts, qs).astype(int)) + + self.rps = self.success / self.elapsed_time + + def summarize(self, + title: str, + hyperparams: List = None, + header=40, + digits=10): + + width = header + digits * (1 + len(self.percentages)) + + def tab_row(name, *items): + + def fmt(x): + return '{:>{d}.3f}'.format(x, d=digits) if isinstance( + x, float) else '{:>{d}}'.format(x, d=digits) + + print('{:<{p}}{}'.format(name, + ''.join([fmt(x) for x in items]), + p=header)) + + print('\n{s:{c}^{n}}'.format(s=f' {title} ', n=width, c='=')) + tab_row('Benchmark duration', self.elapsed_time) + tab_row('Total requests', len(self.sessions)) + tab_row('Successful requests', self.success) + if hyperparams: + for k, v in hyperparams: + tab_row(k, v) + tab_row('Total input tokens', self.total_input) + tab_row('Total generated tokens', self.total_output) + tab_row('Input throughput (tok/s)', self.input_throughput) + tab_row('Output throughput (tok/s)', 
self.output_throughput) + tab_row('Request throughput (req/s)', self.rps) + print('-' * width) + tab_row('', 'mean', *(f'P{q}' for q in self.percentages)) + tab_row('End-to-end Latency', self.e2e_mean, *self.e2e_stat) + if self.stream_output: + tab_row('Time to First Token (TTFT)', self.ttft_mean, + *self.ttft_stat) + tab_row('Time per Output Token (TPOT)', self.tpot_mean, + *self.tpot_stat) + if self.stream_output: + tab_row('Inter-token Latency (ITL)', self.itls_mean, + *self.itls_stat) + tab_row('Tokens per Tick', self.tpts_mean, *self.tpts_stat) + print('=' * width) + + def save_csv(self, csv_file: str, hyperparams): + """Export legacy metrics to CSV.""" + with open(csv_file, 'w') as csvfile: + writer = csv.writer(csvfile) + keys, vals = zip(*hyperparams) + writer.writerow([ + *keys, + 'RPS', + 'RPM', + 'FTL(ave)(s)', + 'throughput(out tok/s)', + 'throughput(total tok/s)', + ]) + ttft_mean = f'{self.ttft_mean:.3f}' if self.stream_output else '-' + writer.writerow([ + *vals, + f'{self.rps:.3f}', + f'{(self.rps * 60):.3f}', + ttft_mean, + f'{self.output_throughput:.3f}', + f'{(self.input_throughput + self.output_throughput):.3f}', + ]) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index dfcf01a69d..d7366c654b 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -1,22 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import atexit +import concurrent.futures import dataclasses import json import os import random import re -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, closing from copy import deepcopy +from functools import partial from itertools import count -from queue import Empty, Queue +from queue import Queue from threading import Thread -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import (Any, AsyncIterator, Dict, Iterator, List, Literal, + Optional, Tuple, Union) + +import tqdm from lmdeploy.logger import RequestLogger from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig) from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model -from lmdeploy.serve.utils import LogitsMixin, _get_event_loop +from lmdeploy.serve.utils import LogitsMixin from lmdeploy.tokenizer import DetokenizeState from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_logger @@ -50,6 +56,37 @@ class GenOut: finish_reason: Optional[Literal['stop', 'length', 'error']] = None token_ids: List[int] = None logprobs: List[Dict[int, float]] = None + logits: Any = None + last_hidden_state: Any = None + + +def _gen_out_to_response(out: GenOut, index) -> Response: + return Response(text=out.response, + generate_token_len=out.generate_token_len, + input_token_len=out.input_token_len, + finish_reason=out.finish_reason, + token_ids=out.token_ids, + logprobs=out.logprobs, + last_hidden_state=out.last_hidden_state, + logits=out.logits, + index=index) + + +def _append_response(dst: Response, src: Response): + """dst += src.""" + if not dst: + return src + dst.text += src.text + dst.generate_token_len = src.generate_token_len + dst.input_token_len = src.input_token_len + dst.finish_reason = src.finish_reason + dst.index = src.index + if src.token_ids: + dst.token_ids += src.token_ids + if src.logprobs: + dst.logprobs = dst.logprobs or [] + dst.logprobs += src.logprobs + return dst class Session: @@ -63,14 +100,17 @@ class Session: _engine (Any): engine for internal use. 
history (List[Any, str]): chat history. """ - _ids = count(0) - def __init__(self): - self._id: int = next(self._ids) + def __init__(self, + session_id: int, + engine: Any, + gen_config: GenerationConfig = None): + self._id: int = session_id + self._engine = engine self._step: int = 0 self._prompt: Any = None self._response: Response = None - self._engine: Any = None + self._gen_config = gen_config self.history: List[Tuple[Any, str]] = [] def _merge_response(self, resp: Response, step: Union[Response, GenOut]): @@ -89,8 +129,8 @@ def response(self) -> Response: def close(self): """release engine storage for this session.""" if self._engine: - inst = self._engine.create_instance() - inst.end(self._id) + self._engine._run(coro=self._engine.end_session(self._id)).result() + self._engine = None def __repr__(self) -> str: res = '' @@ -100,6 +140,89 @@ def __repr__(self) -> str: res += f'USER:\n{user}\nASSISTANT:\n{assistant}\n' return res + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __call__( + self, + prompt: str, + gen_config: Optional[GenerationConfig] = None, + stream_response: bool = True, + do_preprocess: bool = True) -> Union[Response, Iterator[Response]]: + self._engine.chat(prompt=prompt, + gen_config=gen_config or self._gen_config, + stream_response=stream_response, + do_preprocess=do_preprocess, + session=self) + if stream_response: + return self.generator + else: + return self.response + + +class _EventLoopThread: + + def __init__(self, daemon=False): + fut = concurrent.futures.Future() + self.thread = Thread(target=partial(self._thread_entry, fut), + daemon=daemon) + self.thread.start() + self.loop: asyncio.AbstractEventLoop = fut.result() + self.closed = False + if daemon: + atexit.register(self.close) + + def _thread_entry(self, fut): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + fut.set_result(loop) + try: + loop.run_forever() + except BaseException as e: + logger.error(f'[internal_thread] {type(e).__name__} {e}') + finally: + try: + self._cancel_all_tasks() + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) + loop.close() + + def _cancel_all_tasks(self): + """Modified from asyncio/runners.py.""" + to_cancel = asyncio.all_tasks(self.loop) + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + async def _gather(): + await asyncio.gather(*to_cancel, return_exceptions=True) + + self.loop.run_until_complete(_gather()) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + self.loop.call_exception_handler({ + 'message': + 'unhandled exception during worker thread shutdown', + 'exception': task.exception(), + 'task': task, + }) + + def close(self): + if self.closed: + return + self.closed = True + self.loop.call_soon_threadsafe(self.loop.stop) + self.thread.join() + class AsyncEngine(LogitsMixin): """Async inference engine. Maintaining a bunch of tm_model instances. 
@@ -179,13 +302,26 @@ def __init__(self, self.instance_num = self.backend_config.max_batch_size self.tokenizer = self.engine.tokenizer self.id2step = {} - self.id2generator = {} - self.running_session_ids = set() - self.gens_set = set() - for i in range(self.instance_num): - self.gens_set.add(self.engine.create_instance()) + self.id2inst = {} + self.free_insts: asyncio.Queue = None + self.instances = [ + self.engine.create_instance() for _ in range(self.instance_num) + ] self._session_id = count(0) self.request_logger = RequestLogger(max_log_len) + self.internal_thread = _EventLoopThread(daemon=True) + self.limiter: asyncio.Semaphore = None + + def close(self): + self.internal_thread.close() + + def _get_free_insts(self): + if self.free_insts is None: + # `asyncio.Queue` must be created in an async context + self.free_insts = asyncio.Queue() + for inst in self.instances: + self.free_insts.put_nowait(inst) + return self.free_insts def _build_turbomind( self, @@ -246,45 +382,117 @@ def __call__(self, async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_cancel(session_id) - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) + generator = self.id2inst.get(session_id) + if generator: + await generator.async_cancel(session_id) + # else it's not running at all async def end_session(self, session_id: int): - """Clear a session by a session_id.""" - if str(session_id) in self.id2generator: - await self.id2generator[str(session_id)].async_end(session_id) - self.id2step[str(session_id)] = 0 - self.gens_set.add(self.id2generator[str(session_id)]) - - self.running_session_ids.discard(session_id) - - @asynccontextmanager - async def safe_run(self, session_id: Optional[int] = None): - """A context manager to make sure server's safe running.""" + """For ending a session that is not running.""" + inst = self.id2inst.get(session_id) + if inst: + await inst._active.wait() + assert session_id not in self.id2inst + inst = await self._get_free_insts().get() try: - yield + await inst.async_end(session_id) + self.id2step[session_id] = 0 except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - # TODO: find out why await would block the coroutine here - _get_event_loop().create_task(self.stop_session(session_id)) - raise e - if str(session_id) in self.id2generator: - self.gens_set.add(self.id2generator[str(session_id)]) - self.running_session_ids.discard(session_id) - - async def get_generator(self, stop: bool, session_id: int): - """Only return the model instance if it is available.""" - if stop: - return self.engine.create_instance() - # waiting no generator is available or the same session_id is running - while self.gens_set == set() or session_id in self.running_session_ids: - await asyncio.sleep(0.1) - generator = self.gens_set.pop() - self.id2generator[str(session_id)] = generator - self.running_session_ids.add(session_id) - return generator + logger.error(f'[end_session] exception caught: {e}') + finally: + self._get_free_insts().put_nowait(inst) + + def _get_limiter(self): + if not self.limiter: + self.limiter = asyncio.Semaphore(self.instance_num) + return self.limiter + + async def _async_infer(self, requests: AsyncIterator[Dict], + **kwargs) -> AsyncIterator[AsyncIterator[Response]]: + async for req in requests: + gen = self.generate(**req, **kwargs) + yield gen + + def _infer(self, + requests: Iterator[Dict], + 
multiplex: bool, + pbar=None, + loop=None) -> Iterator[Iterator[Response]]: + + async def _sync_resp(g, que: Queue, idx: int, sem: asyncio.Semaphore): + async for out in g: + que.put(_gen_out_to_response(out, idx)) + sem.release() + if not multiplex: + que.put(None) # sentinel of inner generator + if pbar: + pbar.update(1) + + que = Queue() + + async def _infer(): + sem = self._get_limiter() + tasks = [] + for idx, req in enumerate(requests): + await sem.acquire() + gen = self.generate(**req) + dst = que if multiplex else Queue() + if not multiplex: + que.put(iter(dst.get, None)) + # create a task to send the responses + task = asyncio.create_task(_sync_resp(gen, dst, idx, sem)) + tasks.append(task) + if not multiplex: # sentinel of outer generator + que.put(None) + await asyncio.gather(*tasks) + if multiplex: + que.put(None) # sentinel of inner generator + + loop = loop or self.internal_thread.loop + # submit the coroutine to async world + asyncio.run_coroutine_threadsafe( + _infer(), loop).add_done_callback(lambda x: x.result()) + + return iter(que.get, None) + + @staticmethod + def _is_single(prompts): + return isinstance(prompts, str) or isinstance(prompts[0], Dict) + + def infer(self, + prompts: Union[List[str], str, List[Dict], List[List[Dict]]], + gen_config: Optional[Union[GenerationConfig, + List[GenerationConfig]]] = None, + do_preprocess: bool = True, + adapter_name: Optional[str] = None, + stream_response: bool = False, + multiplex: bool = False, + pbar: Optional[tqdm.tqdm] = None, + **kwargs): + + prompts = [prompts] if AsyncEngine._is_single(prompts) else prompts + assert isinstance(prompts, List), 'prompts should be a list' + gen_config = gen_config or GenerationConfig() + if not isinstance(gen_config, List): + gen_config = [gen_config] * len(prompts) + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa + + def requests(): + for prompt, gen_cfg in zip(prompts, gen_config): + r = dict(messages=prompt, + gen_config=gen_cfg, + do_preprocess=do_preprocess, + adapter_name=adapter_name, + stream_response=stream_response, + **kwargs) + r.setdefault('sequence_start', True) + r.setdefault('sequence_end', True) + if 'session_id' not in r: + r['session_id'] = next(self._session_id) + yield r + + return self._infer(requests(), multiplex, pbar) def batch_infer(self, prompts: Union[List[str], str, List[Dict], @@ -310,59 +518,26 @@ def batch_infer(self, Pick one from adapters. Default to None, using the base model. use_tqdm (bool): Whether use the progress bar. 
Default to False """ - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - prompt_num = len(prompts) - session_ids = [next(self._session_id) for _ in range(prompt_num)] - outputs = [ - Response('', 0, 0, session_ids[i], index=i) - for i in range(prompt_num) - ] - generators = [] - if use_tqdm: - import tqdm - pbar = tqdm.tqdm(total=len(prompts)) - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs[i].text += out.response - outputs[i].generate_token_len = out.generate_token_len - outputs[i].input_token_len = out.input_token_len - outputs[i].finish_reason = out.finish_reason - if out.token_ids: - outputs[i].token_ids.extend(out.token_ids) - if out.logprobs: - if outputs[i].logprobs is None: - outputs[i].logprobs = [] - outputs[i].logprobs.extend(out.logprobs) - if use_tqdm and out.finish_reason is not None: - pbar.update(1) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - - _get_event_loop().run_until_complete(gather()) - outputs = outputs[0] if need_list_wrap else outputs + is_single = AsyncEngine._is_single(prompts) + outputs = [] + pbar = tqdm.tqdm( + total=1 if is_single else len(prompts)) if use_tqdm else None + try: + for g in self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response=False, + pbar=pbar, + **kwargs): + res = None + for out in g: + res = _append_response(res, out) + outputs.append(res) + finally: + if pbar: pbar.close() # noqa + if is_single: + return outputs[0] return outputs def stream_infer( @@ -372,6 +547,7 @@ def stream_infer( List[GenerationConfig]]] = None, do_preprocess: bool = True, adapter_name: Optional[str] = None, + stream_response: bool = True, **kwargs): """Inference a batch of prompts with stream mode. @@ -387,62 +563,13 @@ def stream_infer( adapter_name (str): the adapter name of slora for pytorch backend. Pick one from adapters. Default to None, using the base model. 
""" - need_list_wrap = isinstance(prompts, str) or isinstance( - prompts[0], Dict) - prompts = [prompts] if need_list_wrap else prompts - assert isinstance(prompts, List), 'prompts should be a list' - if gen_config is None: - gen_config = GenerationConfig() - if not isinstance(gen_config, List): - gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config), \ - 'input gen_confg length differs from the length of prompts' # noqa - session_ids = [next(self._session_id) for _ in range(len(prompts))] - outputs = Queue() - generators = [] - for i, prompt in enumerate(prompts): - generators.append( - self.generate(prompt, - session_ids[i], - gen_config=gen_config[i], - stream_response=True, - sequence_start=True, - sequence_end=True, - do_preprocess=do_preprocess, - adapter_name=adapter_name, - **kwargs)) - - async def _inner_call(i, generator): - async for out in generator: - outputs.put( - Response(out.response, - out.generate_token_len, - out.input_token_len, - session_ids[i], - out.finish_reason, - out.token_ids, - out.logprobs, - index=i)) - - async def gather(): - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(prompts))]) - outputs.put(None) - - loop = _get_event_loop() - proc = Thread(target=lambda: loop.run_until_complete(gather())) - proc.start() - - while True: - try: - out = outputs.get(timeout=0.001) - if out is None: - break - yield out - except Empty: - pass - - proc.join() + return self.infer(prompts, + gen_config, + do_preprocess, + adapter_name, + stream_response, + multiplex=True, + **kwargs) async def _get_prompt_input(self, prompt: str, @@ -466,6 +593,34 @@ async def _get_prompt_input(self, input_ids = self.tokenizer.encode(prompt, add_bos=sequence_start) return {'prompt': prompt, 'input_ids': input_ids} + @asynccontextmanager + async def model_inst(self, session_id: int): + """A context manager to make sure server's safe running.""" + assert session_id not in self.id2inst + free_insts = self._get_free_insts() + inst = await free_insts.get() + inst._active = asyncio.Event() + self.id2inst[session_id] = inst + try: + yield inst + finally: + self.id2inst.pop(session_id) + inst._active.set() + free_insts.put_nowait(inst) + + @asynccontextmanager + async def safe_run(self, inst, session_id, **kwargs): + generator = inst.async_stream_infer(session_id, **kwargs) + try: + yield generator + except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa + logger.error( + f'[safe_run] exception caught: {type(e).__name__} {e}') + # TODO: remove session_id from async cancel + await inst.async_cancel(session_id) + finally: + await generator.aclose() + async def generate( self, messages, @@ -478,6 +633,8 @@ async def generate( step: int = 0, do_preprocess: bool = True, adapter_name: Optional[str] = None, + skip_stop_tokens: bool = True, + rewind_stop_tokens: bool = False, **kwargs): """Generate responses. @@ -493,10 +650,10 @@ async def generate( do_preprocess (bool): whether pre-process the messages. Default to True, which means chat_template will be applied. 
""" - if str(session_id) not in self.id2step: - self.id2step[str(session_id)] = 0 + if session_id not in self.id2step: + self.id2step[session_id] = 0 if step != 0: - self.id2step[str(session_id)] = step + self.id2step[session_id] = step if gen_config is None: gen_config = GenerationConfig() else: @@ -539,7 +696,7 @@ async def generate( gen_config=gen_config, adapter_name=adapter_name) logger.info(f'session_id={session_id}, ' - f'history_tokens={self.id2step[str(session_id)]}, ' + f'history_tokens={self.id2step[session_id]}, ' f'input_tokens={len(input_ids)}, ' f'max_new_tokens={gen_config.max_new_tokens}, ' f'seq_start={sequence_start}, seq_end={sequence_end}, ' @@ -548,94 +705,132 @@ async def generate( if gen_config.max_new_tokens is None: # for interactive endpoint, will try maximum possible token num gen_config.max_new_tokens = max( - 128, self.session_len - self.id2step[str(session_id)] - - len(input_ids)) - elif self.id2step[str(session_id)] + len( + 128, + self.session_len - self.id2step[session_id] - len(input_ids)) + elif self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: gen_config.max_new_tokens = max( - self.session_len - self.id2step[str(session_id)] - - len(input_ids), 128) + self.session_len - self.id2step[session_id] - len(input_ids), + 128) logger.error( f'Truncate max_new_tokens to {gen_config.max_new_tokens}') - if self.id2step[str(session_id)] + len( + if self.id2step[session_id] + len( input_ids) + gen_config.max_new_tokens > self.session_len: logger.error(f'run out of tokens. session_id={session_id}.') - yield GenOut('', self.id2step[str(session_id)], len(input_ids), 0, + yield GenOut('', self.id2step[session_id], len(input_ids), 0, 'length') if sequence_end is True and sequence_start is False: await self.end_session(session_id) - else: - - def is_error(status): - return status not in [ - ResponseType.SUCCESS, ResponseType.FINISH - ] - - generator = await self.get_generator(False, session_id) - async with self.safe_run(session_id): - state = DetokenizeState(len(input_ids)) - start_ids_offset = state.ids_offset - response = '' - async for outputs in generator.async_stream_infer( - session_id=session_id, - **prompt_input, - gen_config=gen_config, - adapter_name=adapter_name, - stream_output=stream_response, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=self.id2step[str(session_id)]): + return + + def is_error(status): + return status not in [ResponseType.SUCCESS, ResponseType.FINISH] + + # used to skip / rewind stop words in interactive mode + stop_ids = [] + if skip_stop_tokens and not gen_config.ignore_eos: + stop_ids = gen_config.stop_token_ids or [] + if self.tokenizer.eos_token_id not in stop_ids: + stop_ids.append(self.tokenizer.eos_token_id) + + async with self.model_inst(session_id) as inst: + token_ids = input_ids.copy() + history_len = self.id2step[session_id] + input_len = len(input_ids) + output_len, gen_len = 0, 0 + state = DetokenizeState(len(input_ids)) + start_ids_offset = state.ids_offset + response = '' + async with self.safe_run(inst, + session_id=session_id, + **prompt_input, + gen_config=gen_config, + adapter_name=adapter_name, + stream_output=stream_response, + sequence_start=sequence_start, + sequence_end=sequence_end, + step=history_len) as gen: + prev_len = 0 + hit_stop_token = 0 + async for outputs in gen: # decode res if is_error(outputs.status): - tokens = 0 break - res, tokens = input_ids + outputs.token_ids, outputs.num_token # noqa - if len(res) <= state.ids_offset: + + 
output_len = outputs.num_token + + if hit_stop_token or prev_len == output_len: continue + # This assumes the engine will stop when stop token is hit + if output_len and outputs.token_ids[-1] in stop_ids: + hit_stop_token = 1 + # one token and it's been skipped + if output_len == prev_len + 1: + continue + + mask = slice(prev_len - output_len, + output_len - hit_stop_token) + + token_ids += outputs.token_ids[mask] + gen_len = len(token_ids) - input_len + + prev_len = output_len + ids_offset = state.ids_offset response, state = self.tokenizer.detokenize_incrementally( - res, + token_ids, state, skip_special_tokens=gen_config.skip_special_tokens) + res = token_ids[ids_offset:] + + out = GenOut(response, history_len, input_len, gen_len, + finish_reason, res) - res = res[ids_offset:] - logprobs = None - if outputs.logprobs: + if outputs.logprobs is not None: log_offset = ids_offset - start_ids_offset - logprobs = outputs.logprobs[log_offset:] + out.logprobs = outputs.logprobs[log_offset:] + if outputs.last_hidden_state is not None: + out.last_hidden_state = outputs.last_hidden_state + if hit_stop_token: + out.last_hidden_state = \ + out.last_hidden_state[:-hit_stop_token] + if outputs.logits is not None: + out.logits = outputs.logits + if hit_stop_token: + out.logits = out.logits[:-hit_stop_token] + + yield out + # end of generator loop - # response, history token len, - # input token len, gen token len - yield GenOut(response, self.id2step[str(session_id)], - len(input_ids), tokens, finish_reason, res, - logprobs) if not is_error(outputs.status): finish_reason = 'length' \ - if tokens >= gen_config.max_new_tokens else 'stop' + if gen_len >= gen_config.max_new_tokens else 'stop' # utf-8 char at the end means it's a potential unfinished # byte sequence if not response.endswith('�'): - # avaid returning the last response twice + # avoid returning the last response twice response = '' - yield GenOut(response, self.id2step[str(session_id)], - len(input_ids), tokens, finish_reason) + yield GenOut(response, self.id2step[session_id], + len(input_ids), gen_len, finish_reason) else: - yield GenOut( - response='internal error happened', - history_token_len=self.id2step[str(session_id)], - input_token_len=len(input_ids), - generate_token_len=0, - finish_reason='error', - token_ids=[]) - # update step - self.id2step[str(session_id)] += len(input_ids) + tokens - if sequence_end: - self.id2step[str(session_id)] = 0 - # manually end pytorch session - # TODO modify pytorch or turbomind api - if self.backend == 'pytorch' and sequence_end: - await self.end_session(session_id) + yield GenOut(response='internal error happened', + history_token_len=self.id2step[session_id], + input_token_len=len(input_ids), + generate_token_len=0, + finish_reason='error', + token_ids=[]) + # update step + if sequence_end: + self.id2step[session_id] = 0 + if self.backend == 'pytorch': + # manually end pytorch session + await inst.async_end(session_id) + else: + if rewind_stop_tokens: + # rewind the step to the token before the stop token + output_len = gen_len + self.id2step[session_id] += input_len + output_len def parse_tool_response(self, text, tools, **kwargs): """Parse model response containing tool information. 
@@ -684,12 +879,28 @@ def parse_tool_response(self, text, tools, **kwargs): for call_info in call_info_list] return text, call_info_list + def _run(self, fn=None, coro=None, loop=None): + assert (fn or coro) and not (fn and coro) + loop = loop or self.internal_thread.loop + if fn: + + async def _coro(): + return fn() + + coro = _coro() + return asyncio.run_coroutine_threadsafe(coro, loop) + + def session(self, gen_config: GenerationConfig = None): + return Session(self._run(fn=lambda: next(self._session_id)).result(), + engine=self, + gen_config=gen_config) + def chat(self, prompt: str, session=None, gen_config: Optional[GenerationConfig] = None, - do_preprocess: bool = True, - **kwargs) -> Session: + stream_response=False, + **kwargs) -> Union[Session, Iterator]: """Chat. Args: @@ -702,8 +913,7 @@ def chat(self, **kwargs (dict): ad hoc parametrization of `gen_config """ if session is None: - session = Session() - session._engine = self.engine + session = self.session() # sync & init session._prompt = prompt @@ -711,25 +921,35 @@ def chat(self, sequence_start = session._step == 0 - async def _work(): - resp = Response('', -1, -1, session._id) - async for output in self.generate(prompt, - session_id=session._id, - gen_config=gen_config, - stream_response=False, - sequence_start=sequence_start, - sequence_end=False, - step=session._step, - do_preprocess=do_preprocess, - **kwargs): - resp = session._merge_response(resp, output) - return resp - - from lmdeploy.pytorch.engine.request import _run_until_complete - resp = _run_until_complete(_work()) - - session._response = resp - session._step += resp.generate_token_len + resp.input_token_len - session.history.append((session._prompt, resp.text)) + generator = self.infer(prompt, + gen_config, + sequence_start=sequence_start, + sequence_end=False, + session_id=session._id, + stream_response=stream_response, + multiplex=True) + + def _gen(): + resp = None + try: + for out in generator: + resp = _append_response(resp, out) + yield out + except: # noqa + self._run(coro=self.stop_session(session._id)).result() + raise + else: + session._response = resp + session._step += resp.generate_token_len + resp.input_token_len + session.history.append((session._prompt, resp.text)) + + if stream_response: + session.generator = _gen() + else: + # run the generator until finish + with closing(_gen()) as gen: + for _ in gen: + pass + session.generator = None return session diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index b23ef3018d..c37f7572a1 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -340,8 +340,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ -596,8 +595,7 @@ async def completions_v1(request: CompletionRequest, error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - if VariableInterface.async_engine.id2step.get(str(request.session_id), - 0) != 0: + if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: return create_error_response( HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') @@ 
-865,11 +863,22 @@ async def chat_interactive_v1(request: GenerateRequest, request.session_id = VariableInterface.session_id async_engine = VariableInterface.async_engine - sequence_start = async_engine.id2step.get(str(request.session_id), 0) == 0 + sequence_start = async_engine.id2step.get(request.session_id, 0) == 0 sequence_end = not request.interactive_mode if isinstance(request.stop, str): request.stop = [request.stop] + end_session = sequence_end and not sequence_start \ + and request.prompt == '' and request.request_output_len == 0 + if end_session: + await async_engine.end_session(request.session_id) + return JSONResponse( + dict(text='', + tokens=0, + input_tokens=0, + history_tokens=0, + finish_reason=None)) + random_seed = request.seed if request.seed else None gen_config = GenerationConfig( diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index e106beae17..6985e3dc27 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import asyncio import os import random @@ -28,6 +29,28 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) +async def async_infer(generator, session_id, input_ids, gen_config, + sequence_start, step, stream_output, tokenizer, state): + token_ids = input_ids.copy() + prev_len = 0 + async for output in generator.async_stream_infer( + session_id=session_id, + input_ids=input_ids, + gen_config=gen_config, + sequence_start=sequence_start, + sequence_end=False, + step=step, + stream_output=stream_output): + tokens = output.num_token + if tokens > prev_len: + token_ids += output.token_ids[prev_len - tokens:] + response, state = tokenizer.detokenize_incrementally(token_ids, + state=state) + prev_len = tokens + print(response, end='', flush=True) + return tokens + + def main(model_path: str, session_id: int = 1, top_k: float = 40, @@ -130,6 +153,9 @@ def main(model_path: str, repetition_penalty=repetition_penalty, stop_token_ids=stop_words) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -138,7 +164,7 @@ def main(model_path: str, if prompt == 'exit': exit(0) elif prompt == 'end': - generator.end(session_id) + loop.run_until_complete(generator.async_end(session_id)) nth_round = 1 step = 0 seed = random.getrandbits(64) @@ -149,10 +175,8 @@ def main(model_path: str, if model.capability == 'chat': sequence_start = (nth_round == 1) - sequence_end = False else: sequence_start = True - sequence_end = True step = 0 if step + len( @@ -163,20 +187,11 @@ def main(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) - for outputs in generator.stream_infer( - session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stream_output=stream_output): - - res, tokens = input_ids + outputs.token_ids, outputs.num_token - # decode res - response, state = tokenizer.detokenize_incrementally( - res, state=state) - print(response, end='', flush=True) + + coro = async_infer(generator, session_id, input_ids, gen_config, + sequence_start, step, stream_output, tokenizer, + state) + tokens = loop.run_until_complete(coro) # update step step += len(input_ids) + tokens diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index a1b2fff944..4a15dd6841 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -4,10 +4,12 @@ 
import json import os.path as osp import sys +from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict +from functools import partial from itertools import repeat -from queue import LifoQueue, Queue +from queue import Queue from typing import Dict, Iterable, List import numpy as np @@ -317,6 +319,93 @@ def create_instance(self, cuda_stream_id=0): return TurboMindInstance(self, self.config, cuda_stream_id) +def _get_logits(outputs, offset: int): + logits = outputs['logits'] + + def _func(out: EngineOutput, step: int): + out.logits = logits[:step - offset - 1, :] + + return _func + + +def _get_last_hidden_state(outputs, offset: int): + last_hidden_state = outputs['last_hidden_state'] + print(f'last_hidden_state.shape = {last_hidden_state.shape}') + + def _func(out: EngineOutput, step: int): + out.last_hidden_state = last_hidden_state[:step - offset - 1, :] + + return _func + + +def _get_logprobs_impl(logprob_vals: torch.Tensor, + logprob_idxs: torch.Tensor, + logprob_nums: torch.Tensor, + output_ids: List[int], + logprobs: int, + out_logprobs: List[Dict[int, float]] = None): + length = len(output_ids) + offset = len(out_logprobs) + if length == offset: + return out_logprobs + for (pos, idx, val, n) in zip(range(offset, + length), logprob_idxs[offset:length], + logprob_vals[offset:length], + logprob_nums[offset:length]): + topn = min(n.item(), logprobs) + tok_res = {idx[i].item(): val[i].item() for i in range(topn)} + token_id = output_ids[pos] + if token_id not in tok_res: + print(token_id, tok_res) + valid_n = n.item() + tok_res[token_id] = \ + val[:valid_n][idx[:valid_n] == token_id].item() + ids = list(tok_res.keys()) + for k in ids: + if tok_res[k] == float('-inf'): + tok_res.pop(k) + out_logprobs.append(tok_res) + return out_logprobs + + +def _get_logprobs(outputs, output_logprobs: int): + logprob_vals = outputs['logprob_vals'] + logprob_idxs = outputs['logprob_indexes'] + logprob_nums = outputs['logprob_nums'] + + logprobs = [] + + def _func(out: EngineOutput, step: int): + _get_logprobs_impl(logprob_vals, logprob_idxs, logprob_nums, + out.token_ids, output_logprobs, logprobs) + out.logprobs = logprobs + + return _func + + +class StreamingSemaphore: + + def __init__(self): + self.loop = asyncio.get_running_loop() + self.fut = None + self.val = 0 + + async def acquire(self): + if self.val: + self.val = 0 + return + self.fut = self.loop.create_future() + await self.fut + self.fut = None + self.val = 0 + + def release(self): + if not self.val: + self.val = 1 + if self.fut: + self.fut.set_result(None) + + class TurboMindInstance: """Instance of TurboMind. 
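
Note on the `StreamingSemaphore` added above: it is a single-permit wake-up latch rather than a counting semaphore. The engine's signaling thread hops onto the asyncio loop with `call_soon_threadsafe(release)`, and a burst of signals collapses into a single wake-up, so the awaiting coroutine re-reads the latest shared state instead of handling one item per signal. Below is a minimal usage sketch, illustration only and not part of the patch; it assumes the `StreamingSemaphore` class above is in scope, and `worker`/`state` are made-up stand-ins for the engine thread and its shared state.

import asyncio
import threading
import time

async def main():
    sem = StreamingSemaphore()  # must be created on the running event loop
    state = {'step': 0, 'done': False}  # stand-in for the engine's shared state

    def worker():
        # Stand-in for the engine: update the shared state, then signal the
        # loop the same way async_signal_cb does.
        for i in range(1, 4):
            time.sleep(0.1)
            state['step'] = i
            sem.loop.call_soon_threadsafe(sem.release)
        state['done'] = True
        sem.loop.call_soon_threadsafe(sem.release)

    threading.Thread(target=worker, daemon=True).start()

    while not state['done']:
        await sem.acquire()  # one wake-up per burst of signals
        print('latest step:', state['step'])

asyncio.run(main())
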
@@ -343,116 +432,30 @@ def __init__(self, # create model instances self.model_inst = self._create_model_instance(0) - self.que = Queue() - self.executor: ThreadPoolExecutor = None - self.future = None self.config = config + self.lock = None def _create_model_instance(self, device_id): - rank = self.node_id * self.gpu_count + device_id - model_inst = self.tm_model.model_comm.create_model_instance( - device_id, rank, self.cuda_stream_id, self.nccl_params) + model_inst = self.tm_model.model_comm.create_model_instance(device_id) return model_inst - def _forward_callback(self, result, ctx): - self.que.put((False, result)) - - def _forward_thread(self, inputs): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - self.que.put((-1, None)) - return - self.que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - - def _async_forward_callback(self, result, ctx, que: LifoQueue): - que.put((False, result)) - - def _async_forward_thread(self, inputs, que: LifoQueue): - - def _func(): - try: - output = self.model_inst.forward(inputs) - except Exception as e: - logger.error(f'unhandled exception: {e}') - que.put((-1, None)) - return - que.put((True, output)) - - self.executor = ThreadPoolExecutor(1) - self.future = self.executor.submit(_func) - - def _get_logprobs(self, - logprob_vals: torch.Tensor, - logprob_indexes: torch.Tensor, - logprob_nums: torch.Tensor, - output_ids: torch.Tensor, - logprobs: int = None, - length: int = None, - out_logprobs: List[Dict[int, float]] = None, - session_id: int = None): - if logprobs is None: - return None - if out_logprobs is None: - out_logprobs = [] - if len(output_ids) <= len(out_logprobs): - return out_logprobs - offset = len(out_logprobs) - for (token_id, idx, val, n) in zip(output_ids[offset:length], - logprob_indexes[offset:length], - logprob_vals[offset:length], - logprob_nums[offset:length]): - topn = min(n.item(), logprobs) - tok_res = {idx[i].item(): val[i].item() for i in range(topn)} - if token_id.item() not in tok_res: - valid_n = n.item() - tok_res[token_id.item()] = \ - val[:valid_n][idx[:valid_n] == token_id].item() - ids = list(tok_res.keys()) - for k in ids: - if tok_res[k] == float('-inf'): - tok_res.pop(k) - out_logprobs.append(tok_res) - return out_logprobs + def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], + gen_config: GenerationConfig, + input_len: int): + + def _get_offset(type): + return input_len - 1 if type == 'generation' else 0 - def end(self, session_id: int): - """End the given session.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - end_generator = self.tm_model.create_instance() - for outputs in end_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_end(self, session_id: int): - """End the given session.""" - self.end(session_id) - - def cancel(self, session_id: int): - """Stop current streaming inference.""" - input_ids = [self.tm_model.tokenizer.eos_token_id] - stop_generator = self.tm_model.create_instance() - for outputs in stop_generator.stream_infer( - session_id, - input_ids, - sequence_start=False, - sequence_end=False, - stop=True, - gen_config=GenerationConfig(max_new_tokens=0)): - pass - - async def async_cancel(self, session_id: int): - """End the given session.""" - self.cancel(session_id) + fs = [] + if gen_config.output_logits: 
+ offset = _get_offset(gen_config.output_logits) + fs.append(_get_logits(outputs, offset)) + if gen_config.output_last_hidden_state: + offset = _get_offset(gen_config.output_last_hidden_state) + fs.append(_get_last_hidden_state(outputs, offset)) + if gen_config.logprobs: + fs.append(_get_logprobs(outputs, gen_config.logprobs)) + return fs def prepare_embeddings(self, input_embeddings=None, @@ -506,61 +509,17 @@ def prepare_embeddings(self, return input_embeddings, input_embedding_ranges def prepare_inputs(self, - session_id, input_ids, gen_config: GenerationConfig, input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False): + input_embedding_ranges=None): """Convert inputs format.""" - if len(input_ids) == 0: - input_ids = [[]] - if isinstance(input_ids[0], int): - input_ids = [input_ids] + assert isinstance(input_ids, Sequence) - batch_size = len(input_ids) + input_ids = torch.IntTensor(input_ids) + input_len = len(input_ids) - def _broadcast_np(data, dtype, shape=(batch_size, )): - if isinstance(data, Iterable): - assert len(data) == batch_size - return data - - return np.full(shape, data, dtype=dtype) - - input_ids = [torch.IntTensor(ids) for ids in input_ids] - input_lengths = torch.IntTensor([len(ids) for ids in input_ids]) - input_ids = pad_sequence(input_ids, - batch_first=True, - padding_value=self.eos_id) - - if isinstance(session_id, int): - session_id = [session_id] - assert len(session_id) == batch_size - - step = _broadcast_np(step, np.int32) - - inputs = dict( - input_ids=input_ids, - input_lengths=input_lengths, - request_output_len=np.full(input_lengths.shape, - gen_config.max_new_tokens, - dtype=np.uint32), - runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32), - runtime_top_p=_broadcast_np(gen_config.top_p, np.float32), - runtime_min_p=_broadcast_np(gen_config.min_p, np.float32), - temperature=_broadcast_np(gen_config.temperature, np.float32), - repetition_penalty=_broadcast_np(gen_config.repetition_penalty, - np.float32), - step=step, - - # session input - START=_broadcast_np((1 if sequence_start else 0), np.int32), - END=_broadcast_np((1 if sequence_end else 0), np.int32), - CORRID=np.array(session_id, dtype=np.uint64), - STOP=_broadcast_np((1 if stop else 0), np.int32)) + inputs = dict(input_ids=input_ids, ) input_embeddings, input_embedding_ranges = self.prepare_embeddings( input_embeddings, input_embedding_ranges) @@ -568,17 +527,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): inputs['input_embeddings'] = input_embeddings inputs['input_embedding_ranges'] = input_embedding_ranges - if gen_config.min_new_tokens is not None: - inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens, - np.int32) - - if gen_config.logprobs is not None and gen_config.logprobs > 0: - if gen_config.logprobs > MAX_LOGPROBS: - gen_config.logprobs = MAX_LOGPROBS - logger.warning('logprobs shoudd be in range [1, 1024]' - f'update logprobs={gen_config.logprobs}') - inputs['logprobs'] = _broadcast_np(gen_config.logprobs, np.int32) - bad_words = [] if gen_config.bad_token_ids is not None: bad_words.extend(gen_config.bad_token_ids) @@ -597,10 +545,24 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): if bad_words is not None: inputs['bad_words_list'] = bad_words - if gen_config.random_seed is not None: - inputs['random_seed'] = _broadcast_np(gen_config.random_seed, - np.uint64) - return inputs, input_lengths + return inputs, input_len + + async def async_cancel(self, session_id: int = None): 
+ self.model_inst.cancel() + + def async_end_cb(self, fut: asyncio.Future, status: int): + """executing on engine's signaling thread.""" + logger.info(f'[async_end_cb] session ended, status = {status}') + fut.get_loop().call_soon_threadsafe(fut.set_result, status) + + async def async_end(self, session_id): + fut = asyncio.get_running_loop().create_future() + self.model_inst.end(partial(self.async_end_cb, fut), session_id) + await fut + + def async_signal_cb(self, s: StreamingSemaphore): + """executing on engine's signaling thread.""" + s.loop.call_soon_threadsafe(s.release) async def async_stream_infer(self, session_id, @@ -610,7 +572,6 @@ async def async_stream_infer(self, sequence_start: bool = True, sequence_end: bool = False, step=0, - stop=False, gen_config: GenerationConfig = None, stream_output=False, **kwargs): @@ -630,219 +591,119 @@ async def async_stream_infer(self, stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ - # start forward thread - que = LifoQueue() - from functools import partial - _forward_callback = partial(self._async_forward_callback, que=que) - _forward_thread = partial(self._async_forward_thread, que=que) - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(_forward_callback) - - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, - gen_config=gen_config) + logger.info(f'[async_stream_infer] session {session_id} start') + gen_cfg = self._get_generation_config(gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - _forward_thread(tm_inputs) - - seq_start = input_lengths + input_lengths.new_tensor(step) - - out_logprobs = None - prev_len = 0 - # generator - while True: - while que.qsize() == 0: # let other requests in - await asyncio.sleep(0.002) - - finish, tm_outputs = que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break - - outputs = _tm_dict_to_torch_dict(tm_outputs) - - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_) - else: - outputs = EngineOutput(status, output.tolist(), len_) - if outputs.num_token < prev_len and not finish: - 
continue - else: - prev_len = outputs.num_token - - if out_logprobs: - output_token_len = len(outputs.token_ids) - outputs.logprobs = out_logprobs[:output_token_len] - - yield outputs - - if finish: - self.future.result() - self.executor.shutdown() - break - - if stream_output and not stop: - logger.info(f'UN-register stream callback for {session_id}') - self.model_inst.unregister_callback() - - def stream_infer(self, - session_id, - input_ids, - input_embeddings=None, - input_embedding_ranges=None, - sequence_start: bool = True, - sequence_end: bool = False, - step=0, - stop=False, - gen_config: GenerationConfig = None, - stream_output=False, - **kwargs): - """Perform model inference. - - Args: - session_id (int): the id of a session - input_ids (numpy.ndarray): the token ids of a prompt - input_embeddings (List[numpy.ndarray]): embeddings features - input_embedding_ranges (List[Tuple[int,int]]): the begin/end - offsets of input_embeddings to input_ids - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - step (int): the offset of the k/v cache - stop (bool): indicator for cancelling the session - gen_config (GenerationConfig): generation config - stream_output (bool): indicator for stream output - kwargs (dict): kwargs for backward compatibility - """ - if stream_output and not stop: - logger.info(f'Register stream callback for {session_id}') - self.model_inst.register_callback(self._forward_callback) - - inputs, input_lengths = self.prepare_inputs( - session_id=session_id, + inputs, input_len = self.prepare_inputs( input_ids=input_ids, input_embeddings=input_embeddings, input_embedding_ranges=input_embedding_ranges, - sequence_start=sequence_start, - sequence_end=sequence_end, - step=step, - stop=stop, gen_config=gen_config) - tm_inputs = _np_dict_to_tm_dict(inputs) - # start forward thread - self.que = Queue() - self._forward_thread(tm_inputs) - - seq_start = input_lengths + input_lengths.new_tensor(step) - out_logprobs = None - - # generator - while True: - while self.que.qsize() > 1: - self.que.get() - - finish, tm_outputs = self.que.get() - if finish < 0: - yield EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR, - token_ids=[], - num_token=0) - self.executor.shutdown() - break - - outputs = _tm_dict_to_torch_dict(tm_outputs) - - output_ids = outputs['output_ids'][:, 0, :] - sequence_length = outputs['sequence_length'].long()[:, 0] - output_ids = [ - output_id[s:l] for output_id, s, l in zip( - output_ids, seq_start, sequence_length) - ] - sequence_length -= seq_start.to(sequence_length.device) - - if 'logprob_vals' in outputs: - logprob_vals = outputs['logprob_vals'][0, 0] - logprob_indexes = outputs['logprob_indexes'][0, 0] - logprob_nums = outputs['logprob_nums'][0, 0] - out_logprobs = self._get_logprobs(logprob_vals, - logprob_indexes, - logprob_nums, output_ids[0], - gen_config.logprobs, - sequence_length.cpu().item(), - out_logprobs, session_id) - - outputs = [] - status = ResponseType.FINISH if finish else ResponseType.SUCCESS - for output, len_ in zip(output_ids, sequence_length): - output, len_ = output, len_.item() - if len(output) > 0 and output[-1].item() == self.eos_id \ - and not gen_config.ignore_eos: - outputs = EngineOutput(status, output[:-1].tolist(), - len_ - 1, out_logprobs) - elif len(output) > 0 and \ - gen_config.stop_token_ids is not None and \ - output[-1].item() in gen_config.stop_token_ids: - outputs = EngineOutput(status, output[:-1].tolist(), len_, - out_logprobs) - else: - outputs = 
EngineOutput(status, output.tolist(), len_,
-                                           out_logprobs)
-
-            if out_logprobs:
-                output_token_len = len(outputs.token_ids)
-                outputs.logprobs = out_logprobs[:output_token_len]
-
-            yield outputs
-
-            if finish:
-                self.future.result()
-                self.executor.shutdown()
-                while self.que.qsize() > 0:
-                    self.que.get()
-                break
-
-        if stream_output and not stop:
-            logger.info(f'UN-register stream callback for {session_id}')
-            self.model_inst.unregister_callback()
+        session = _tm.SessionParam(id=session_id,
+                                   step=step,
+                                   start=sequence_start,
+                                   end=sequence_end)
+
+        inputs = _np_dict_to_tm_dict(inputs)
+
+        sem = StreamingSemaphore()
+        signal_cb = partial(self.async_signal_cb, sem)
+
+        outputs, shared_state = self.model_inst.forward(
+            inputs, session, gen_cfg, stream_output, signal_cb)
+
+        outputs = _tm_dict_to_torch_dict(outputs)
+
+        extra_fs = self._get_extra_output_processors(outputs, gen_config,
+                                                     input_len)
+
+        output_ids_buf = outputs['output_ids']
+
+        finish = False
+        state = None
+
+        output_ids = []
+        output_len = 0
+        prev_len = step + input_len
+        try:
+            while True:
+                await sem.acquire()
+                state = shared_state.consume()
+
+                status, seq_len = state.status, state.seq_len
+
+                if status in [7, 8]:  # finish / canceled
+                    finish, status = True, 0
+                elif status:
+                    yield self._get_error_output()
+                    break
+
+                if seq_len == prev_len and not finish:
+                    continue
+
+                output_ids += output_ids_buf[prev_len:seq_len].tolist()
+                output_len += seq_len - prev_len
+                status = ResponseType.FINISH if finish else ResponseType.SUCCESS  # noqa
+                output = EngineOutput(status, output_ids, output_len)
+
+                for f in extra_fs:
+                    f(output, seq_len)
+
+                prev_len = seq_len
+
+                yield output
+
+                if finish:
+                    break
+
+        except (GeneratorExit, asyncio.CancelledError) as e:
+            logger.info(f'[async_stream_infer] {type(e).__name__}')
+            self.model_inst.cancel()
+        except Exception as e:
+            logger.error(f'[async_stream_infer] {type(e).__name__} {e}')
+            self.model_inst.cancel()
+            yield self._get_error_output()
+        finally:
+            # Contract: `cb` won't be called again if status is non-zero
+            # wait for status to be set as `finish` or `error`
+            while not state or state.status == 0:
+                await sem.acquire()
+                state = shared_state.consume()
+            logger.info(f'[async_stream_infer] session {session_id} done')
+
+    def _get_error_output(self):
+        return EngineOutput(status=ResponseType.INTERNAL_ENGINE_ERROR,
+                            token_ids=[],
+                            num_token=0)
+
+    def _get_generation_config(self, cfg: GenerationConfig):
+        c = _tm.GenerationConfig()
+        c.max_new_tokens = cfg.max_new_tokens
+        c.top_k = cfg.top_k
+        c.top_p = cfg.top_p
+        c.min_p = cfg.min_p
+        c.temperature = cfg.temperature
+        c.repetition_penalty = cfg.repetition_penalty
+        if cfg.min_new_tokens:
+            c.min_new_tokens = cfg.min_new_tokens
+        output_type = dict(all=1, generation=2)
+        if cfg.output_last_hidden_state:
+            c.output_last_hidden_state = output_type[
+                cfg.output_last_hidden_state]
+        if cfg.output_logits:
+            c.output_logits = output_type[cfg.output_logits]
+        if cfg.logprobs:
+            if cfg.logprobs > MAX_LOGPROBS:
+                cfg.logprobs = MAX_LOGPROBS
+            logger.warning(
+                f'logprobs should be in range [1, {MAX_LOGPROBS}], '
+                f'update logprobs={cfg.logprobs}')
+            c.output_logprobs = cfg.logprobs
+        if cfg.random_seed is not None:
+            c.random_seed = cfg.random_seed
+        # print (c)
+        return c
 
     def decode(self, input_ids,
diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt
index aec443a1aa..62adb94e5a 100644
--- a/src/turbomind/CMakeLists.txt
+++ b/src/turbomind/CMakeLists.txt
@@ -16,6 +16,7 @@ add_subdirectory(utils)
add_subdirectory(kernels) add_subdirectory(layers) add_subdirectory(models) +add_subdirectory(engine) if(BUILD_PYT) add_subdirectory(th_op) endif() diff --git a/src/turbomind/engine/CMakeLists.txt b/src/turbomind/engine/CMakeLists.txt new file mode 100644 index 0000000000..1d68116cf6 --- /dev/null +++ b/src/turbomind/engine/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +cmake_minimum_required(VERSION 3.8) + +add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) +set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/engine/gateway.cc b/src/turbomind/engine/gateway.cc new file mode 100644 index 0000000000..e949ec7cd3 --- /dev/null +++ b/src/turbomind/engine/gateway.cc @@ -0,0 +1,40 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request_queue.h" + +namespace turbomind { + +Gateway::Gateway(std::function()> ctx_factory): request_queue_{this}, ctx_factory_{ctx_factory} +{ + signal_thread_ = std::thread(&Gateway::signal_thread_entry, this); +} + +void Gateway::shutdown() +{ + request_queue_.close(); + signal_buffer_.close(); + + signal_thread_.join(); +} + +void Gateway::signal_thread_entry() noexcept +{ + while (true) { + bool abort{}; + std::vector signals = signal_buffer_.take_all(abort); + if (abort) { + break; + } + else { + auto ctx = ctx_factory_(); + for (const auto& s : signals) { + s(); + } + } + } +} + +} // namespace turbomind diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h new file mode 100644 index 0000000000..d939c0bcc2 --- /dev/null +++ b/src/turbomind/engine/gateway.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/signal_buffer.h" + +namespace turbomind { + +class Gateway { +public: + Gateway(std::function()> ctx_factory); + + void shutdown(); + + void push(std::vector> reqs) + { + return request_queue_.push(std::move(reqs)); + } + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) + { + return request_queue_.pop(infer_reqs, kill_reqs, max_infer_num, blocking, abort); + } + + void cancel(std::shared_ptr req) + { + return request_queue_.cancel(std::move(req)); + } + + void kill(std::shared_ptr req) + { + return request_queue_.kill(std::move(req)); + } + + void notify(std::vector signals) + { + return signal_buffer_.push(std::move(signals)); + } + +private: + void signal_thread_entry() noexcept; + +private: + RequestQueue request_queue_; + SignalBuffer signal_buffer_; + + std::function()> ctx_factory_; + + std::thread signal_thread_; +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc new file mode 100644 index 0000000000..6ba355e896 --- /dev/null +++ b/src/turbomind/engine/model_request.cc @@ -0,0 +1,174 @@ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/engine/model_request.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/constant.h" +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +static ManagedTensor create(DataType dtype, MemoryType where, const std::vector& size, int64_t& byte_size) +{ + byte_size = std::accumulate(size.begin(), size.end(), Tensor::getTypeSize(dtype), std::multiplies<>{}); + void* data{}; + + if (where == MEMORY_GPU) { + check_cuda_error(cudaMallocAsync(&data, byte_size, nullptr)); + } + else { + data = std::malloc(byte_size); + } + + ManagedTensor ret; + ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; + ret.data_holder.reset((void*)nullptr, [data, where](auto) { + // std::cerr << "turbomind tensor deallocate" << std::endl; + if (where == MEMORY_GPU) { + /// TODO: guard device id + check_cuda_error(cudaFreeAsync(data, nullptr)); + } + else { + std::free(data); + } + }); + return ret; +} + +template +static T get(const std::unordered_map& m, const std::string& key, T fallback = {}) +{ + auto it = m.find(key); + if (it != m.end()) { + return it->second->getVal(); + } + return fallback; +} + +ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim): + gateway_{gateway}, + data_type_{data_type}, + session_len_{session_len}, + vocab_size_{vocab_size}, + hidden_dim_{hidden_dim} +{ +} + +void ModelRequest::Cancel() +{ + // request is finished if lock failed + if (auto r = request_.lock()) { + gateway_->cancel(std::move(r)); + } +} + +void ModelRequest::End(std::function cb, uint64_t session_id) +{ + auto r = std::make_shared(); + + r->id = r->session.id = session_id; + r->session.kill_flag = true; + + r->end_cb = std::move(cb); + + gateway_->kill(std::move(r)); +} + +auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam +{ + inputs_ = std::make_shared(); + outputs_ = std::make_shared(); + + auto add = [](auto& dest, auto key, auto dtype, auto where, auto shape, auto&&... 
dims) { + std::vector shape_; + if constexpr (std::is_integral_v) { + shape_ = {shape, dims...}; + } + else { + shape_ = {shape.cbegin(), shape.cend()}; + } + int64_t byte_size{}; + auto it = dest->emplace(key, create(dtype, where, shape_, byte_size)).first; + return std::make_pair(it->second->data, byte_size); + }; + + auto& inputs = *param.tensors; + + FT_CHECK(inputs.at("input_ids")->shape.size() == 1); + + const int input_len = inputs.at("input_ids")->shape[0]; + const int output_len = param.gen_cfg.max_new_tokens; + + // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` + // is used instead + const int max_seq_len = session_len_ + 1; + const int max_out_len = std::min(output_len, session_len_) + 1; + // This does not include histroy length in interactive mode + const int max_in_out_len = std::min(input_len + output_len, session_len_) + 1; + + for (auto& [k, v] : *param.tensors) { + inputs_->emplace(k, v); + } + + add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, max_seq_len); + add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); + + if (param.gen_cfg.output_logits) { + const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len : max_out_len; + add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, len, vocab_size_); + } + + if (param.gen_cfg.output_last_hidden_state) { + const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len : max_out_len; + add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); + } + + if (param.gen_cfg.output_logprobs) { + add(outputs_, "logprob_vals", TYPE_FP32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_out_len); + } + + auto r = std::make_shared(); + + for (const auto& [k, v] : *inputs_) { + r->inputs.insert(k, *v); + } + for (const auto& [k, v] : *outputs_) { + r->outputs.insert(k, *v); + } + + auto state = std::make_shared(); + + if (param.session.start_flag) { + session_id_ = param.session.id; + } + + r->id = param.session.id; + r->session = param.session; + r->gen_cfg = param.gen_cfg; + r->stream_output = param.stream_output; + r->forward_cb = std::move(cb); + r->state = state; + + r->output_ids = *outputs_->at("output_ids"); + r->sequence_length = *outputs_->at("sequence_length"); + + // Keep a weak reference for canceling the request + request_ = r; + + gateway_->push({std::move(r)}); + + return OutputParam{outputs_, state}; +} + +} // namespace turbomind diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h new file mode 100644 index 0000000000..aea889e856 --- /dev/null +++ b/src/turbomind/engine/model_request.h @@ -0,0 +1,59 @@ + + +#pragma once + +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/utils/Tensor.h" + +namespace turbomind { + +class ModelRequest { +public: + virtual ~ModelRequest() = default; + + ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim); + + // Cancel running request + void Cancel(); + + // Reset the channel to uninitailized state, calls `notify` when done + void End(std::function cb, uint64_t session_id); + + using TensorMap_ = std::unordered_map; + + struct InputParam { + std::shared_ptr tensors; + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + }; + + struct OutputParam { + std::shared_ptr 
tensors; + std::shared_ptr state; + }; + + OutputParam Forward(InputParam param, std::function cb); + +protected: + Gateway* const gateway_; + + const DataType data_type_; + + const int session_len_; + const int hidden_dim_; + const int vocab_size_; + + uint64_t session_id_; + + std::weak_ptr request_; + + std::shared_ptr inputs_; // owned by caller + std::shared_ptr outputs_; // owned by `this` +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h new file mode 100644 index 0000000000..6bf706c9b8 --- /dev/null +++ b/src/turbomind/engine/request.h @@ -0,0 +1,148 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "src/turbomind/utils/Tensor.h" + +namespace turbomind { + +struct GenerationConfig { + int max_new_tokens = 0; + int min_new_tokens = 0; + + int top_k = 1; + float top_p = 0.f; + float min_p = 0.f; + float temperature = 1.f; + + float repetition_penalty = 1.f; + + uint64_t random_seed = 0; + + int output_logprobs = 0; + + enum OutType + { + kNone = 0, + kAll = 1, + kGeneration = 2 + }; + int output_last_hidden_state = 0; + int output_logits = 0; +}; + +inline std::ostream& operator<<(std::ostream& os, const GenerationConfig& c) +{ + os << "GenerationConfig { "; + os << "max_new_tokens=" << c.max_new_tokens; + os << ", min_new_tokens=" << c.min_new_tokens; + os << ", top_p=" << c.top_p; + os << ", top_k=" << c.top_k; + os << ", min_p=" << c.min_p; + os << ", temperature=" << c.temperature; + os << ", repetition_penalty=" << c.repetition_penalty; + os << ", random_seed=" << c.random_seed; + os << ", output_logprobs=" << c.output_logprobs; + os << ", output_hidden_states=" << c.output_last_hidden_state; + os << ", output_logits=" << c.output_logits; + os << " }"; + return os; +} + +struct SessionParam { + uint64_t id; + + int step; + + bool start_flag; + bool end_flag; + bool kill_flag; +}; + +struct RequestState { + int status; + int seq_len; +}; + +struct AtomicRequestState { + + std::atomic data_; + + static_assert(std::atomic::is_always_lock_free); + + ~AtomicRequestState() + { + auto data = exchange(nullptr); + } + + std::unique_ptr exchange(RequestState* data) + { + return std::unique_ptr{data_.exchange(data, std::memory_order_acq_rel)}; + } +}; + +struct Request { + uint64_t id; // sequence id + uint64_t unique_id; // monotonic increasing + + SessionParam session; + GenerationConfig gen_cfg; + + bool stream_output; + + // reference to IO tensors + TensorMap inputs; + TensorMap outputs; + // fast path for accessing common output buffers + Tensor output_ids; + Tensor sequence_length; + + std::function end_cb; + + std::atomic cancel_flag; + bool is_canceled{}; + + std::function forward_cb; + + std::shared_ptr state; + + int ec; // set when disabling conflicting requests + + enum + { + kOk = 0, + kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set + kConflict = 2, // Concurrent requests to the same sequence + kBusy = 3, // Sequence is already running + kInactive = 4, // Sequence to `stop` is not active + kFail = 5, // Can't find sequence for `stop` request or internal error during inference + kTooLong = 6, // history + prompt > session_len, + kFinish = 7, + kCancel = 8, + }; +}; + +inline void UpdateState(Request& r, int status, int seq_len) +{ + try { + auto new_state = new RequestState{status, seq_len}; + auto old_state = r.state->exchange(new_state); + if (!old_state && r.forward_cb) { + r.forward_cb(); + } + 
} + catch (const std::exception& e) { + TM_LOG_ERROR("Error invoking callback for (%lu): %s", r.id, e.what()); + } + catch (...) { + TM_LOG_ERROR("Unknown error invoking callback for (%lu)", r.id); + } +} + +} // namespace turbomind diff --git a/src/turbomind/engine/request_queue.cc b/src/turbomind/engine/request_queue.cc new file mode 100644 index 0000000000..8c0b52b5bf --- /dev/null +++ b/src/turbomind/engine/request_queue.cc @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/engine/request_queue.h" +#include "src/turbomind/engine/gateway.h" + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +void RequestQueue::push(std::vector> reqs) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + for (auto& r : reqs) { + queue_.push(std::move(r)); + } + } + cv_.notify_one(); +} + +void RequestQueue::cancel(std::shared_ptr r) +{ + // -1 canceled + // 0 queued + // 1 active + if (r->cancel_flag.exchange(-1, std::memory_order_acq_rel) != 0) { + // request is picked up by engine + return; + } + else { + // not picked by engine yet, skip directly + gateway_->notify({[r = std::move(r)] { // + UpdateState(*r, Request::kCancel, 0); + }}); + } +} + +void RequestQueue::kill(std::shared_ptr r) +{ + { + std::lock_guard lock(mutex_); + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + kill_.push_back(std::move(r)); + } + cv_.notify_one(); +} + +void RequestQueue::pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort) +{ + std::unique_lock lock(mutex_); + + if (blocking) { + cv_.wait(lock, [this] { return !queue_.empty() || !kill_.empty() || closed_; }); + if (closed_) { + abort = true; + return; + } + } + + infer_reqs.clear(); + while (!queue_.empty() && infer_reqs.size() <= max_infer_num) { + auto& r = queue_.front(); + if (r->cancel_flag.exchange(1, std::memory_order_acq_rel) == 0) { + infer_reqs.push_back(std::move(r)); + } + else { + // Canceled requests are simply ignored + } + queue_.pop(); + } + + kill_reqs = std::move(kill_); +} + +void RequestQueue::close() +{ + { + std::lock_guard lock(mutex_); + closed_ = true; + } + cv_.notify_all(); +} + +} // namespace turbomind diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h new file mode 100644 index 0000000000..c029f38f4b --- /dev/null +++ b/src/turbomind/engine/request_queue.h @@ -0,0 +1,46 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +class Gateway; + +class RequestQueue { +public: + RequestQueue(Gateway* gateway): gateway_{gateway} {} + + void push(std::vector> reqs); + + void pop(std::vector>& infer_reqs, + std::vector>& kill_reqs, + unsigned max_infer_num, + bool blocking, + bool& abort); + + void cancel(std::shared_ptr r); + + void kill(std::shared_ptr r); + + void close(); + +private: + Gateway* gateway_; + + std::queue> queue_; + + std::vector> kill_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool closed_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/engine/signal_buffer.h b/src/turbomind/engine/signal_buffer.h new file mode 100644 index 0000000000..cb09be7909 --- /dev/null +++ b/src/turbomind/engine/signal_buffer.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +namespace turbomind { + +using Signal = std::function; + +class SignalBuffer { +public: + void push(std::vector signals) + { + if (signals.empty()) { + return; + } + { + std::lock_guard lock{mutex_}; + signals_.insert(signals_.end(), std::move_iterator{signals.begin()}, std::move_iterator{signals.end()}); + } + cv_.notify_one(); + } + + void close() + { + { + std::lock_guard lock{mutex_}; + aborted_ = true; + } + cv_.notify_all(); + } + + std::vector take_all(bool& abort) + { + std::vector signals; + { + std::unique_lock lock{mutex_}; + cv_.wait(lock, [&] { return !signals_.empty() || aborted_; }); + if (aborted_) { + abort = true; + } + else { + signals.swap(signals_); + } + } + return signals; + } + +private: + std::vector signals_; + + std::mutex mutex_; + std::condition_variable cv_; + + bool aborted_{false}; +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 4f47631fa5..d611cfab43 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -269,4 +269,61 @@ void invokeTransposeAxis01( template void invokeTransposeAxis01( int* out, int* in, const int* in_skipping_dim1, const int dim0, const int dim1, cudaStream_t stream); +template +__global__ void transpose_2d_kernel(T* __restrict__ dst, const T* __restrict__ src, int rows, int cols, bool swap_xy) +{ + __shared__ T smem[TILE_DIM][TILE_DIM + 1]; + + const int block_idx_x = swap_xy ? blockIdx.y : blockIdx.x; + const int block_idx_y = swap_xy ? blockIdx.x : blockIdx.y; + + { + const int j = block_idx_x * TILE_DIM + threadIdx.x; + const int i = block_idx_y * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < rows && j < cols) { + smem[threadIdx.y + y][threadIdx.x] = src[(i + y) * cols + j]; + } + } + } + + __syncthreads(); + + { + const int j = block_idx_y * TILE_DIM + threadIdx.x; + const int i = block_idx_x * TILE_DIM + threadIdx.y; + +#pragma unroll + for (int y = 0; y < TILE_DIM; y += BLOCK_ROWS) { + if (i + y < cols && j < rows) { + dst[(i + y) * rows + j] = smem[threadIdx.x][threadIdx.y + y]; + } + } + } +} + +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + constexpr int TILE_DIM = 32; // warp size + constexpr int BLOCK_ROWS = 8; + + const dim3 block(TILE_DIM, BLOCK_ROWS); + + dim3 grid((cols + TILE_DIM - 1) / TILE_DIM, // + (rows + TILE_DIM - 1) / TILE_DIM); + bool swap_xy = false; + + if (grid.y > 65535) { // max dim for grid.y + std::swap(grid.x, grid.y); + swap_xy = true; + } + + transpose_2d_kernel<<>>(dst, src, rows, cols, swap_xy); +} + +template void invokeTranspose2D_(uint32_t*, const uint32_t*, int, int, cudaStream_t); + } // namespace turbomind diff --git a/src/turbomind/kernels/gpt_kernels.h b/src/turbomind/kernels/gpt_kernels.h index 4e1dc49be8..a351473332 100644 --- a/src/turbomind/kernels/gpt_kernels.h +++ b/src/turbomind/kernels/gpt_kernels.h @@ -238,4 +238,19 @@ void invokeSumLengthDimension(float* out_buf, const size_t hidden_dim, cudaStream_t stream = 0); +template +void invokeTranspose2D_(T* dst, const T* src, int rows, int cols, cudaStream_t st); + +template +void invokeTranspose2D(T* dst, const T* src, int rows, int cols, cudaStream_t st) +{ + if constexpr (sizeof(T) == 4) { + // FT_CHECK(0); + invokeTranspose2D_((uint32_t*)dst, (const uint32_t*)src, rows, cols, st); + } + else { + FT_CHECK(0); + } +} + } // namespace turbomind diff --git 
a/src/turbomind/kernels/sampling_penalty_kernels.cu b/src/turbomind/kernels/sampling_penalty_kernels.cu index 1d4cfe24b0..cf360580b9 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.cu +++ b/src/turbomind/kernels/sampling_penalty_kernels.cu @@ -17,6 +17,8 @@ #include #include +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h" namespace turbomind { @@ -221,6 +223,81 @@ template void invokeBatchApplyTemperaturePenalty(half* logits, const int vocab_size_padd, cudaStream_t stream); #endif + +template +__global__ void batchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded) +{ + const int vi = blockIdx.x * blockDim.x + threadIdx.x; + const int bi = blockIdx.y; + + __shared__ float shared_scale; + + if (threadIdx.x == 0) { + shared_scale = fdividef(1.f, temperatures[bi] + 1e-6f); + } + + __syncthreads(); + + const float scale = shared_scale; + + logits += (size_t)bi * vocab_size_padded; + + const int step = gridDim.x * blockDim.x * vec_size; + + for (int i = vi * vec_size; i < vocab_size_padded; i += step) { + Array vec; + Load(vec, logits + i); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + if (i + c < vocab_size) { + vec[c] *= scale; + } + else { + vec[c] = -FLT_MAX; + } + } + Store(logits + i, vec); + } +} + +void invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padded, + cudaStream_t stream) +{ + + auto invoke = [&](auto vec_size) { + constexpr int threads = 256; + const int blocks_per_tok = (vocab_size_padded + threads * vec_size - 1) / (threads * vec_size); + const dim3 blocks(blocks_per_tok, batch_size); + batchApplyTemperaturePenalty_v2<<>>( // + logits, + bias, + temperatures, + batch_size, + vocab_size, + vocab_size_padded); + }; + + if (vocab_size_padded % 4 == 0) { + invoke(std::integral_constant{}); + } + else if (vocab_size_padded % 2 == 0) { + invoke(std::integral_constant{}); + } + else { + invoke(std::integral_constant{}); + } +} + template __global__ void applyRepetitionPenalty(T* logits, const float penalty, diff --git a/src/turbomind/kernels/sampling_penalty_kernels.h b/src/turbomind/kernels/sampling_penalty_kernels.h index e12698cdf7..1f26b7d352 100644 --- a/src/turbomind/kernels/sampling_penalty_kernels.h +++ b/src/turbomind/kernels/sampling_penalty_kernels.h @@ -69,6 +69,14 @@ void invokeBatchApplyTemperaturePenalty(T* logits, const int vocab_size_padd, cudaStream_t stream); +void invokeBatchApplyTemperaturePenalty_v2(float* logits, + const float* bias, + const float* temperatures, + const int batch_size, + const int vocab_size, + const int vocab_size_padd, + cudaStream_t stream); + template void invokeMinLengthPenalty(T* logits, const int* min_lengths, diff --git a/src/turbomind/kernels/sampling_topp_kernels.cu b/src/turbomind/kernels/sampling_topp_kernels.cu index 04ea0577d1..4d4cff464c 100644 --- a/src/turbomind/kernels/sampling_topp_kernels.cu +++ b/src/turbomind/kernels/sampling_topp_kernels.cu @@ -22,6 +22,7 @@ #include "3rdparty/cub/cub.cuh" #endif +#include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/utils/constant.h" @@ -216,9 +217,9 @@ void 
invokeTopPSort(TopPSortParams& params, cudaStream_t stream) size_t topp_id_val_buf_size = sizeof(int) * params.batch_size * params.vocab_size_padded; size_t begin_offset_buf_size = sizeof(int) * params.batch_size; size_t end_offset_buf_size = sizeof(int) * params.batch_size; - topp_id_val_buf_size = div_up(topp_id_val_buf_size, 256) * 256; - begin_offset_buf_size = div_up(begin_offset_buf_size, 256) * 256; - end_offset_buf_size = div_up(end_offset_buf_size, 256) * 256; + topp_id_val_buf_size = cdiv(topp_id_val_buf_size, 256) * 256; + begin_offset_buf_size = cdiv(begin_offset_buf_size, 256) * 256; + end_offset_buf_size = cdiv(end_offset_buf_size, 256) * 256; if (params.workspace == nullptr) { size_t cub_temp_storage_size; @@ -236,7 +237,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream) 0, // begin_bit sizeof(T) * 8, // end_bit = sizeof(KeyT) * 8 stream)); // cudaStream_t - cub_temp_storage_size = div_up(cub_temp_storage_size, 256) * 256; + cub_temp_storage_size = cdiv(cub_temp_storage_size, 256) * 256; params.workspace_size = topp_id_val_buf_size + begin_offset_buf_size + end_offset_buf_size + cub_temp_storage_size; return; diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc index b588d8b6f5..c458998031 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc @@ -178,7 +178,7 @@ void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* inpu // temperature { if (!ALL_OF(temperature_.begin(), batch_size, float, 1.f)) { - invokeBatchApplyTemperaturePenalty( + invokeBatchApplyTemperaturePenalty_v2( logits, (T*)nullptr, temperature_buf_, batch_size, args_.vocab_size, args_.vocab_size_padded, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 3c714bd234..6c297e3d56 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(Llama STATIC set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart + engine gemm2 rms_norm cublasMMWrapper diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index ea321d06a0..e37af1bb76 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1,19 +1,43 @@ // Copyright (c) OpenMMLab. All rights reserved. 
-#include "src/turbomind/models/llama/LlamaBatch.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/macro.h" + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" -#include "src/turbomind/macro.h" + #include "src/turbomind/models/llama/BlockManager.h" +#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/copy.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/constant.h" @@ -21,20 +45,6 @@ #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/nccl_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace turbomind { @@ -84,150 +94,92 @@ void DropEmbeddings(const Sequence& seq) } template -void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs) +void LlamaBatch::DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs) { - std::unordered_map occurrence; - - auto count_occurrence = [&occurrence](const Requests& rs) { - for (const auto& r : rs) { - ++occurrence[r->id]; - } - }; - - auto reject = [](const char* type, std::shared_ptr& req, int ec) { - TM_LOG_WARNING( - "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec); - req->signal.set_value(ec); - req.reset(); - }; - - auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) { - for (auto& r : rs) { - if (r) { - int ec = 0; + NvtxScope _("disable conflict"); - const int input_length = r->inputs.getVal("input_lengths", 0); - const auto get_offset = [&](int token_count) { - return std::max(0, std::min(token_count, r->inputs.getVal("step", token_count))); - }; + std::pmr::monotonic_buffer_resource mbr; + std::pmr::unordered_map occur(&mbr); - if (occurrence[r->id] != 1) { - ec = Request::kConflict; - } - else if (r->start_flag && r->stop_flag) { - ec = Request::kInvalid; - } - else if (input_length > session_len_) { - ec = Request::kTooLong; - } - else if (!r->start_flag) { - if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) { - ec = Request::kInvalid; - } - else if (get_offset(seq->tokens.size()) + input_length > session_len_) { - ec = Request::kTooLong; - } - } - - if (ec) { - reject(type, r, ec); - } - } + auto count = [&occur](const auto& reqs) { + for (const auto& r : reqs) { + ++occur[r->id]; } }; - auto drop_invalid = [](Requests& rs) { - int count = 0; - for (int i = 0; i < rs.size(); ++i) { - if (rs[i]) { - rs[count++] = std::move(rs[i]); + auto validate = [&occur](auto& reqs, const char* type) { + for (const auto& r : reqs) { + if (occur[r->id] > 1) { + TM_LOG_ERROR("Skip conflicting %s request for ID %lu", type, r->id); + r->ec 
= Request::kConflict; } } - rs.resize(count); }; - count_occurrence(stop_reqs); - count_occurrence(infer_reqs); - - if (!stop_reqs.empty()) { - handle_conflict_or_invalid(stop_reqs, "stop"); - - // invalidate stop-only requests for inactive sequences - for (auto& r : stop_reqs) { - if (r && r->end_flag == false) { - int ec = Request::kInactive; - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - break; - } - } - if (ec) { - reject("stop", r, ec); - } - } + for (int i = 0; i < state_->size; ++i) { + if (state_->requests[i]) { + ++occur[state_->requests[i]->id]; } - - drop_invalid(stop_reqs); } - if (!infer_reqs.empty()) { - handle_conflict_or_invalid(infer_reqs, "infer"); + count(kill_reqs); + count(infer_reqs); - // invalidate requests for busy sequences - for (auto& r : infer_reqs) { - if (r) { - for (int i = 0; i < state_->size; ++i) { - if (state_->requests[i] && state_->requests[i]->id == r->id) { - reject("infer", r, Request::kBusy); - break; - } - } - } - } + validate(kill_reqs, "kill"); + validate(infer_reqs, "infer"); +} - drop_invalid(infer_reqs); +template +void LlamaBatch::BroadcastCancelFlags() +{ + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->cancel_flag.load(std::memory_order_acquire) == -1) { + r->is_canceled = true; + } } } -template -auto LlamaBatch::ProcessStopRequests(const Requests& requests) -> std::vector +template +void LlamaBatch::ProcessCancelRequests(std::vector& signals) { - NvtxScope scope("stop_request"); - std::vector signals; - int count = 0; - for (const auto& r : requests) { - int ec = Request::kFail; - // find matching active sequence - for (int i = 0; i < state_->size; ++i) { - // stop & optionally erase active sequence - if (state_->requests[i] && state_->requests[i]->id == r->id) { - ec = 0; - signals.push_back(Interrupt(i, true, r->end_flag)); - ++count; - break; - } - } - // mismatch, try erase inactive sequence, in this case there is no active request to interrupt - if (ec && r->end_flag) { - if (sequence_manager_->Erase(r->id)) { - ec = 0; - } + int count = 0; + for (int i = 0; i < state_->size; ++i) { + const auto& r = state_->requests[i]; + if (r && r->is_canceled) { + ++count; + signals.push_back(Interrupt(i, true)); + // Interrupt should reset r + FT_CHECK(!r); } - signals.push_back([=] { - if (rank_ == 0) { - r->signal.set_value(ec); - } - }); } if (count) { check_cuda_error(cudaStreamSynchronize(stream_)); } - return signals; +} + +template +void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) +{ + for (auto& r : kill_reqs) { + if (r) { + int ec = r->ec; + if (!ec) { + if (!sequence_manager_->Erase(r->id)) { + ec = Request::kInvalid; + } + } + signals.push_back([=] { + if (r->end_cb) { + r->end_cb(ec); + } + }); + } + } } template -void LlamaBatch::ProcessInferRequests(const Requests& requests) +void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { NvtxScope scope("infer_request"); auto& state = *incoming_; @@ -238,58 +190,90 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) std::vector existing_idx; int idx = 0; - for (const auto& r : requests) { - FT_CHECK(!state.requests[idx]); + for (const auto& r : reqs) { if (rank_ == 0) { TM_LOG_INFO("[ProcessInferRequests] Request for %ld received.", (long)r->id); } - state.requests[idx] = r; + if (r->ec) { + signals.push_back([r] { UpdateState(*r, r->ec, 0); }); + continue; + } - // get sequence for the 
request - state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); - FT_CHECK(state.sequences[idx]); + const int input_length = r->inputs.at("input_ids").shape[0]; - auto& seq = *state.sequences[idx]; + if (input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; + } - if (int step = r->inputs.getVal("step", -1); step >= 0) { - if (step <= seq.tokens.size()) { - seq.tokens.resize(step); - seq.cache_len = std::min(seq.cache_len, step); - DropEmbeddings(seq); + auto ptr = r->session.start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + if (!ptr) { + signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); + continue; + } + + const int step = [&] { + int s = r->session.step; + if (s < 0) { + s = ptr->tokens.size(); } - else if (rank_ == 0) { - TM_LOG_WARNING( - "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id); + else if (s > ptr->tokens.size()) { + if (rank_ == 0) { + TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); + } + s = ptr->tokens.size(); } + return s; + }(); + + if (step + input_length > session_len_) { + signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + continue; } - const int input_length = r->inputs.getVal("input_lengths"); - const int* input_ids = r->inputs.getPtr("input_ids"); + FT_CHECK(!state.requests[idx]); + + state.requests[idx] = r; + state.sequences[idx] = ptr; + + auto& seq = *state.sequences[idx]; + + if (step < seq.tokens.size()) { + // resize sequence tokens to match step + seq.tokens.resize(step); + seq.cache_len = std::min(seq.cache_len, step); + DropEmbeddings(seq); + } + + const int* input_ids = r->inputs.getPtr("input_ids"); { // `output_ids` contains all token ids of the sequences const auto output_ids_base = state.output_ids + session_len_ * idx; - auto output_ids = output_ids_base; + auto d_output_ids = output_ids_base; + auto h_output_ids = r->output_ids.getPtr(); // copy history tokens if (!seq.tokens.empty()) { - output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids); + d_output_ids = Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); + h_output_ids = std::copy_n(seq.tokens.data(), seq.tokens.size(), h_output_ids); } // copy input tokens if (input_length) { - output_ids = Copy(input_ids, input_length, output_ids); + d_output_ids = Copy(input_ids, input_length, d_output_ids); + h_output_ids = std::copy_n(input_ids, input_length, h_output_ids); } // total context length (history + input) - state.h_prompt_length[idx] = output_ids - output_ids_base; - state.h_context_length[idx] = output_ids - output_ids_base; + state.h_prompt_length[idx] = d_output_ids - output_ids_base; + state.h_context_length[idx] = d_output_ids - output_ids_base; state.h_finished[idx] = false; } // copy input tokens to prompt for prefix matching - if (input_length && r->start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length && r->session.start_flag && !r->inputs.isExist("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); @@ -348,8 +332,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int request_output_len = state.requests[idx]->inputs.getVal("request_output_len"); - state.seq_len_limit[idx] = state.h_context_length[idx] + request_output_len; 
+ const int max_new_tokens = state.requests[idx]->gen_cfg.max_new_tokens; + state.seq_len_limit[idx] = state.h_context_length[idx] + max_new_tokens; // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must truncated to session_len - 1 if (state.seq_len_limit[idx] >= session_len_) { @@ -357,17 +341,17 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (rank_ == 0) { const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx]; TM_LOG_WARNING( - "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d", + "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `max_new_tokens` is truncated to %d", (long)seq.id, state.h_context_length[idx], - request_output_len, + max_new_tokens, (int)session_len_, trunc_output_len); } } // compute rope scaling factor - if (r->start_flag) { + if (r->session.start_flag) { seq.rope_theta = model_->attn_param_.rotary_embedding_base; if (model_->attn_param_.use_dynamic_ntk) { auto scaling_factor = model_->attn_param_.rope_scaling_factor; @@ -388,9 +372,9 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } state.h_rope_theta[idx] = seq.rope_theta; - if (r->start_flag) { + if (r->session.start_flag) { // prepare to initialize random state for new sequence - h_random_seed_[idx] = r->inputs.getVal("random_seed", 0); + h_random_seed_[idx] = r->gen_cfg.random_seed; } else { // Recover device states if not a new sequence @@ -799,12 +783,6 @@ void LlamaBatch::AllocatePersistantBuffer(size_t max_batch_size, int cache_bl sampling_params_ = { {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_}, {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_}, - {"min_length", (std::byte*)h_min_length_, nullptr}, - {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr}, - {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr}, - {"runtime_min_p", (std::byte*)h_runtime_min_p_, nullptr}, - {"temperature", (std::byte*)h_temperature_, nullptr}, - {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr}, }; for (auto& s : states_) { @@ -941,19 +919,9 @@ template LlamaBatch::~LlamaBatch() { TM_LOG_DEBUG("~LlamaBatch()"); - shared_state_->request_queue.close(); internal_thread_.join(); - if (output_thread_.joinable()) { - { - std::lock_guard lock{output_mutex_}; - output_stop_token_ = true; - } - output_cv_.notify_one(); - output_thread_.join(); - } - // The dtor maybe called from unknown thread, set device id before CUDA calls check_cuda_error(cudaSetDevice(device_id_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -970,8 +938,10 @@ LlamaBatch::LlamaBatch(const EngineParam& param, std::unique_ptr> model, // ! This is moved std::unique_ptr> ctx, // ! 
This is moved std::shared_ptr state, + std::shared_ptr gateway, int device_id): param_(param), + gateway_(gateway), shared_state_(state), max_batch_size_(param.max_batch_size), max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), @@ -1068,7 +1038,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); Clear(token_ids_buf_, batch_size * session_len_); - invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_); + invokeTranspose2D(token_ids_buf_, state_->output_ids, batch_size, session_len_, stream_); sync_check_cuda_error(); // token_ids_buf_[s, b] @@ -1087,6 +1057,27 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) Copy(h_seq_limit_len_, batch_size, seq_limit_len_); TensorMap inputs; + + auto member_to_tensor = [&](auto getter, auto key, auto dest, auto init) { + int count = 0; + for (int i = 0; i < batch_size; ++i) { + // `std::invoke` + dest[i] = state_->requests[i]->gen_cfg.*getter; + count += dest[i] != init; + } + if (count) { + inputs.insert(key, {MEMORY_CPU, getTensorType(), {(size_t)batch_size}, dest}); + } + }; + + using G = GenerationConfig; + member_to_tensor(&G::top_k, "runtime_top_k", h_runtime_top_k_, 0); + member_to_tensor(&G::top_p, "runtime_top_p", h_runtime_top_p_, 0); + member_to_tensor(&G::min_p, "runtime_min_p", h_runtime_min_p_, 0); + member_to_tensor(&G::temperature, "temperature", h_temperature_, 0.f); + member_to_tensor(&G::repetition_penalty, "repetition_penalty", h_repetition_penalty_, 1.f); + member_to_tensor(&G::min_new_tokens, "min_length", h_min_length_, 0); + for (const auto& [name, h_ptr, d_ptr] : sampling_params_) { // find an exemplar that matches the param name const Tensor* ptr{}; @@ -1173,7 +1164,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) TensorMap outputs; for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i]->gen_cfg.output_logprobs) { outputs.insert( {"sampled_logprobs", {MEMORY_GPU, TYPE_FP32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); outputs.insert( @@ -1187,89 +1178,157 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) sync_check_cuda_error(); } -template -void LlamaBatch::OutputContextLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences) +template +void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last) { - std::vector output_logits; - int num_token = 0; - { - bool is_return_logits = false; - for (int k = 0; k < indices.size(); ++k) { - auto& request = state_->requests[indices[k]]; - auto logits = request->outputs.getPtr("logits", nullptr); - if (logits && sequences[k]->cache_len + lengths[k] <= sequences[k]->tokens.size()) { - logits = nullptr; - } - output_logits.push_back(logits); - num_token += lengths[k]; - if (output_logits.back()) { - is_return_logits = true; + int token_num = 0; + bool found = false; + for (int i = first; i < last; ++i) { + if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { + const auto& s = *state_->sequences[i]; + // Skip when the seq is filling missed cache only + if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { + found = true; } } - if (!is_return_logits) { - return; - } + token_num += h_input_length_buf_[i]; } - { - context_logits_buf_ = (float*)allocator_->reMalloc( - context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, 
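// --- [editorial sketch, not part of the patch] ------------------------------------
// `member_to_tensor` above relies on pointer-to-data-member syntax: `gen_cfg.*getter`
// reads the field selected by a `&GenerationConfig::field` handle passed at the call
// site. The same mechanism in isolation, with a hypothetical `Config` type standing
// in for GenerationConfig:

#include <vector>

struct Config {
    int   top_k{0};
    float temperature{0.f};
};

template<class T>
int GatherIfSet(const std::vector<Config>& cfgs, T Config::*field, T init, T* dest)
{
    int count = 0;
    for (const auto& c : cfgs) {
        *dest = c.*field;            // dereference the member pointer on this element
        count += (*dest++ != init);  // count values that differ from the "unset" default
    }
    return count;                    // caller only builds a tensor when count > 0
}

// usage: int n = GatherIfSet(cfgs, &Config::top_k, 0, top_k_buf);
// --- [end sketch] -------------------------------------------------------------------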
false); - const auto tp = model_->tensor_para_.world_size_; - if (tp > 1) { - NcclGuard guard(model_->tensor_para_, stream_, true); - FT_CHECK(model_->vocab_size_padded_ % tp == 0); - const auto local_vocab_size = model_->vocab_size_padded_ / tp; - local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( - local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * num_token, false); - } + if (!found) { + return; } - model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, context_decoder_output, num_token); + context_logits_buf_ = (float*)allocator_->reMalloc( + context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + const auto tp = model_->tensor_para_.world_size_; + + if (tp > 1) { + NcclGuard guard(model_->tensor_para_, stream_, true); + FT_CHECK(model_->vocab_size_padded_ % tp == 0); + const auto local_vocab_size = model_->vocab_size_padded_ / tp; + local_context_logits_buf_ = (float*)peer_allocator_->reMalloc( + local_context_logits_buf_, sizeof(float) * model_->vocab_size_padded_ * token_num, false); + } - auto logits = context_logits_buf_; + model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, hidden_states, token_num); - // Only rank-0 writes to output if (rank_ != 0) { return; } - for (int k = 0; k < indices.size(); ++k) { - if (output_logits[k]) { - auto src_ptr = logits; - auto dst_ptr = output_logits[k]; - int num_new_token = 0; - if (sequences[k]->cache_len < sequences[k]->tokens.size()) { - num_new_token = sequences[k]->cache_len + lengths[k] - sequences[k]->tokens.size(); - src_ptr += (lengths[k] - num_new_token) * model_->vocab_size_padded_; - } - else { - num_new_token = lengths[k]; - dst_ptr += (sequences[k]->cache_len - sequences[k]->tokens.size()) * model_->vocab_size_; + OutputLogits(context_logits_buf_, first, last, GenerationConfig::kAll); +} + +template +void LlamaBatch::OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type) +{ + // when `is_all` is true, logits only contains last token of the sequences + const bool is_all = out_type == GenerationConfig::kAll; + + for (int i = first; i < last; ++i) { + + const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const float* src_ptr = logits; + + logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; + + if (state_->requests[i]->gen_cfg.output_logits == out_type) { + + auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); + + const int cache_len = state_->sequences[i]->cache_len; + const int history_len = state_->sequences[i]->tokens.size(); + + // ----------H------I-------P----------- + // C C C C + + // offset to the last token prompt + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + + int diff = (history_len + offset) - cache_len; + + const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + + // TM_LOG_ERROR("%d %d %d %d %d %d %d", + // history_len, + // offset, + // cache_len, + // input_len, + // valid_len, + // std::max(0, diff), + // std::max(0, -diff)); + + if (valid_len <= 0) { + continue; } - if (model_->vocab_size_padded_ == model_->vocab_size_) { - Copy(src_ptr, model_->vocab_size_ * num_new_token, dst_ptr); + + if (is_all) { + // Skip invalid tokens caused by cache miss + src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_; } - else { - for (int tok = 0; tok < num_new_token; tok++) { - Copy(src_ptr, model_->vocab_size_, dst_ptr); - src_ptr += model_->vocab_size_padded_; - dst_ptr += model_->vocab_size_; - } + // Skip previous chunks + dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_; + + check_cuda_error(cudaMemcpy2DAsync(dst_ptr, + sizeof(float) * model_->vocab_size_, + src_ptr, + sizeof(float) * model_->vocab_size_padded_, + sizeof(float) * model_->vocab_size_, + valid_len, + cudaMemcpyDefault, + stream_)); + } + } +} + +template +void LlamaBatch::OutputLastHiddenState(const T* hidden_states, int first, int last) +{ + for (int i = first; i < last; ++i) { + + const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const T* src_ptr = hidden_states; + + hidden_states += input_len * model_->hidden_units_; + + if (auto out_type = state_->requests[i]->gen_cfg.output_last_hidden_state) { + + const bool is_all = out_type == GenerationConfig::kAll; + + T* dst_ptr = state_->requests[i]->outputs.getPtr("last_hidden_state"); + + const int cache_len = state_->sequences[i]->cache_len; + const int history_len = state_->sequences[i]->tokens.size(); + + // offset to the last prompt token + const int offset = is_all ? 
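// --- [editorial sketch, not part of the patch] ------------------------------------
// The bookkeeping in OutputLogits / OutputLastHiddenState is interval arithmetic:
// rows of the current chunk that merely re-fill a missed KV cache carry no new
// output and are skipped on the source side, while rows already produced by earlier
// chunks advance the destination pointer. With hypothetical numbers
// history_len = 8, cache_len = 6 (two history tokens are being recomputed),
// offset = 0 (kAll) and input_len = 10, only 10 - (8 - 6) = 8 rows are copied out.

#include <algorithm>

inline int ValidRows(int input_len, int history_len, int offset, int cache_len)
{
    return input_len - std::max(0, (history_len + offset) - cache_len);
}
// e.g. ValidRows(10, 8, /*offset=*/0, 6) == 8
// --- [end sketch] -------------------------------------------------------------------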
0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + + const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + + // TM_LOG_ERROR("%d %d %d %d %d", history_len, offset, cache_len, input_len, valid_len); + + if (valid_len <= 0) { + continue; } + + // Skip invalid tokens caused by cache miss + src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->hidden_units_; + // Skip previous chunks + dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->hidden_units_; + + Copy(src_ptr, valid_len * model_->hidden_units_, dst_ptr); } - logits += model_->vocab_size_padded_ * lengths[k]; } } template -auto LlamaBatch::Finish(GenerationState& g) -> std::vector +void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { NvtxScope scope("Finish"); const int batch_size = state_->active_size; + signals.reserve(batch_size); + if (batch_size - g.partial) { FT_CHECK(g.step >= 0); @@ -1285,13 +1344,22 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector sync_check_cuda_error(); } - Copy(state_->output_ids, batch_size * session_len_, h_output_ids_); + Copy(token_ids_buf_ + (g.step - 1) * (batch_size - g.partial), batch_size - g.partial, h_output_ids_); Copy(finished_buf_, batch_size, state_->h_finished); Copy(sequence_lengths_, batch_size, state_->h_context_length); - Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); - Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); - Copy(sampled_nums_, batch_size, h_sampled_nums_); + bool output_logprobs = false; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (state_->requests[i]->gen_cfg.output_logprobs) { + output_logprobs = true; + break; + } + } + if (output_logprobs) { + Copy(sampled_logprobs_, batch_size * kMaxLogProb, h_sampled_logprobs_); + Copy(sampled_indexes_, batch_size * kMaxLogProb, h_sampled_indexes_); + Copy(sampled_nums_, batch_size, h_sampled_nums_); + } check_cuda_error(cudaStreamSynchronize(stream_)); @@ -1302,13 +1370,14 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } // ! Only rank-0 writes to output - if (rank_ == 0) { + if (rank_ == 0 && output_logprobs) { + NvtxScope scope("logprobs"); // output logprobs, should be set before sequence_length float* sampled_logprobs_ptr = h_sampled_logprobs_; uint32_t* sampled_indexes_ptr = h_sampled_indexes_; uint32_t* sampled_nums_ptr = h_sampled_nums_; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && state_->requests[i]->inputs.isExist("logprobs")) { + if (state_->requests[i] && state_->requests[i]->gen_cfg.output_logprobs) { auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); @@ -1330,18 +1399,37 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector // ! 
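// --- [editorial sketch, not part of the patch] ------------------------------------
// Compared with the old path (copying `batch_size * session_len_` ids back to the
// host every step), Finish now fetches only the tokens sampled this iteration --
// one per active, non-partial sequence -- and rank-0 later appends each of them at
// position `count - 1` of the request's output buffer. Restating the index
// expression used above (the [step, batch] layout is taken from the
// `token_ids_buf_[s, b]` comment earlier in this file and is an assumption here):

inline const int* NewestTokens(const int* token_ids_buf, int step, int batch_size, int partial)
{
    return token_ids_buf + (step - 1) * (batch_size - partial);
}
// --- [end sketch] -------------------------------------------------------------------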
Only rank-0 writes to output if (rank_ == 0) { - // set output tokens ids and sequence length - int* output_ptr = h_output_ids_; - for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) { - auto output_ids = state_->requests[i]->outputs.getPtr("output_ids"); - auto output_len = state_->requests[i]->outputs.getPtr("sequence_length"); - const int count = state_->h_context_length[i]; - // TODO: sync history output tokens at when receiving the request and copy the last token here - std::copy(output_ptr, output_ptr + count, output_ids); - *output_len = count; + NvtxScope scope("output_ids"); + if constexpr (0) { + // set output tokens ids and sequence length + int* output_ptr = h_output_ids_; + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + if (r->stream_output) { + output_ids[count - 1] = output_ptr[count - 1]; + *output_len = count; + } + else if (state_->h_finished[i]) { + std::copy(output_ptr, output_ptr + count, output_ids); + *output_len = count; + } + } + output_ptr += session_len_; + } + } + else { + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = static_cast(r->output_ids.data); + auto output_len = static_cast(r->sequence_length.data); + const int count = state_->h_context_length[i]; + output_ids[count - 1] = h_output_ids_[i]; + *output_len = count; + } } - output_ptr += session_len_; } } @@ -1362,48 +1450,53 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } } - std::vector signals; { - NvtxScope _("stream_and_completion_signal"); + NvtxScope _("count and sync"); + bool need_sync = false; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i]) { - if (state_->h_finished[i]) { - // Interrupt finished sequences and move the request handle into the signal closure - signals.push_back(Interrupt(i)); - ++g.finished_count; - } - else if (state_->requests[i]->stream_cb) { - // Create signals by copying the request handles for non-finished streaming requests - signals.push_back([this, r = state_->requests[i]] { - if (rank_ == 0) { - try { - r->stream_cb(&r->outputs.get()); - } - catch (const std::bad_function_call& e) { - TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); - } - catch (...) 
{ - TM_LOG_ERROR("Unknown exception invoking stream callback for (%s)", - std::to_string(r->id).c_str()); - } - } - }); + if (state_->h_finished[i]) { + ++g.finished_count; + if (!state_->requests[i]->session.end_flag) { + need_sync = true; } } } - if (g.finished_count) { - // synchronize for interrupted sequences - check_cuda_error(cudaStreamSynchronize(stream_)); + if (need_sync) { + // Release updates on request output buffers to all ranks (`Interrupt` will use it) + shared_state_->barrier->wait(); } } + { + NvtxScope _("stream_and_completion_signal"); + for (int i = 0; i < batch_size - g.partial; ++i) { + auto& r = state_->requests[i]; + if (state_->h_finished[i]) { + // Interrupt finished sequences and move the request handle into the signal closure + signals.push_back(Interrupt(i)); + // Interrupt should reset r + FT_CHECK(!r); + } + else if (r->stream_output && rank_ == 0) { + const auto seq_len = r->sequence_length.getVal(); + // Create signals by copying the request handles for non-finished streaming requests + signals.push_back([this, r, seq_len] { // + UpdateState(*r, Request::kOk, seq_len); + }); + } + } + } + + if (g.finished_count) { + // synchronize for interrupted sequences + check_cuda_error(cudaStreamSynchronize(stream_)); + } + if (g.partial) { const int i = batch_size - 1; // recover full context length of partial state_->h_context_length[i] = g.partial_context_legnth; } - - return signals; } template @@ -1424,7 +1517,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - if (state_->requests[index]->end_flag || force_end) { + if (state_->requests[index]->session.end_flag || force_end) { // Sequence is ending this round or a stop request is issued to end it FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); } @@ -1434,17 +1527,10 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // Update token IDs seq.tokens.resize(output_len); - const auto output_ids_data = [&] { - if (force_stop) { - // `h_output_ids_` is UNDEFINED at `ProcessStopRequests` - return state_->requests[index]->outputs.at("output_ids").getPtr(); - } - else { - // `h_output_ids_` just updated by `Finish`, but `outputs` is NOT synced atm - return h_output_ids_ + index * (size_t)session_len_; - } - }(); - std::copy_n(output_ids_data, output_len, seq.tokens.data()); + + // output_ids is updated & synced in `Finish` + const auto output_ids = state_->requests[index]->output_ids.getPtr(); + std::copy_n(output_ids, output_len, seq.tokens.data()); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); @@ -1457,13 +1543,12 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig state_->sequences[index] = nullptr; - auto ec = std::exchange(state_->errors[index], 0); + auto ec = std::exchange(state_->errors[index], Request::kOk); + const auto len = state_->requests[index]->sequence_length.getVal(); // move the request handle into the signal - return [this, ec, r = std::move(state_->requests[index])] { - if (rank_ == 0) { - r->signal.set_value(ec); - } + return [this, len, force_stop, r = std::move(state_->requests[index])] { // + UpdateState(*r, force_stop ? 
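// --- [editorial sketch, not part of the patch] ------------------------------------
// `Interrupt` and `Finish` hand back deferred "signals": std::function objects that
// own (or share) the request handle and perform the user-visible notification later,
// outside the rank barrier, via the gateway. The ownership pattern in isolation
// (names are hypothetical, not the actual Request/Gateway API):

#include <cstdint>
#include <functional>
#include <memory>

struct Req {
    std::uint64_t id;
};

using Signal = std::function<void()>;

inline Signal MakeCompletionSignal(std::shared_ptr<Req> r, int status)
{
    // moving the handle into the closure keeps the request alive until it is notified
    return [r = std::move(r), status] { /* notify_client(r->id, status); */ };
}

// usage: signals.push_back(MakeCompletionSignal(std::move(req), /*some status code*/ 0));
// --- [end sketch] -------------------------------------------------------------------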
Request::kCancel : Request::kFinish, len); }; } @@ -1476,33 +1561,30 @@ void LlamaBatch::InternalThreadEntry() // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(rank_, model_->vocab_size_padded_, model_->end_id_, max_batch_size_, stream_); - auto& request_queue = shared_state_->request_queue; - auto& infer_requests = shared_state_->infer_requests; - auto& stop_requests = shared_state_->stop_requests; + // auto& request_queue = shared_state_->request_queue; + auto& infer_reqs = shared_state_->infer_reqs; + auto& kill_reqs = shared_state_->kill_reqs; GenerationState g{}; - constexpr int request_interval = 1; - long request_counter = 0; - while (1) { + if (rank_ == 0) { - const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; - const bool is_empty = (free_slot_count == max_batch_size_); - stop_requests.clear(); - infer_requests.clear(); - if (is_empty || request_counter % request_interval == 0) { + { + NvtxScope _("pop"); + const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; + const bool is_empty = (free_slot_count == max_batch_size_); // Block if batch is empty - request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state_->abort); - if (!shared_state_->abort) { - RejectInvalidRequests(stop_requests, infer_requests); - } + gateway_->pop(infer_reqs, kill_reqs, free_slot_count, is_empty, shared_state_->abort); } + // Mark reqs to the same session_id as invalid (which are dangerous to the engine) + DisableConflictRequests(infer_reqs, kill_reqs); } NvtxScope scope("mainloop"); - // wait while rank-0 is dequeueing + // 1. Wait while rank-0 is dequeueing + // 2. Broadcast `ec` from rank-0 shared_state_->barrier->wait(); if (shared_state_->abort) { @@ -1510,90 +1592,58 @@ void LlamaBatch::InternalThreadEntry() return; } - auto signals = ProcessStopRequests(stop_requests); + std::vector signals; + + ProcessKillRequests(kill_reqs, signals); // Shared `priority` field will be assigned by rank-0 - ProcessInferRequests(infer_requests); + ProcessInferRequests(infer_reqs, signals); + + // is_canceled <- cancel_flag.load() + if (rank_ == 0) { + BroadcastCancelFlags(); + } - // Wait while shared `requests` is being used + // 1. Wait while shared `requests` is being used + // 2. 
Broadcast modifcations from rank-0 shared_state_->barrier->wait(); - SendSignals(std::move(signals)); + ProcessCancelRequests(signals); + + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } Initialize(g); if (state_->active_size) { // - (void)Forward(g); - // - if (auto signals = Finish(g); !signals.empty()) { - if (g.finished_count) { - // Finished requests and corresponding output tensors will be released when notified - // wait for all ranks to ensure no rank (except for output thread) will access related - // resources - shared_state_->barrier->wait(); - } - SendSignals(std::move(signals)); + Forward(g); + + Finish(g, signals); + + if (g.finished_count) { + // Finished requests and corresponding output tensors will be released when notified + // wait for all ranks to ensure no rank (except for output thread) will access related + // resources + shared_state_->barrier->wait(); } - } - ++request_counter; + if (rank_ == 0) { + gateway_->notify(std::move(signals)); + } + } } + // Unreachable FT_CHECK(0); } -template -void LlamaBatch::SendSignals(std::vector signals) -{ - if (rank_ != 0 || signals.empty()) { - return; - } - { - std::lock_guard lock{output_mutex_}; - output_signals_.insert(output_signals_.end(), // - std::move_iterator{signals.begin()}, - std::move_iterator{signals.end()}); - } - output_cv_.notify_one(); -} - template void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this); - if (rank_ == 0) { - output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this); - } -} - -template -void LlamaBatch::OutputThreadEntry() -{ - while (true) { - std::vector signals; - { - // Wait for signals to come - std::unique_lock lock(output_mutex_); - output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; }); - if (output_stop_token_) { - TM_LOG_INFO("[OutputThreadEntry] stop requested."); - return; - } - signals = std::move(output_signals_); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(1); - } - // invoke stream cbs & signals - for (const auto& s : signals) { - s(); - } - if (rank_ == 0 && ffi_lock_) { - ffi_lock_(0); - } - } } template @@ -1662,20 +1712,11 @@ bool LlamaBatch::Forward(GenerationState& g) const int last = offsets[p + 1]; const int mini_batch_size = last - first; int* input_ids = context_decoder_ids_buf_; - // - std::vector decode_indices{}; - std::vector decode_lengths{}; - - std::vector sequences; BatchedCopy batched_copy; int sum_k = 0; for (int i = first; i < last; ++i) { input_ids = batched_copy.Add(input_d_ptrs[i], h_input_length_buf_[i], input_ids); - dbg(i, h_input_length_buf_[i]); - decode_indices.push_back(i); - decode_lengths.push_back(h_input_length_buf_[i]); - sequences.push_back(state_->sequences[i]); if (h_input_length_buf_[i] > 1) { sum_k += state_->h_context_length[i]; } @@ -1717,20 +1758,10 @@ bool LlamaBatch::Forward(GenerationState& g) dc_batch_size, pf_batch_size, lora_mask_buf_, - sequences.data()); - - // compute logits of inputs if requested - OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths, sequences); - } - - std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + state_->sequences.data() + first); - // `SequenceManager` needs real-time value of cache length - for (int i = 0; i < active_size; ++i) { - if (state_->requests[i]) { - FT_CHECK(state_->sequences[i]); - state_->sequences[i]->cache_len += state_->sequences[i]->input_length; - } + 
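// --- [editorial sketch, not part of the patch] ------------------------------------
// The BatchedCopy loop above packs the variable-length inputs of a mini-batch back
// to back into one flat buffer (`context_decoder_ids_buf_`). Conceptually it does
// no more than this (the real BatchedCopy batches the device copies):

#include <algorithm>
#include <cstddef>
#include <vector>

inline int* PackInputs(const std::vector<const int*>& srcs, const std::vector<int>& lens, int* dst)
{
    for (std::size_t i = 0; i < srcs.size(); ++i) {
        dst = std::copy_n(srcs[i], lens[i], dst);  // returns one past the end, ready for the next chunk
    }
    return dst;  // final position == start + sum(lens)
}
// --- [end sketch] -------------------------------------------------------------------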
ComputeAndOutputLogits(context_decoder_output_buf_, first, last); + OutputLastHiddenState(context_decoder_output_buf_, first, last); } if (active_size > g.partial) { @@ -1738,12 +1769,10 @@ bool LlamaBatch::Forward(GenerationState& g) AnomalyHandler::instance().FixLogits(logits_buf_, active_size - g.partial, 1); - // count_and_fix(logits_buf_, (active_size - g.partial) * model_->vocab_size_padded_, "logits", 1); + OutputLogits(logits_buf_, 0, active_size - g.partial, GenerationConfig::kGeneration); FT_CHECK(g.step >= 0); - // TM_LOG_INFO("dyn decode bsz %d, partial %d", active_size, g.partial); - if (!g.skip_init_sampling) { InitializeSampling(g); } @@ -1767,6 +1796,15 @@ bool LlamaBatch::Forward(GenerationState& g) active_size - g.partial); } + std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + + // `SequenceManager` needs real-time value of cache length + for (int i = 0; i < active_size; ++i) { + FT_CHECK((bool)state_->requests[i]); + FT_CHECK(state_->sequences[i]); + state_->sequences[i]->cache_len += state_->sequences[i]->input_length; + } + AnomalyHandler::instance().Summarize([&](const int* is_anomaly, int batch_size) { for (int i = 0; i < batch_size; ++i) { if (is_anomaly[i]) { @@ -1800,101 +1838,6 @@ bool LlamaBatch::Forward(GenerationState& g) return true; } -static inline Tensor slice(const Tensor& tensor, int index) -{ - auto shape = tensor.shape; - if (shape.at(0) == 1) { - return tensor; - } - shape[0] = 1; - const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{}); - return tensor.slice(shape, offset); -} - -// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors -static inline TensorMap slice(const std::unordered_map& src, int index) -{ - TensorMap dst; - for (const auto& kv : src) { - dst.insert({kv.first, slice(kv.second, index)}); - } - return dst; -} - -template -void LlamaBatch::Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control) -{ - if (debug_) { - for (const auto& kv : *inputs) { - TM_LOG_INFO("[Submit] INPUT: %s", format(kv).c_str()); - } - for (const auto& kv : *outputs) { - TM_LOG_INFO("[Submit] OUTPUT: %s", format(kv).c_str()); - } - } - - const int batch_size = outputs->at("output_ids").shape[0]; - - std::vector> requests(batch_size); - - // allocates all requests for the batch - for (int i = 0; i < batch_size; ++i) { - requests[i] = std::make_shared(); - } - - for (int i = 0; i < batch_size; ++i) { - auto& r = requests[i]; - - r->inputs = slice(*inputs, i); - r->outputs = slice(*outputs, i); - - r->id = r->inputs.getVal("CORRID", i); - r->start_flag = r->inputs.getVal("START", 1); - r->end_flag = r->inputs.getVal("END", 1); - r->stop_flag = r->inputs.getVal("STOP", 0); - r->stream_cb = control.callback; - } - - // Submits the tasks and wait for finish - std::vector error_codes; - bool has_error = 0; - - TM_LOG_INFO("[forward] Enqueue requests"); - - std::vector ids; - for (const auto& r : requests) { - ids.push_back(r->id); - } - - auto futures = shared_state_->request_queue.enqueue(std::move(requests)); - - FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); - - TM_LOG_INFO("[forward] Wait for requests to complete ..."); - - for (int i = 0; i < futures.size(); ++i) { - auto ec = futures[i].get(); - error_codes.push_back(ec); - if (ec) { - has_error = true; - TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); - } - else { - TM_LOG_INFO("[forward] Request completed for %ld", 
(long)ids[i]); - } - } - - if (has_error) { - std::stringstream ss; - for (int i = 0; i < error_codes.size(); ++i) { - ss << (i ? "" : " ") << error_codes[i]; - } - throw std::runtime_error(ss.str()); - } -} - namespace { template diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index f952da6bae..44a20bebe1 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -2,38 +2,31 @@ #pragma once +#include + +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/request.h" + #include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/models/llama/LlamaNcclGuard.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" + #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include - -using ffi_api_lock_ctrl_t = std::function; namespace turbomind { struct SharedState { - std::vector> infer_requests; - std::vector> stop_requests; - RequestQueue request_queue; + std::vector> infer_reqs; + std::vector> kill_reqs; std::shared_ptr barrier; bool abort; std::atomic free_size{std::numeric_limits::max()}; }; -struct Control { - Request::Callback callback; -}; - struct BatchState { int* h_prompt_length; // history + input, ignore generated int* h_context_length; @@ -87,11 +80,11 @@ class LlamaBatch { using Requests = std::vector>; using Signal = std::function; - void RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs); + void DisableConflictRequests(Requests& infer_reqs, Requests& kill_reqs); - [[nodiscard]] auto ProcessStopRequests(const Requests& requests) -> std::vector; + void ProcessKillRequests(const Requests& reqs, std::vector& signals); - void ProcessInferRequests(const Requests& requests); + void ProcessInferRequests(const Requests& reqs, std::vector& signals); int AdjustMaxInputCount(GenerationState& g, const std::vector& sequences, @@ -101,36 +94,29 @@ class LlamaBatch { void InitializeSampling(const GenerationState& g); - [[nodiscard]] bool Forward(GenerationState& g); + bool Forward(GenerationState& g); - [[nodiscard]] auto Finish(GenerationState& g) -> std::vector; + void Finish(GenerationState& g, std::vector& signals); [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void OutputContextLogits(T* context_decoder_output, - const std::vector& indices, - const std::vector& lengths, - const std::vector& sequences); + void ComputeAndOutputLogits(T* hidden_states, int first, int last); + + void OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type); + + void OutputLastHiddenState(const T* hidden_states, int first, int last); explicit LlamaBatch(const EngineParam& param, std::unique_ptr> model, std::unique_ptr> ctx, std::shared_ptr state, + std::shared_ptr gateway, int device_id); ~LlamaBatch(); void Start(); - void Submit(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control); - - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - LlamaV2& model() noexcept { return *model_; @@ -144,14 +130,16 @@ class LlamaBatch { void tune(); private: + void BroadcastCancelFlags(); + + void ProcessCancelRequests(std::vector& signals); + 
void InternalThreadEntry(); void OutputThreadEntry(); void CopyState(const std::vector>& desc); - void SendSignals(std::vector signals); - // analogs to `std::copy_n` template U* Copy(const U* src, size_t count, U* dst) @@ -211,6 +199,7 @@ class LlamaBatch { private: const EngineParam param_; + const std::shared_ptr gateway_; const std::shared_ptr shared_state_; const int max_batch_size_; @@ -323,14 +312,6 @@ class LlamaBatch { std::thread internal_thread_; - // async stream callback utils - std::thread output_thread_; - std::mutex output_mutex_; - std::condition_variable output_cv_; - std::vector output_signals_; - bool output_stop_token_{false}; - ffi_api_lock_ctrl_t ffi_lock_; - int* h_output_ids_{}; }; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 05b22deed5..b25a8eac60 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -20,35 +20,30 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/kernels/attention/attention_params.h" -#include "src/turbomind/kernels/decoding_kernels.h" -#include "src/turbomind/kernels/gemm/tuner/params.h" -#include "src/turbomind/kernels/gpt_kernels.h" +#include +#include + #include "src/turbomind/macro.h" + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/unified_decoder.h" + +#include "src/turbomind/kernels/decoding_kernels.h" +#include "src/turbomind/kernels/gpt_kernels.h" + #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" -#include "src/turbomind/utils/monotonic.h" -#include -#include -#include -#include -#include -#include -#include namespace turbomind { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index a0d35b887f..8101310812 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,14 +21,9 @@ #pragma once -#include -#include - #include "src/turbomind/layers/DynamicDecodeLayer.h" -#include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_decoder.h" diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h deleted file mode 100644 index 2a715e9c9c..0000000000 --- a/src/turbomind/models/llama/Request.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
- -#pragma once - -#include "src/turbomind/utils/Tensor.h" -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -struct Request { - uint64_t id; // sequence id - uint64_t unique_id; // monotonic increasing - - bool start_flag; - bool end_flag; - bool stop_flag; - - // per rank inputs/outputs - TensorMap inputs; - TensorMap outputs; - - using Callback = std::function*)>; - Callback stream_cb; - - enum - { - kInvalid = 1, // Sequence not exist or both `start` & `stop` (instead of `end`) is set - kConflict = 2, // Concurrent requests to the same sequence - kBusy = 3, // Sequence is already running - kInactive = 4, // Sequence to `stop` is not active - kFail = 5, // Can't find sequence for `stop` request or internal error during inference - kTooLong = 6 // history + prompt > session_len - }; - - std::promise signal; -}; - -class RequestQueue { -public: - std::vector> enqueue(std::vector> requests) - { - std::vector> futures; - futures.reserve(requests.size()); - { - std::lock_guard lock(mutex_); - - if (closed_) { - throw std::runtime_error("Queue is closed"); - } - - for (auto& r : requests) { - futures.push_back(r->signal.get_future()); - if (r->stop_flag) { - stop_queue_.push(std::move(r)); - } - else { - infer_queue_.push(std::move(r)); - } - } - } - cv_.notify_one(); - return futures; - } - - void dequeue(std::vector>& stop_requests, - std::vector>& infer_requests, - unsigned max_infer_count, - bool blocking, - bool& abort) - { - std::unique_lock lock(mutex_); - if (blocking) { - cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; }); - if (closed_) { - abort = true; - return; - } - } - - stop_requests.clear(); - while (!stop_queue_.empty()) { - stop_requests.push_back(std::move(stop_queue_.front())); - stop_queue_.pop(); - } - - infer_requests.clear(); - while (!infer_queue_.empty() && infer_requests.size() < max_infer_count) { - infer_requests.push_back(std::move(infer_queue_.front())); - infer_queue_.pop(); - } - } - - void close() - { - { - std::lock_guard lock(mutex_); - closed_ = true; - } - cv_.notify_all(); - } - -private: - std::queue> stop_queue_; - std::queue> infer_queue_; - std::mutex mutex_; - std::condition_variable cv_; - bool closed_{false}; -}; - -} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 71792a4be8..042e858b09 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,6 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. 
#include +#include #include #include @@ -11,6 +12,7 @@ #include #include +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" @@ -22,18 +24,19 @@ namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; +using ft::ManagedTensor; +using ft::Tensor; + // prepare to bind container -using TensorVector = std::vector; -PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(ft::Tensor& tensor) +DLDevice getDLDevice(const ft::Tensor& tensor) { int device_id = 0; if (tensor.where == ft::MEMORY_GPU) { - cudaPointerAttributes ptr_attr; + cudaPointerAttributes ptr_attr{}; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; } @@ -57,12 +60,12 @@ DLDevice getDLDevice(ft::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ManagedTensor& tensor) { - DLDevice device = getDLDevice(tensor); + DLDevice device = getDLDevice(*tensor); DLDataType data_type{0, 0, 1}; - switch (tensor.type) { + switch (tensor->type) { case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; @@ -119,14 +122,26 @@ DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) default: break; } - DLTensor dl_tensor{const_cast(tensor.data), + ManagedTensor* ctx = new ManagedTensor(tensor); + DLTensor dl_tensor{const_cast((*ctx)->data), device, - (int32_t)(tensor.shape.size()), + (int32_t)((*ctx)->shape.size()), data_type, - reinterpret_cast(const_cast(tensor.shape.data())), + reinterpret_cast(const_cast((*ctx)->shape.data())), (int64_t*)(nullptr), 0}; - return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; + return new DLManagedTensor{dl_tensor, ctx, [](DLManagedTensor* dlmt) { // + // auto& x = *(ManagedTensor*)dlmt->manager_ctx; + // std::stringstream ss; + // ss << "("; + // for (const auto& d : x->shape) { + // ss << d << ","; + // } + // ss << ")"; + // std::cerr << "turbomind tensor dtor " << ss.str() << " " << std::endl; + delete (ManagedTensor*)dlmt->manager_ctx; + delete dlmt; + }}; } ft::MemoryType getMemoryType(DLDevice device) @@ -200,7 +215,7 @@ ft::DataType getDataType(DLDataType data_type) } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -209,14 +224,15 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tenso std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); -} - -DLTensor GetDLTensor(py::object obj) -{ - py::capsule cap = obj.attr("__dlpack__")(); - DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); - return dlmt->dl_tensor; + auto ret = std::make_shared(); + ret->tensor = Tensor(where, dtype, std::move(shape), data); + ret->data_holder.reset((void*)nullptr, [tensor](void*) { + // std::cerr << "dlpack tensor dtor" << std::endl; + if (tensor->deleter) { + tensor->deleter(tensor); + } + }); + return ret; } static void 
safe_memcpy(void* dst, const void* src, size_t size) @@ -264,6 +280,26 @@ static void safe_memcpy(void* dst, const void* src, size_t size) } } +namespace { + +struct ScopedGIL { + ScopedGIL(const ScopedGIL&) = delete; + ScopedGIL& operator=(const ScopedGIL&) = delete; + ScopedGIL(ScopedGIL&&) = delete; + ScopedGIL& operator=(ScopedGIL&&) = delete; + ScopedGIL() + { + state = PyGILState_Ensure(); + } + ~ScopedGIL() + { + PyGILState_Release(state); + } + PyGILState_STATE state; +}; + +} // namespace + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -272,7 +308,54 @@ PYBIND11_MODULE(_turbomind, m) .def("__str__", &ft::NcclParam::toString); // custom comm - py::class_>(m, "AbstractCustomComm"); + (void)py::class_>(m, "AbstractCustomComm"); + + py::class_(m, "SessionParam") + .def(py::init([](uint64_t id, int step, bool start, bool end) { + if (!start && end) { + throw std::logic_error("unsupported arguments: start=false, end=true"); + } + ft::SessionParam param{}; + param.id = id; + param.step = step; + param.start_flag = start; + param.end_flag = end; + return param; + }), + "id"_a, + "step"_a, + "start"_a, + "end"_a) + .def_readwrite("id", &ft::SessionParam::id) + .def_readwrite("step", &ft::SessionParam::step) + .def_readwrite("start", &ft::SessionParam::start_flag) + .def_readwrite("end", &ft::SessionParam::end_flag); + + py::class_(m, "GenerationConfig") + .def(py::init()) + .def_readwrite("max_new_tokens", &ft::GenerationConfig::max_new_tokens) + .def_readwrite("min_new_tokens", &ft::GenerationConfig::min_new_tokens) + .def_readwrite("top_p", &ft::GenerationConfig::top_p) + .def_readwrite("top_k", &ft::GenerationConfig::top_k) + .def_readwrite("min_p", &ft::GenerationConfig::min_p) + .def_readwrite("temperature", &ft::GenerationConfig::temperature) + .def_readwrite("repetition_penalty", &ft::GenerationConfig::repetition_penalty) + .def_readwrite("random_seed", &ft::GenerationConfig::random_seed) + .def_readwrite("output_logprobs", &ft::GenerationConfig::output_logprobs) + .def_readwrite("output_last_hidden_state", &ft::GenerationConfig::output_last_hidden_state) + .def_readwrite("output_logits", &ft::GenerationConfig::output_logits) + .def("__repr__", [](const ft::GenerationConfig& c) { + std::ostringstream oss; + oss << c; + return oss.str(); + }); + + py::class_>(m, "RequestState") + .def_readonly("status", &ft::RequestState::status) + .def_readonly("seq_len", &ft::RequestState::seq_len); + + py::class_>(m, "AtomicRequestState") + .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); // data type py::enum_(m, "DataType") @@ -299,45 +382,46 @@ PYBIND11_MODULE(_turbomind, m) .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &ft::Tensor::where) - .def_readonly("type", &ft::Tensor::type) - .def_readonly("shape", &ft::Tensor::shape) - .def_readonly("data", &ft::Tensor::data) - .def(py::init( - [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { - auto data_ptr = reinterpret_cast(data); - return new ft::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_property_readonly("where", [](const ManagedTensor& t) { return t->where; }) + .def_property_readonly("type", [](const ManagedTensor& t) { return t->type; }) + .def_property_readonly("shape", [](const ManagedTensor& t) { return t->shape; }) + .def_property_readonly("data", [](const ManagedTensor& t) { return t->data; }) .def( "view", - [](ft::Tensor* self, ft::DataType new_type) { 
- return new ft::Tensor(self->where, new_type, self->shape, self->data); + [](const ManagedTensor& self, ft::DataType new_type) { + auto x = self; + x->type = new_type; + return std::make_shared(std::move(x)); }, "new_type"_a) .def( "view", - [](ft::Tensor* self, std::vector new_shape) { - return new ft::Tensor(self->where, self->type, new_shape, self->data); + [](const ManagedTensor& self, std::vector new_shape) { + auto x = self; + x->shape = new_shape; + return std::make_shared(std::move(x)); }, "new_shape"_a) .def( "copy_from", - [](ft::Tensor* self, py::object obj) { + [](ManagedTensor& self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); + // take ownership of capsule's payload + cap.set_name("used_dltensor"); switch (self->type) { case ft::TYPE_FP16: case ft::TYPE_FP32: case ft::TYPE_INT32: case ft::TYPE_BF16: { - auto num_element = - std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_element = std::accumulate( + (*src)->shape.begin(), (*src)->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - safe_memcpy(const_cast(self->data), src->data, num_bytes); + safe_memcpy(const_cast(self->data), (*src)->data, num_bytes); break; } default: @@ -347,8 +431,8 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](ft::Tensor* self, long stream) { - DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); + [](ManagedTensor& self, long stream) { + DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(obj, kDlTensorCapsuleName)); @@ -363,7 +447,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](ft::Tensor* self) { + .def("__dlpack_device__", [](const ManagedTensor& self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -374,29 +458,59 @@ PYBIND11_MODULE(_turbomind, m) DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto ret = DLManagedTensorToTritonTensor(dlmt); + // take ownership of capsule's payload + cap.set_name("used_dltensor"); return ret; }, "dl_managed_tensor"_a); // transformer model instance - using ft::AbstractTransformerModelInstance; + using ft::ModelRequest; py::bind_map>(m, "TensorMap"); - py::class_(m, "AbstractTransformerModelInstance") + py::class_(m, "ModelRequest") .def( "forward", - [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { - return model->forward(input_tensors); + [](ModelRequest* model_request, + std::shared_ptr input_tensors, + const ft::SessionParam& session, + const ft::GenerationConfig& gen_cfg, + bool stream_output, + std::function cb) { + ModelRequest::InputParam param{}; + param.tensors = std::move(input_tensors); + param.session = session; + param.gen_cfg = gen_cfg; + param.stream_output = stream_output; + auto ret = model_request->Forward(std::move(param), [cb = std::move(cb)]() { + try { + cb(); + } + catch (const py::error_already_set& e) { + std::cerr << e.what() << std::endl; + } + }); + return std::make_tuple(std::move(ret.tensors), std::move(ret.state)); }, py::call_guard(), - "input_tensors"_a) + "input_tensors"_a, + 
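// --- [editorial sketch, not part of the patch] ------------------------------------
// The `cap.set_name("used_dltensor")` calls above follow the DLPack exchange
// protocol: once a consumer takes ownership of a "dltensor" capsule it renames it to
// "used_dltensor" so the producer's capsule destructor will not run the deleter a
// second time; the consumer then invokes the deleter itself when it is done with the
// memory. In isolation (assuming the standard dlpack header is available):

#include <dlpack/dlpack.h>

inline void ReleaseConsumedTensor(DLManagedTensor* dlmt)
{
    if (dlmt && dlmt->deleter) {
        dlmt->deleter(dlmt);  // frees the producer's resources exactly once
    }
}
// --- [end sketch] -------------------------------------------------------------------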
"session"_a, + "gen_cfg"_a, + "stream_output"_a, + "cb"_a) .def( - "register_callback", - [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { - self->registerCallback(cb, ctx.ptr()); + "cancel", + [](ModelRequest* model_request) { + model_request->Cancel(); // }, - "callback"_a, - "context"_a = nullptr) - .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); + py::call_guard()) + .def( + "end", + [](ModelRequest* model_request, std::function cb, uint64_t session_id) { + model_request->End(std::move(cb), session_id); // + }, + py::call_guard(), + "cb"_a, + "session_id"_a); // transformer model using ft::AbstractTransformerModel; @@ -410,25 +524,19 @@ PYBIND11_MODULE(_turbomind, m) size_t pipeline_para_size, int enable_custom_all_reduce, std::string data_type) -> std::shared_ptr { - auto gil_control = [state = PyGILState_STATE{}](int op) mutable { - if (op) { - state = PyGILState_Ensure(); - } - else { - PyGILState_Release(state); - } + auto gil_factory = [] { // + // erase the type + return std::static_pointer_cast(std::make_shared()); }; if (data_type == "half" || data_type == "fp16" || data_type == "float16" || data_type == "int4") { auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; } else if (data_type == "bf16" || data_type == "bfloat16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); @@ -437,8 +545,7 @@ PYBIND11_MODULE(_turbomind, m) else { #ifdef ENABLE_FP32 auto model = std::make_shared>( - tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->set_ffi_lock(gil_control); + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config, gil_factory); return model; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); @@ -466,21 +573,9 @@ PYBIND11_MODULE(_turbomind, m) "world_size"_a) .def( "create_model_instance", - [](AbstractTransformerModel* model, - int deviceId, - int rank, - long stream_id, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) { - cudaStream_t stream = reinterpret_cast(stream_id); - return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm); - }, + [](AbstractTransformerModel* model, int deviceId) { return model->createModelInstance(deviceId); }, py::call_guard(), - "device_id"_a, - "rank"_a, - "stream"_a, - "nccl_params"_a, - "custom_all_reduce_comm"_a = nullptr) + "device_id"_a) .def("create_shared_weights", &AbstractTransformerModel::createSharedWeights, py::call_guard(), @@ -489,8 +584,13 @@ PYBIND11_MODULE(_turbomind, m) .def( "get_params", [](AbstractTransformerModel* model, int deviceId, int rank) { - TensorMap output = model->getParams(deviceId, rank); - return output; + auto output = model->getParams(deviceId, rank); + TensorMap ret; + for (const auto& [k, v] : output) { + // export reference to weight data only (no ownership) + ret.emplace(k, ManagedTensor{v}); + } + return ret; }, 
py::call_guard(), "device_id"_a, diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index 26c580714a..7e193f6677 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -19,7 +19,6 @@ cmake_minimum_required(VERSION 3.8) set(llama_triton_backend_files LlamaTritonModel.cc - LlamaTritonModelInstance.cc ) find_package(CUDAToolkit REQUIRED) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 40c5ac8907..c4c3c00d6c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -24,15 +24,16 @@ #include #include +#include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" namespace turbomind { @@ -60,56 +61,6 @@ static std::optional get_moe_method() return value; } -std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) -{ - YAML::Node reader; - try { - reader = YAML::Load(config_file); - } - catch (const YAML::Exception& e) { - std::cerr << "Error reading YAML config: " << e.what() << std::endl; - FT_CHECK(false); - } - - const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; - const std::string data_type = ft_instance_hyperparameter["data_type"].as(); - int tensor_para_size = ft_instance_hyperparameter["tensor_para_size"].as(); - std::string model_dir = ft_instance_hyperparameter["model_dir"].as(); - - if (data_type == "half" || data_type == "fp16" || data_type == "float16") { - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); - } - else if (data_type == "bf16" || data_type == "bfloat16") { -#ifdef ENABLE_BF16 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - FT_CHECK(false); -#endif - } - else { -#ifdef ENABLE_FP32 - return std::make_shared>( - ft_instance_hyperparameter["tensor_para_size"].as(), - ft_instance_hyperparameter["pipeline_para_size"].as(), - ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), - model_dir); -#else - TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - FT_CHECK(false); -#endif - } - return nullptr; -} - template std::map> getLoraPattern(std::string pattern, T (*func)(const std::string& s)) { @@ -207,6 +158,9 @@ template LlamaTritonModel::~LlamaTritonModel() { FT_CHECK(weights_.size() == engines_.size()); + + gateway_->shutdown(); + for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources 
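// --- [editorial sketch, not part of the patch] ------------------------------------
// The `gil_factory` / `ffi_ctx_factory` plumbing above uses shared_ptr type erasure:
// a std::shared_ptr<void> remembers its concrete deleter, so engine code that never
// sees pybind11 or Python headers can still create and destroy a GIL guard around
// its callbacks. The trick in isolation (Guard is a stand-in for ScopedGIL):

#include <memory>

struct Guard {
    Guard()  { /* acquire, e.g. PyGILState_Ensure() */ }
    ~Guard() { /* release, e.g. PyGILState_Release() */ }
};

inline std::shared_ptr<void> MakeGuard()
{
    return std::static_pointer_cast<void>(std::make_shared<Guard>());
}

// holding the returned pointer keeps the guard alive; dropping it runs ~Guard()
// --- [end sketch] -------------------------------------------------------------------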
check_cuda_error(cudaSetDevice(device_id)); @@ -216,11 +170,17 @@ LlamaTritonModel::~LlamaTritonModel() } template -LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config): +LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory): + model_param_{}, + attn_param_{}, + moe_param_{}, + lora_param_{}, + engine_param_{}, tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), weights_(getDeviceCount()), @@ -333,6 +293,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, shared_state_ = std::make_shared(); shared_state_->barrier = std::make_shared(tensor_para_size); + gateway_ = std::make_shared(ffi_ctx_factory); + const auto device_count = getDeviceCount(); engines_.resize(device_count); @@ -398,6 +360,7 @@ LlamaTritonModel::createSharedModelInstance(int std::move(model), std::move(ctx), shared_state_, + gateway_, device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang @@ -410,22 +373,17 @@ LlamaTritonModel::createSharedModelInstance(int } template -std::unique_ptr -LlamaTritonModel::createModelInstance(int device_id, - int rank, - cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) +std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); - - allocator->setStream(stream); - - return std::make_unique>(*engines_[device_id], std::move(allocator), device_id); + return std::make_unique(gateway_.get(), + getTensorType(), + engine_param_.session_len, + model_param_.vocab_size, + model_param_.hidden_units); } template @@ -483,7 +441,6 @@ void LlamaTritonModel::createEngine(int { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); - engine->set_ffi_lock(ffi_lock_); engine->tune(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 8f473cd4cd..21b124e5a8 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -20,35 +20,29 @@ #pragma once +#include "src/turbomind/engine/gateway.h" #include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" +#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" #include -#include namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { - LlamaTritonModel(size_t tensor_para_size, - size_t pipeline_para_size, - int enable_custom_all_reduce, - std::string model_dir, - std::string config = ""); + LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory); ~LlamaTritonModel() override; - std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t 
stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + std::unique_ptr createModelInstance(int deviceId) override; void createSharedWeights(int deviceId, int rank) override; @@ -66,11 +60,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { void handleMissingParams(); - void set_ffi_lock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ = func; - } - std::string toString() override; int getTensorParaSize() override; int getPipelineParaSize() override; @@ -91,6 +80,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t pipeline_para_size_; std::shared_ptr shared_state_; + std::shared_ptr gateway_; + // Weights & engine instances for the ranks std::vector>> weights_; std::vector>> engines_; @@ -100,8 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { std::string model_name_; std::string model_dir_; - - ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc deleted file mode 100644 index 976fc9cc1d..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/macro.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/constant.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -template -void triton_stream_callback(std::unordered_map* outputs, void* ctx) -{ - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); -} - -template -LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id): - device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) -{ -} - -template -std::string format_vector(const std::vector& vec) -{ - std::stringstream ss; - ss << "["; - bool first = true; - for (const auto& x : vec) { - ss << (first ? 
"" : ", ") << x; - first = false; - } - ss << "]"; - return ss.str(); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> inputs) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - // In some cases, this is needed to trigger the creation of CUDA context, or later `cudaMallocAsync` will die - check_cuda_error(cudaSetDevice(device_id_)); - - FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, - "inputs->at(\"input_lengths\").shape.size() == 1"); - - const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, - (int*)inputs->at("request_output_len").data - + inputs->at("request_output_len").shape[0]); - // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; - FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - const size_t max_input_len = inputs->at("input_ids").shape[1]; - const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; - - const size_t vocab_size = instance_->model().vocab_size(); - - allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - - std::unordered_map outputs{ - {"output_ids", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, - {"sequence_length", - Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; - - if (inputs->count("is_return_log_probs") && *((bool*)inputs->at("is_return_log_probs").data)) { - outputs.insert({"output_log_probs", - Tensor{MEMORY_GPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - outputs.insert( - {"cum_log_probs", - Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); - } - - if (inputs->count("logprobs")) { - size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; - h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc( - h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( - h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - - outputs.insert({{"logprob_vals", - Tensor{MEMORY_CPU, - TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_vals_}}}); - - outputs.insert({{"logprob_indexes", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, - h_logprob_indexes_}}}); - - outputs.insert({{"logprob_nums", - Tensor{MEMORY_CPU, - TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); - } - - if (is_return_logits) { - outputs.insert( - {{"logits", {MEMORY_GPU, 
TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); - } - - try { - Request::Callback callback; - - if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { - triton_stream_callback(outputs, this); - }; - } - - check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - - instance_->Submit(&outputs, inputs.get(), {callback}); - // ! stream synced by the model before returning - } - catch (...) { - h_exception_ = std::current_exception(); - outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); - } - - return std::make_shared>(std::move(outputs)); -} - -template -LlamaTritonModelInstance::~LlamaTritonModelInstance() -{ - freeBuffer(); -} - -template -void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits) -{ - d_output_ids_ = (int*)std::realloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len); - d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width); - - if (is_return_logits) { - d_output_logits_ = (float*)allocator_->reMalloc(d_output_logits_, - sizeof(float) * request_batch_size * max_input_len - * instance_->model().vocab_size(), - false); - } -} - -template -void LlamaTritonModelInstance::freeBuffer() -{ - std::free(d_output_ids_); - std::free(d_sequence_lengths_); - allocator_->free((void**)(&d_output_log_probs_)); - allocator_->free((void**)(&d_cum_log_probs_)); - std::free(h_total_output_lengths_); - std::free(h_logprob_vals_); - std::free(h_logprob_indexes_); - std::free(h_logprob_nums_); -} - -#ifdef ENABLE_FP32 -template struct LlamaTritonModelInstance; -#endif -template struct LlamaTritonModelInstance; -#ifdef ENABLE_BF16 -template struct LlamaTritonModelInstance<__nv_bfloat16>; -#endif - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h deleted file mode 100644 index 2cf69b9fa5..0000000000 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h - -#pragma once - -#include - -#include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/models/llama/LlamaV2.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - -namespace turbomind { - -template -struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - - LlamaTritonModelInstance(Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance() override; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - -private: - Engine* instance_; - const std::unique_ptr> allocator_; - - void allocateBuffer(const size_t request_batch_size, - const size_t max_input_len, - const size_t beam_width, - const size_t session_len, - const bool is_return_logits); - void freeBuffer(); - - int device_id_; - - int* d_input_ids_ = nullptr; - int* d_input_lengths_ = nullptr; - int* d_input_bad_words_ = nullptr; - int* d_input_stop_words_ = nullptr; - int* d_request_prompt_lengths_ = nullptr; - T* d_request_prompt_embedding_ = nullptr; - float* d_top_p_decay_ = nullptr; - float* d_top_p_min_ = nullptr; - int* d_top_p_reset_ids_ = nullptr; - - int* d_output_ids_ = nullptr; - int* d_sequence_lengths_ = nullptr; - float* d_output_log_probs_ = nullptr; - float* d_cum_log_probs_ = nullptr; - float* d_output_logits_ = nullptr; - - float* h_logprob_vals_ = nullptr; - uint32_t* h_logprob_indexes_ = nullptr; - uint32_t* h_logprob_nums_ = nullptr; - - uint32_t* h_total_output_lengths_ = nullptr; - std::exception_ptr h_exception_ = nullptr; -}; - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 6d49df4578..7e1e235160 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -22,16 +22,18 @@ #include #include -#include +#include + #ifdef __linux__ #include #endif -#include #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" +#include "src/turbomind/engine/model_request.h" + namespace turbomind { using triton_stream_cb_t = std::function>, void*)>; @@ -62,7 +64,6 @@ struct AbstractTransformerModelInstance { }; struct AbstractTransformerModel { - static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; @@ -72,12 +73,7 @@ struct AbstractTransformerModel { virtual void createCustomComms(std::vector>* custom_all_reduce_comms, int world_size) = 0; - virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + virtual std::unique_ptr createModelInstance(int deviceId) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index b2b8524e09..1f6f737b72 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -530,4 +530,29 @@ class TensorMap { void saveNpy(const std::string& base_folder); }; +struct ManagedTensor { + Tensor tensor; + std::shared_ptr data_holder; + + Tensor* operator->() noexcept + { + return &tensor; + } + + const 
Tensor* operator->() const noexcept + { + return &tensor; + } + + Tensor& operator*() noexcept + { + return tensor; + } + + const Tensor& operator*() const noexcept + { + return tensor; + } +}; + } // namespace turbomind diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 8311e6eb9e..ac1664bdd2 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -306,7 +306,8 @@ inline std::string getDeviceName() return std::string(props.name); } -inline int div_up(int a, int n) +template +inline T div_up(T a, T n) { return (a + n - 1) / n; } From a6d4b4adb11914fa32089428a012146b3c99fe43 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Sat, 11 Jan 2025 12:38:27 +0800 Subject: [PATCH 2/4] fix xcomposer2 when transformers is upgraded greater than 4.46 (#3001) --- lmdeploy/vl/model/xcomposer2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index 3c72d0c29f..312ef9132b 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -156,6 +156,9 @@ def build_model(self): trust_remote_code=True) model.vit.load_model() model.vit.resize_pos() + if hasattr(self.hf_config, 'img_size'): + model.vit.vision_tower.vision_model.embeddings.image_size = \ + self.hf_config.img_size model.vit.vision_tower.vision_model.post_layernorm.to_empty( device='cpu').half() self.vl_model = model From 551e6d532bab2194a59c0c89fdbb687c5f66ff8f Mon Sep 17 00:00:00 2001 From: Wei Tao <1136862851@qq.com> Date: Sun, 12 Jan 2025 17:21:07 +0800 Subject: [PATCH 3/4] [dlinfer]rm rope reshape (#2984) --- lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py index 0f13f3f38c..0fd07cf10c 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py @@ -15,15 +15,12 @@ def apply_rotary_pos_emb( ) -> Tuple[Tensor, Tensor]: query_states = query_states.contiguous() key_states = key_states.contiguous() - bs = query_states.shape[0] query_states_reshaped = query_states.unsqueeze(0) key_states_reshaped = key_states.unsqueeze(0) - cos_reshaped = cos.reshape(1, bs, 1, -1) - sin_reshaped = sin.reshape(1, bs, 1, -1) query_states_reshaped, key_states_reshaped = \ ext_ops.apply_rotary_pos_emb(query_states_reshaped, key_states_reshaped, - cos_reshaped, sin_reshaped, + cos, sin, None, None) if q_embed is None: q_embed = query_states_reshaped.view(query_states.shape) From 086481ed84b59bee3b8e4274e5fc69620040c048 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Sun, 12 Jan 2025 17:33:47 +0800 Subject: [PATCH 4/4] Expose spaces_between_special_tokens (#2991) --- lmdeploy/messages.py | 4 ++++ lmdeploy/serve/async_engine.py | 4 +++- lmdeploy/serve/gradio/vl.py | 4 +++- lmdeploy/serve/openai/api_server.py | 31 ++++++++++++++++++++++------- lmdeploy/serve/openai/protocol.py | 2 ++ 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index d4e6571b79..11626f44a2 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -52,6 +52,9 @@ class GenerationConfig: ignoring the number of tokens in the prompt. skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. 
+ spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. logprobs (int): Number of log probabilities to return per output token. response_format (Dict): Only pytorch backend support formatting response. Examples: @@ -94,6 +97,7 @@ class GenerationConfig: bad_token_ids: List[int] = None min_new_tokens: int = None skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True logprobs: int = None response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index d7366c654b..2b2d02b38f 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -781,7 +781,9 @@ def is_error(status): response, state = self.tokenizer.detokenize_incrementally( token_ids, state, - skip_special_tokens=gen_config.skip_special_tokens) + skip_special_tokens=gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) res = token_ids[ids_offset:] out = GenOut(response, history_len, input_len, gen_len, diff --git a/lmdeploy/serve/gradio/vl.py b/lmdeploy/serve/gradio/vl.py index bf8ee87e68..26f23613af 100644 --- a/lmdeploy/serve/gradio/vl.py +++ b/lmdeploy/serve/gradio/vl.py @@ -151,7 +151,9 @@ def chat(chatbot, session, max_new_tokens, top_p, top_k, temperature): response, state = engine.tokenizer.detokenize_incrementally( res, state, - skip_special_tokens=gen_config.skip_special_tokens) + skip_special_tokens=gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) # noqa if chatbot[-1][1] is None: chatbot[-1][1] = '' history[-1][1] = '' diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index c37f7572a1..a284250f21 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -149,7 +149,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer, skip_special_tokens: bool = True, offset: int = 0, all_token_ids: List[int] = None, - state: DetokenizeState = None): + state: DetokenizeState = None, + spaces_between_special_tokens: bool = True): """create openai LogProbs for completion. Args: @@ -162,6 +163,9 @@ def _create_completion_logprobs(tokenizer: Tokenizer, offset (int): text offset. all_token_ids (int): the history output token ids. state (DetokenizeState): tokenizer decode state. + spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. """ if logprobs is None or len(logprobs) == 0: return None, None, None, None @@ -183,7 +187,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer, response, _state = tokenizer.detokenize_incrementally( all_token_ids + [top_id], copy.deepcopy(state), - skip_special_tokens=skip_special_tokens) + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens) res[response] = prob if top_id == token_id: out_state = _state @@ -323,6 +328,9 @@ async def chat_completions_v1(request: ChatCompletionRequest, - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. + - spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. 
The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. - min_new_tokens (int): To generate at least numbers of tokens. - min_p (float): Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between @@ -393,7 +401,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, logits_processors=logits_processors, min_new_tokens=request.min_new_tokens, min_p=request.min_p, - random_seed=random_seed) + random_seed=random_seed, + spaces_between_special_tokens=request.spaces_between_special_tokens) tools = None if request.tools and request.tool_choice != 'none': @@ -581,6 +590,9 @@ async def completions_v1(request: CompletionRequest, - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. + - spaces_between_special_tokens (bool): Whether or not to add spaces + around special tokens. The behavior of Fast tokenizers is to have + this to False. This is setup to True in slow tokenizers. - top_k (int): The number of the highest probability vocabulary tokens to keep for top-k-filtering @@ -623,7 +635,8 @@ async def completions_v1(request: CompletionRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, - random_seed=random_seed) + random_seed=random_seed, + spaces_between_special_tokens=request.spaces_between_special_tokens) generators = [] for i in range(len(request.prompt)): result_generator = VariableInterface.async_engine.generate( @@ -672,7 +685,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: VariableInterface.async_engine.tokenizer, res.token_ids, res.logprobs, gen_config.skip_special_tokens, offset, all_token_ids, - state) + state, gen_config.spaces_between_special_tokens) if request.stream_options and request.stream_options.include_usage: # noqa E501 final_res = res total_tokens = sum([ @@ -724,8 +737,12 @@ async def _inner_call(i, generator): logprobs = None if request.logprobs and len(final_logprobs): logprobs, _, _, _ = _create_completion_logprobs( - VariableInterface.async_engine.tokenizer, final_token_ids, - final_logprobs, gen_config.skip_special_tokens) + VariableInterface.async_engine.tokenizer, + final_token_ids, + final_logprobs, + gen_config.skip_special_tokens, + spaces_between_special_tokens=gen_config. + spaces_between_special_tokens) assert final_res is not None choice_data = CompletionResponseChoice( diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 2b9d39c7b7..a6f945ac13 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -135,6 +135,7 @@ class ChatCompletionRequest(BaseModel): session_id: Optional[int] = -1 ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True + spaces_between_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 seed: Optional[int] = None min_new_tokens: Optional[int] = Field(default=None, examples=[None]) @@ -251,6 +252,7 @@ class CompletionRequest(BaseModel): session_id: Optional[int] = -1 ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True + spaces_between_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 # for opencompass seed: Optional[int] = None
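
For reference, a minimal offline-pipeline sketch of the options touched by this series. It is illustrative only: the model id and prompt are placeholders, and it assumes the public `lmdeploy.pipeline` / `GenerationConfig` entry points together with the `spaces_between_special_tokens` field and the `index` response attribute introduced in these patches.

    # Illustrative sketch: model id and prompt are placeholders.
    from lmdeploy import GenerationConfig, pipeline

    pipe = pipeline('internlm/internlm2_5-7b-chat')  # placeholder model id

    # Keep special tokens in the decoded text, but do not insert spaces around
    # them (the fast-tokenizer behaviour described in the docstring above).
    gen_config = GenerationConfig(
        max_new_tokens=128,
        skip_special_tokens=False,
        spaces_between_special_tokens=False,
    )

    responses = pipe(['Hi, please introduce yourself'], gen_config=gen_config)
    for r in responses:
        # `index` identifies which prompt a response belongs to; this series
        # switches the batch tests from `session_id` to `index`.
        print(r.index, r.finish_reason, r.text)
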
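The same option is exposed through the OpenAI-compatible server via the new `spaces_between_special_tokens` field on `ChatCompletionRequest` and `CompletionRequest`. A hedged client-side sketch follows; it assumes the `openai` Python package (>=1.0) and a server started with `lmdeploy serve api_server <model>` on the default port, with the model name and URL as placeholders.

    # Illustrative sketch: assumes a running lmdeploy api_server; URL and model
    # name are placeholders.
    from openai import OpenAI

    client = OpenAI(base_url='http://0.0.0.0:23333/v1', api_key='none')

    resp = client.chat.completions.create(
        model='placeholder-model',
        messages=[{'role': 'user', 'content': 'Hello'}],
        # The fields added in this patch are not part of the official OpenAI
        # schema, so the client passes them through `extra_body`.
        extra_body={
            'skip_special_tokens': False,
            'spaces_between_special_tokens': False,
        },
    )
    print(resp.choices[0].message.content)
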