diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index b26c7595f..e63e3bb6b 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -61,14 +61,14 @@ def get_model_name():
         "batch_size": [1, 4],
         "seq_length": [16, 32],
         "worker": 1,
-        "stream_output": True,
+        "stream": [True],
     },
     "t5-large": {
         "max_memory_per_gpu": [5.0],
         "batch_size": [1],
         "seq_length": [32],
         "worker": 1,
-        "stream_output": True,
+        "stream": [True],
     },
     "gpt4all-lora": {
         "max_memory_per_gpu": [10.0, 12.0],
@@ -1396,6 +1396,7 @@ def test_handler_rolling_batch(model, model_spec):
     spec = model_spec[args.model]
     if "worker" in spec:
         check_worker_number(spec["worker"])
+    stream_values = spec.get("stream", [False, True])
     # dryrun phase
     req = {"inputs": batch_generation(1)[0]}
     seq_length = 100
@@ -1405,20 +1406,25 @@ def test_handler_rolling_batch(model, model_spec):
         req["parameters"].update(spec["parameters"])
     if "adapters" in spec:
         req["adapters"] = spec.get("adapters")[0]
-    LOGGER.info(f"req {req}")
-    res = send_json(req)
-    message = res.content.decode("utf-8")
-    LOGGER.info(f"res: {message}")
-    response_checker(res, message)
+
+    for stream in stream_values:
+        req["stream"] = stream
+        LOGGER.info(f"req {req}")
+        res = send_json(req)
+        message = res.content.decode("utf-8")
+        LOGGER.info(f"res: {message}")
+        response_checker(res, message)
 
     # awscurl little benchmark phase
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
-            LOGGER.info(
-                f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
-            )
-            req["parameters"]["max_new_tokens"] = seq_length
-            awscurl_run(req, spec.get("tokenizer", None), batch_size)
+            for stream in stream_values:
+                req["stream"] = stream
+                LOGGER.info(
+                    f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
+                )
+                req["parameters"]["max_new_tokens"] = seq_length
+                awscurl_run(req, spec.get("tokenizer", None), batch_size)
 
 
 def test_handler_adapters(model, model_spec):
@@ -1426,6 +1432,7 @@ def test_handler_adapters(model, model_spec):
     spec = model_spec[args.model]
     if "worker" in spec:
         check_worker_number(spec["worker"])
+    stream_values = spec.get("stream", [False, True])
     # dryrun phase
     reqs = []
     inputs = batch_generation(len(spec.get("adapters")))
@@ -1440,24 +1447,28 @@ def test_handler_adapters(model, model_spec):
         req["parameters"] = params
         req["adapters"] = adapter
         reqs.append(req)
-    LOGGER.info(f"reqs {reqs}")
     for req in reqs:
-        res = send_json(req)
-        message = res.content.decode("utf-8")
-        LOGGER.info(f"res: {message}")
-        response_checker(res, message)
+        for stream in stream_values:
+            req["stream"] = stream
+            LOGGER.info(f"req: {req}")
+            res = send_json(req)
+            message = res.content.decode("utf-8")
+            LOGGER.info(f"res: {message}")
+            response_checker(res, message)
     # awscurl little benchmark phase
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
-            LOGGER.info(
-                f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
-            )
-            for req in reqs:
-                req["parameters"]["max_new_tokens"] = seq_length
-            awscurl_run(reqs,
-                        spec.get("tokenizer", None),
-                        batch_size,
-                        dataset=True)
+            for stream in stream_values:
+                LOGGER.info(
+                    f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
+                )
+                for req in reqs:
+                    req["parameters"]["max_new_tokens"] = seq_length
+                    req["stream"] = stream
+                awscurl_run(reqs,
+                            spec.get("tokenizer", None),
+                            batch_size,
+                            dataset=True)
     # Test removing and querying invalid/removed adapter
     del_adapter = spec.get("adapters")[0]
     res = requests.delete(
@@ -1489,6 +1500,7 @@ def test_handler_rolling_batch_chat(model, model_spec):
     spec = model_spec[args.model]
     if "worker" in spec:
         check_worker_number(spec["worker"])
+    stream_values = spec.get("stream", [False, True])
     # dryrun phase
     req = {"messages": batch_generation_chat(1)[0]}
     seq_length = 100
@@ -1497,17 +1509,20 @@ def test_handler_rolling_batch_chat(model, model_spec):
     req["top_logprobs"] = 1
     if "adapters" in spec:
         req["adapters"] = spec.get("adapters")[0]
-    LOGGER.info(f"req {req}")
-    res = send_json(req)
-    LOGGER.info(f"res: {res.content}")
-    # awscurl little benchmark phase
-    for i, batch_size in enumerate(spec["batch_size"]):
-        for seq_length in spec["seq_length"]:
-            LOGGER.info(
-                f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
-            )
-            req["max_tokens"] = seq_length
-            awscurl_run(req, spec.get("tokenizer", None), batch_size)
+
+    for stream in stream_values:
+        req["stream"] = stream
+        LOGGER.info(f"req {req}")
+        res = send_json(req)
+        LOGGER.info(f"res: {res.content}")
+        # awscurl little benchmark phase
+        for i, batch_size in enumerate(spec["batch_size"]):
+            for seq_length in spec["seq_length"]:
+                LOGGER.info(
+                    f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
+                )
+                req["max_tokens"] = seq_length
+                awscurl_run(req, spec.get("tokenizer", None), batch_size)
 
 
 def test_handler(model, model_spec):
@@ -1515,38 +1530,41 @@ def test_handler(model, model_spec):
     spec = model_spec[args.model]
     if "worker" in spec:
         check_worker_number(spec["worker"])
+    stream_values = spec.get("stream", [False, True])
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
-            if "t5" in model:
-                req = {"inputs": t5_batch_generation(batch_size)}
-            else:
-                req = {"inputs": batch_generation(batch_size)}
-            if spec.get("adapters", []):
-                req["adapters"] = spec.get("adapters")
-            params = {"max_new_tokens": seq_length}
-            if spec.get("details", False):
-                params["details"] = True
-            req["parameters"] = params
-            LOGGER.info(f"req {req}")
-            res = send_json(req)
-            if spec.get("stream_output", False):
-                LOGGER.info(f"res: {res.content}")
-                result = res.content.decode().split("\n")[:-1]
-                assert len(
-                    result
-                ) <= seq_length, "generated more tokens than max_new_tokens"
-            else:
-                res = res.json()
-                LOGGER.info(f"res {res}")
-                if isinstance(res, list):
-                    result = [item['generated_text'] for item in res]
-                    assert len(result) == batch_size
-                elif isinstance(res, dict):
-                    assert 1 == batch_size
-            if "max_memory_per_gpu" in spec:
-                validate_memory_usage(spec["max_memory_per_gpu"][i])
-            if "tokenizer" in spec:
-                awscurl_run(req, spec.get("tokenizer"), batch_size)
+            for stream in stream_values:
+                if "t5" in model:
+                    req = {"inputs": t5_batch_generation(batch_size)}
+                else:
+                    req = {"inputs": batch_generation(batch_size)}
+                if spec.get("adapters", []):
+                    req["adapters"] = spec.get("adapters")
+                params = {"max_new_tokens": seq_length}
+                if spec.get("details", False):
+                    params["details"] = True
+                req["parameters"] = params
+                req["stream"] = stream
+                LOGGER.info(f"req {req}")
+                res = send_json(req)
+                if stream:
+                    LOGGER.info(f"res: {res.content}")
+                    result = res.content.decode().split("\n")[:-1]
+                    assert len(
+                        result
+                    ) <= seq_length, "generated more tokens than max_new_tokens"
+                else:
+                    res = res.json()
+                    LOGGER.info(f"res {res}")
+                    if isinstance(res, list):
+                        result = [item['generated_text'] for item in res]
+                        assert len(result) == batch_size
+                    elif isinstance(res, dict):
+                        assert 1 == batch_size
+                if "max_memory_per_gpu" in spec:
+                    validate_memory_usage(spec["max_memory_per_gpu"][i])
+                if "tokenizer" in spec:
+                    awscurl_run(req, spec.get("tokenizer"), batch_size)
 
 
 def log_awscurl_benchmark(metric_name: str,
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index c17507da9..8500e864d 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -190,7 +190,6 @@
         "option.max_rolling_batch_size": 4,
         "option.model_loading_timeout": 2400,
         "option.load_split_model": True,
-        "option.output_formatter": "jsonlines"
     },
     "llama-3-8b-rb-vllm": {
         "option.model_id": "s3://djl-llm/llama-3-8b-hf/",
@@ -199,7 +198,6 @@
         "option.max_rolling_batch_size": 4,
         "option.rolling_batch": 'vllm',
         "option.model_loading_timeout": 2400,
-        "option.output_formatter": "jsonlines"
     },
     "tiny-llama-rb-vllm": {
         "option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/",
@@ -209,7 +207,6 @@
         "option.rolling_batch": 'vllm',
         "option.model_loader": 'vllm',
         "option.model_loading_timeout": 1200,
-        "option.output_formatter": "jsonlines"
     },
     "mistral-7b-rb": {
         "option.model_id": "s3://djl-llm/mistral-7b-instruct-v02/",
@@ -225,7 +222,6 @@
         "option.tensor_parallel_degree": 12,
         "option.max_rolling_batch_size": 1,
         "option.model_loading_timeout": 3600,
-        "option.output_formatter": "jsonlines"
     },
     "llama-speculative-compiled-rb": {
         "option.model_id": "s3://djl-llm/llama-2-13b-hf/",
@@ -238,7 +234,6 @@
         "option.tensor_parallel_degree": 12,
         "option.max_rolling_batch_size": 1,
         "option.model_loading_timeout": 3600,
-        "option.output_formatter": "jsonlines"
     },
     "tiny-llama-rb-aot": {
         "option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/",
@@ -603,13 +598,11 @@
         "option.model_id": "s3://djl-llm/llama-2-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.max_rolling_batch_size": 32,
-        "option.output_formatter": "jsonlines"
     },
     "mixtral-8x7b": {
         "option.model_id": "s3://djl-llm/mixtral-8x7b/",
         "option.tensor_parallel_degree": 8,
         "option.max_rolling_batch_size": 32,
-        "option.output_formatter": "jsonlines"
     },
     "qwen2-7b-fp8": {
         "option.model_id": "neuralmagic/Qwen2-7B-Instruct-FP8",
@@ -807,7 +800,6 @@
         "option.model_id": "s3://djl-llm/llama-2-13b-hf/",
         "option.tensor_parallel_degree": 4,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
     },
     "llama2-7b-smoothquant": {
         "option.model_id": "s3://djl-llm/meta-llama-Llama-2-7b-chat-hf/",
@@ -816,25 +808,21 @@
         "option.smoothquant_per_token": "True",
         "option.smoothquant_per_channel": "True",
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
     },
     "internlm-7b": {
         "option.model_id": "internlm/internlm-7b",
         "option.tensor_parallel_degree": 4,
-        "option.output_formatter": "jsonlines",
         "option.trust_remote_code": True
     },
     "baichuan2-13b": {
         "option.model_id": "s3://djl-llm/baichuan2-13b/",
         "option.tensor_parallel_degree": 4,
         "option.baichuan_model_version": "v2_13b",
-        "option.output_formatter": "jsonlines",
         "option.trust_remote_code": True
     },
     "chatglm3-6b": {
         "option.model_id": "s3://djl-llm/chatglm3-6b/",
         "option.tensor_parallel_degree": 4,
-        "option.output_formatter": "jsonlines",
         "option.trust_remote_code": True,
         "option.chatglm_model_version": "chatglm3"
     },
@@ -842,7 +830,6 @@
         "option.model_id": "s3://djl-llm/mistral-7b/",
         "option.tensor_parallel_degree": 4,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines"
     },
     "gpt-j-6b": {
         "option.model_id": "s3://djl-llm/gpt-j-6b/",
@@ -851,13 +838,11 @@
         "option.max_output_len": 256,
         "option.max_rolling_batch_size": 16,
         "option.rolling_batch": "auto",
-        "option.output_formatter": "jsonlines"
     },
     "qwen-7b": {
         "option.model_id": "Qwen/Qwen-7B",
         "option.tensor_parallel_degree": 4,
         "option.trust_remote_code": True,
-        "option.output_formatter": "jsonlines"
     },
     "gpt2": {
         "option.model_id": "gpt2",
@@ -865,7 +850,6 @@
         "option.max_rolling_batch_size": 16,
         "option.trust_remote_code": True,
         "option.max_draft_len": 20,
-        "option.output_formatter": "jsonlines"
     },
     "santacoder": {
         "option.model_id": "bigcode/santacoder",
@@ -873,21 +857,18 @@
         "option.max_rolling_batch_size": 16,
         "option.trust_remote_code": True,
         "option.gpt_model_version": "santacoder",
-        "option.output_formatter": "jsonlines"
     },
     "llama2-70b": {
         "option.model_id": "s3://djl-llm/llama-2-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.use_custom_all_reduce": True,
         "option.max_rolling_batch_size": 32,
-        "option.output_formatter": "jsonlines"
     },
     "mixtral-8x7b": {
         "option.model_id": "s3://djl-llm/mixtral-8x7b/",
         "option.tensor_parallel_degree": 8,
         "option.use_custom_all_reduce": False,
         "option.max_rolling_batch_size": 32,
-        "option.output_formatter": "jsonlines"
     },
     "llama2-7b-chat": {
         "option.model_id": "s3://djl-llm/meta-llama-Llama-2-7b-chat-hf/",
@@ -984,54 +965,46 @@
         "option.model_id": "s3://djl-llm/llama-3-8b-hf/",
         "option.tensor_parallel_degree": 1,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines"
     },
     "llama3-8b-tp4-awq": {
         "option.model_id": "s3://djl-llm/llama-3-8b-hf/",
         "option.tensor_parallel_degree": 4,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "awq"
     },
     "llama3-8b-tp4-fp8": {
         "option.model_id": "s3://djl-llm/llama-3-8b-hf/",
         "option.tensor_parallel_degree": 4,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "fp8"
     },
     "llama3-8b-tp4-smoothquant": {
         "option.model_id": "s3://djl-llm/llama-3-8b-hf/",
         "option.tensor_parallel_degree": 4,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "smoothquant"
     },
     "llama3-70b-tp8-fp16": {
         "option.model_id": "s3://djl-llm/llama-3-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines"
     },
     "llama3-70b-tp8-awq": {
         "option.model_id": "s3://djl-llm/llama-3-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "awq"
     },
     "llama3-70b-tp8-fp8": {
         "option.model_id": "s3://djl-llm/llama-3-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "fp8"
     },
     "llama3-70b-tp8-smoothquant": {
         "option.model_id": "s3://djl-llm/llama-3-70b-hf/",
         "option.tensor_parallel_degree": 8,
         "option.rolling_batch": "trtllm",
-        "option.output_formatter": "jsonlines",
         "option.quantize": "smoothquant"
     }
 }
@@ -1289,7 +1262,6 @@ def build_lmi_dist_model(model):
     options = lmi_dist_model_list[model]
     options["engine"] = "MPI"
     options["option.rolling_batch"] = "lmi-dist"
-    options["option.output_formatter"] = "jsonlines"
 
     adapter_ids = options.pop("adapter_ids", [])
     adapter_names = options.pop("adapter_names", [])
@@ -1307,7 +1279,6 @@ def build_vllm_model(model):
     options = vllm_model_list[model]
     options["engine"] = "Python"
     options["option.rolling_batch"] = "vllm"
-    options["option.output_formatter"] = "jsonlines"
 
     adapter_ids = options.pop("adapter_ids", [])
     adapter_names = options.pop("adapter_names", [])
@@ -1336,7 +1307,6 @@ def build_lmi_dist_aiccl_model(model):
     options["option.task"] = "text-generation"
     options["option.tensor_parallel_degree"] = 8
     options["option.rolling_batch"] = "lmi-dist"
-    options["option.output_formatter"] = "jsonlines"
     options["option.max_rolling_batch_size"] = 16
     write_model_artifacts(options)
 
@@ -1379,7 +1349,6 @@ def build_correctness_model(model):
             f"{model} is not one of the supporting handler {list(correctness_model_list.keys())}"
         )
     options = correctness_model_list[model]
-    options["option.output_formatter"] = "json"
     write_model_artifacts(options)