
Commit

[ci][lmi] remove output formatter from model configurations, test streaming/non-streaming on client side
siddvenk committed Sep 23, 2024
1 parent 1e98449 commit 138736b
Showing 2 changed files with 79 additions and 98 deletions.
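For orientation, the sketch below shows the client-side pattern the updated tests adopt: each request is sent once with "stream": false and once with "stream": true, and the response is checked in the matching way. It is a minimal illustration only; the ENDPOINT URL and the newline-delimited streaming format are assumptions, not taken from this repository.

# Hypothetical, self-contained sketch of the streaming/non-streaming client
# pattern used by the updated tests. ENDPOINT and the newline-delimited
# streaming format are assumptions for illustration.
import requests

ENDPOINT = "http://127.0.0.1:8080/invocations"  # assumed local inference endpoint


def send_and_check(req: dict, max_new_tokens: int) -> None:
    req.setdefault("parameters", {})["max_new_tokens"] = max_new_tokens
    for stream in [False, True]:
        req["stream"] = stream
        res = requests.post(ENDPOINT, json=req)
        assert res.status_code == 200, f"request failed with {res.status_code}"
        if stream:
            # Assumption: streamed output arrives as newline-delimited chunks.
            chunks = [c for c in res.content.decode("utf-8").split("\n") if c]
            assert chunks, "streaming response returned no chunks"
        else:
            body = res.json()
            # Non-streaming responses are a dict, or a list when batched.
            assert isinstance(body, (list, dict))


if __name__ == "__main__":
    send_and_check({"inputs": "Deep learning is"}, max_new_tokens=32)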
146 changes: 79 additions & 67 deletions tests/integration/llm/client.py
@@ -61,14 +61,12 @@ def get_model_name():
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 1,
"stream_output": True,
},
"t5-large": {
"max_memory_per_gpu": [5.0],
"batch_size": [1],
"seq_length": [32],
"worker": 1,
"stream_output": True,
},
"gpt4all-lora": {
"max_memory_per_gpu": [10.0, 12.0],
@@ -1405,20 +1403,25 @@ def test_handler_rolling_batch(model, model_spec):
req["parameters"].update(spec["parameters"])
if "adapters" in spec:
req["adapters"] = spec.get("adapters")[0]
LOGGER.info(f"req {req}")
res = send_json(req)
message = res.content.decode("utf-8")
LOGGER.info(f"res: {message}")
response_checker(res, message)

for stream in [False, True]:
req["stream"] = stream
LOGGER.info(f"req {req}")
res = send_json(req)
message = res.content.decode("utf-8")
LOGGER.info(f"res: {message}")
response_checker(res, message)

# awscurl little benchmark phase
for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
req["parameters"]["max_new_tokens"] = seq_length
awscurl_run(req, spec.get("tokenizer", None), batch_size)
for stream in [False, True]:
req["stream"] = stream
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
req["parameters"]["max_new_tokens"] = seq_length
awscurl_run(req, spec.get("tokenizer", None), batch_size)


def test_handler_adapters(model, model_spec):
@@ -1440,24 +1443,28 @@ def test_handler_adapters(model, model_spec):
req["parameters"] = params
req["adapters"] = adapter
reqs.append(req)
LOGGER.info(f"reqs {reqs}")
for req in reqs:
res = send_json(req)
message = res.content.decode("utf-8")
LOGGER.info(f"res: {message}")
response_checker(res, message)
for stream in [False, True]:
req["stream"] = stream
LOGGER.info(f"req: {req}")
res = send_json(req)
message = res.content.decode("utf-8")
LOGGER.info(f"res: {message}")
response_checker(res, message)
# awscurl little benchmark phase
for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
for req in reqs:
req["parameters"]["max_new_tokens"] = seq_length
awscurl_run(reqs,
spec.get("tokenizer", None),
batch_size,
dataset=True)
for stream in [False, True]:
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
for req in reqs:
req["parameters"]["max_new_tokens"] = seq_length
req["stream"] = stream
awscurl_run(reqs,
spec.get("tokenizer", None),
batch_size,
dataset=True)
# Test removing and querying invalid/removed adapter
del_adapter = spec.get("adapters")[0]
res = requests.delete(
@@ -1497,17 +1504,20 @@ def test_handler_rolling_batch_chat(model, model_spec):
req["top_logprobs"] = 1
if "adapters" in spec:
req["adapters"] = spec.get("adapters")[0]
LOGGER.info(f"req {req}")
res = send_json(req)
LOGGER.info(f"res: {res.content}")
# awscurl little benchmark phase
for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
req["max_tokens"] = seq_length
awscurl_run(req, spec.get("tokenizer", None), batch_size)

for stream in [False, True]:
req["stream"] = stream
LOGGER.info(f"req {req}")
res = send_json(req)
LOGGER.info(f"res: {res.content}")
# awscurl little benchmark phase
for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
LOGGER.info(
f"Little benchmark: concurrency {batch_size} seq_len {seq_length}"
)
req["max_tokens"] = seq_length
awscurl_run(req, spec.get("tokenizer", None), batch_size)


def test_handler(model, model_spec):
@@ -1517,36 +1527,38 @@ def test_handler(model, model_spec):
check_worker_number(spec["worker"])
for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
if "t5" in model:
req = {"inputs": t5_batch_generation(batch_size)}
else:
req = {"inputs": batch_generation(batch_size)}
if spec.get("adapters", []):
req["adapters"] = spec.get("adapters")
params = {"max_new_tokens": seq_length}
if spec.get("details", False):
params["details"] = True
req["parameters"] = params
LOGGER.info(f"req {req}")
res = send_json(req)
if spec.get("stream_output", False):
LOGGER.info(f"res: {res.content}")
result = res.content.decode().split("\n")[:-1]
assert len(
result
) <= seq_length, "generated more tokens than max_new_tokens"
else:
res = res.json()
LOGGER.info(f"res {res}")
if isinstance(res, list):
result = [item['generated_text'] for item in res]
assert len(result) == batch_size
elif isinstance(res, dict):
assert 1 == batch_size
if "max_memory_per_gpu" in spec:
validate_memory_usage(spec["max_memory_per_gpu"][i])
if "tokenizer" in spec:
awscurl_run(req, spec.get("tokenizer"), batch_size)
for stream in [False, True]:
if "t5" in model:
req = {"inputs": t5_batch_generation(batch_size)}
else:
req = {"inputs": batch_generation(batch_size)}
if spec.get("adapters", []):
req["adapters"] = spec.get("adapters")
params = {"max_new_tokens": seq_length}
if spec.get("details", False):
params["details"] = True
req["parameters"] = params
req["stream"] = stream
LOGGER.info(f"req {req}")
res = send_json(req)
if stream:
LOGGER.info(f"res: {res.content}")
result = res.content.decode().split("\n")[:-1]
assert len(
result
) <= seq_length, "generated more tokens than max_new_tokens"
else:
res = res.json()
LOGGER.info(f"res {res}")
if isinstance(res, list):
result = [item['generated_text'] for item in res]
assert len(result) == batch_size
elif isinstance(res, dict):
assert 1 == batch_size
if "max_memory_per_gpu" in spec:
validate_memory_usage(spec["max_memory_per_gpu"][i])
if "tokenizer" in spec:
awscurl_run(req, spec.get("tokenizer"), batch_size)


def log_awscurl_benchmark(metric_name: str,
