Fix exception handler for proxy server #2901

Merged · 13 commits · Dec 26, 2024
1 change: 1 addition & 0 deletions lmdeploy/cli/serve.py
@@ -238,6 +238,7 @@ def add_parser_proxy():
help='the strategy to dispatch requests to nodes')
ArgumentHelper.api_keys(parser)
ArgumentHelper.ssl(parser)
+        ArgumentHelper.log_level(parser)

@staticmethod
def gradio(args):
2 changes: 1 addition & 1 deletion lmdeploy/model.py
@@ -1921,5 +1921,5 @@ def best_match_model(query: str) -> Optional[str]:
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
-    logger.warn(f'Did not find a chat template matching {query}.')
+    logger.warning(f'Did not find a chat template matching {query}.')
return 'base'
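
Note: `Logger.warn` is a deprecated alias of `Logger.warning` in Python's standard logging module, which is what this change (and the similar ones below) addresses. A minimal, standalone check of that behaviour:

```python
import logging
import warnings

logger = logging.getLogger('demo')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    logger.warn('old-style call')        # deprecated alias
    print(caught[0].category)            # <class 'DeprecationWarning'>

logger.warning('preferred spelling')     # no deprecation warning
```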
4 changes: 2 additions & 2 deletions lmdeploy/serve/async_engine.py
@@ -505,8 +505,8 @@ async def generate(
if gen_config.stop_token_ids is None:
gen_config.stop_token_ids = self.stop_words
if not gen_config.do_sample:
-            logger.warn(f'GenerationConfig: {gen_config}')
-            logger.warn(
+            logger.warning(f'GenerationConfig: {gen_config}')
+            logger.warning(
'Since v0.6.0, lmdeploy add `do_sample` in '
'GenerationConfig. It defaults to False, meaning greedy '
'decoding. Please set `do_sample=True` if sampling '
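
The restored warning explains that, since v0.6.0, `do_sample` in `GenerationConfig` defaults to `False` (greedy decoding). A usage sketch of switching sampling back on; the model name and sampling values are illustrative only:

```python
from lmdeploy import GenerationConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat')      # any supported model path
gen_config = GenerationConfig(do_sample=True,        # enable sampling explicitly
                              top_p=0.8,
                              temperature=0.7)
response = pipe(['Hi, please introduce yourself'], gen_config=gen_config)
print(response[0].text)
```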
2 changes: 1 addition & 1 deletion lmdeploy/serve/proxy/constants.py
@@ -2,7 +2,7 @@

import enum

-LATENCY_DEEQUE_LEN = 15
+LATENCY_DEQUE_LEN = 15
API_TIMEOUT_LEN = 100


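
`LATENCY_DEQUE_LEN` (the typo `LATENCY_DEEQUE_LEN` is fixed here) bounds a `collections.deque`, so only the most recent latency samples are retained per node. A minimal sketch of that behaviour, independent of the proxy code:

```python
from collections import deque

LATENCY_DEQUE_LEN = 15  # same value as in constants.py

latency = deque(maxlen=LATENCY_DEQUE_LEN)
for sample_ms in range(100):
    latency.append(sample_ms)   # older samples are evicted automatically

print(len(latency))   # 15
print(latency[0])     # 85 -- only the newest 15 samples remain
```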
37 changes: 26 additions & 11 deletions lmdeploy/serve/proxy/proxy.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
import copy
import json
import os
@@ -18,15 +19,15 @@
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
+from requests.exceptions import RequestException

from lmdeploy.serve.openai.api_server import (check_api_key,
create_error_response)
from lmdeploy.serve.openai.protocol import ( # noqa: E501
ChatCompletionRequest, CompletionRequest, ModelCard, ModelList,
ModelPermission)
-from lmdeploy.serve.proxy.constants import (API_TIMEOUT_LEN,
-                                            LATENCY_DEEQUE_LEN, ErrorCodes,
-                                            Strategy, err_msg)
+from lmdeploy.serve.proxy.constants import (API_TIMEOUT_LEN, LATENCY_DEQUE_LEN,
+                                            ErrorCodes, Strategy, err_msg)
from lmdeploy.utils import get_logger

logger = get_logger('lmdeploy')
@@ -36,7 +37,7 @@ class Status(BaseModel):
"""Status protocol consists of models' information."""
models: Optional[List[str]] = Field(default=[], examples=[[]])
unfinished: int = 0
-    latency: Deque = Field(default=deque(maxlen=LATENCY_DEEQUE_LEN),
+    latency: Deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN),
examples=[[]])
speed: Optional[int] = Field(default=None, examples=[None])

@@ -87,6 +88,9 @@ def __init__(self,
with open(self.config_path, 'r') as config_file:
self.nodes = yaml.safe_load(config_file)['nodes']
for url, status in self.nodes.items():
+                latency = deque(status.get('latency', []),
+                                maxlen=LATENCY_DEQUE_LEN)
+                status['latency'] = latency
status = Status(**status)
self.nodes[url] = status
self.heart_beat_thread = threading.Thread(target=heart_beat_controller,
@@ -99,7 +103,7 @@ def update_config_file(self):
nodes = copy.deepcopy(self.nodes)
for url, status in nodes.items():
nodes[url] = status.model_dump()
-            nodes[url]['latency'] = list(status.latency)
+            nodes[url]['latency'] = list(status.latency)[-LATENCY_DEQUE_LEN:]
with open(self.config_path, 'w') as config_file: # update cfg yml
yaml.dump(dict(nodes=nodes), config_file)

@@ -251,7 +255,7 @@ def handle_unavailable_model(self, model_name):
Args:
model_name (str): the model in the request.
"""
-        logger.info(f'no model name: {model_name}')
+        logger.warning(f'no model name: {model_name}')
ret = {
'error_code': ErrorCodes.MODEL_NOT_FOUND,
'text': err_msg[ErrorCodes.MODEL_NOT_FOUND],
@@ -260,9 +264,9 @@

def handle_api_timeout(self, node_url):
"""Handle the api time out."""
-        logger.info(f'api timeout: {node_url}')
+        logger.warning(f'api timeout: {node_url}')
ret = {
-            'error_code': ErrorCodes.API_TIMEOUT,
+            'error_code': ErrorCodes.API_TIMEOUT.value,
'text': err_msg[ErrorCodes.API_TIMEOUT],
}
return json.dumps(ret).encode() + b'\n'
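
Switching to `ErrorCodes.API_TIMEOUT.value` matters because `json.dumps` cannot encode a plain `enum.Enum` member. A minimal illustration; the `ErrorCodes` class and its value below are assumed stand-ins, not the real constants:

```python
import enum
import json

class ErrorCodes(enum.Enum):      # assumed stand-in for the proxy's ErrorCodes
    API_TIMEOUT = 50104           # illustrative value only

try:
    json.dumps({'error_code': ErrorCodes.API_TIMEOUT})
except TypeError as exc:
    print(exc)                    # Object of type ErrorCodes is not JSON serializable

print(json.dumps({'error_code': ErrorCodes.API_TIMEOUT.value}))   # {"error_code": 50104}
```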
@@ -286,7 +290,8 @@ def stream_generate(self, request: Dict, node_url: str, node_path: str):
delimiter=b'\n'):
if chunk:
yield chunk + b'\n\n'
-        except requests.exceptions.RequestException as e:  # noqa
+        except (Exception, GeneratorExit, RequestException) as e:  # noqa
+            # exception happened, reduce unfinished num
yield self.handle_api_timeout(node_url)

async def generate(self, request: Dict, node_url: str, node_path: str):
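
The widened `except` clause in `stream_generate` also covers `GeneratorExit`, which is raised inside a streaming generator when the consumer disconnects and the generator is closed. A self-contained sketch of that pattern (not the proxy's actual networking code):

```python
def stream_chunks(chunks):
    """Yield chunks, but still run cleanup if the consumer goes away."""
    try:
        for chunk in chunks:
            yield chunk
    except GeneratorExit:
        # Raised at the suspended yield when .close() is called (client disconnect).
        print('consumer disconnected, cleaning up')
        raise
    except Exception as exc:
        # Upstream failure: emit an error payload instead of crashing the stream.
        print(f'stream failed: {exc}')
        yield b'{"error_code": 500}\n'

gen = stream_chunks([b'a', b'b', b'c'])
print(next(gen))   # b'a'
gen.close()        # triggers GeneratorExit inside the generator
```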
@@ -304,7 +309,7 @@ async def generate(self, request: Dict, node_url: str, node_path: str):
json=request,
timeout=API_TIMEOUT_LEN)
return response.text
-        except requests.exceptions.RequestException as e:  # noqa
+        except (Exception, GeneratorExit, RequestException, asyncio.CancelledError) as e:  # noqa # yapf: disable
return self.handle_api_timeout(node_url)

def pre_call(self, node_url):
@@ -381,7 +386,11 @@ def add_node(node: Node, raw_request: Request = None):
RPM or other metric. All the values of nodes should be the same metric.
"""
try:
-        node_manager.add(node.url, node.status)
+        res = node_manager.add(node.url, node.status)
+        if res is not None:
+            logger.error(f'add node {node.url} failed, {res}')
+            return res
+        logger.info(f'add node {node.url} successfully')
return 'Added successfully'
except: # noqa
return 'Failed to add, please check the input url.'
@@ -392,8 +401,10 @@ def remove_node(node_url: str):
"""Show available models."""
try:
node_manager.remove(node_url)
+        logger.info(f'delete node {node_url} successfully')
return 'Deleted successfully'
except: # noqa
+        logger.error(f'delete node {node_url} failed.')
return 'Failed to delete, please check the input url.'


@@ -439,6 +450,7 @@ async def chat_completions_v1(request: ChatCompletionRequest,
if not node_url:
return node_manager.handle_unavailable_model(request.model)

+    logger.info(f'A request is dispatched to {node_url}')
request_dict = request.model_dump()
start = node_manager.pre_call(node_url)
if request.stream is True:
@@ -497,6 +509,7 @@ async def completions_v1(request: CompletionRequest,
if not node_url:
return node_manager.handle_unavailable_model(request.model)

+    logger.info(f'A request is dispatched to {node_url}')
request_dict = request.model_dump()
start = node_manager.pre_call(node_url)
if request.stream is True:
@@ -517,6 +530,7 @@ def proxy(server_name: str = '0.0.0.0',
'min_observed_latency'] = 'min_expected_latency',
api_keys: Optional[Union[List[str], str]] = None,
ssl: bool = False,
+          log_level: str = 'INFO',
**kwargs):
"""To launch the proxy server.

@@ -540,6 +554,7 @@
if ssl:
ssl_keyfile = os.environ['SSL_KEYFILE']
ssl_certfile = os.environ['SSL_CERTFILE']
+    logger.setLevel(log_level)
uvicorn.run(app=app,
host=server_name,
port=server_port,
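
With `log_level` now threaded through both the CLI (`ArgumentHelper.log_level(parser)`) and the `proxy()` entry point, the proxy's verbosity can be chosen at launch. A hedged usage sketch calling the Python entry point directly; `server_port=8000` is just an example value:

```python
from lmdeploy.serve.proxy.proxy import proxy

# Launch the proxy with DEBUG logging; logger.setLevel(log_level) is applied
# right before uvicorn.run() in the diff above.
proxy(server_name='0.0.0.0', server_port=8000, log_level='DEBUG')
```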
9 changes: 5 additions & 4 deletions lmdeploy/turbomind/deploy/converter.py
@@ -129,16 +129,17 @@ def get_output_model_registered_name_and_config(model_path: str,
] else 'float16'
elif dtype in ['float16', 'bfloat16']:
if weight_type == 'int4':
-            logger.warn(f'The model {model_path} is a quantized model, so the '
-                        f'specified data type {dtype} is ignored')
+            logger.warning(
+                f'The model {model_path} is a quantized model, so the '
+                f'specified data type {dtype} is ignored')
else:
weight_type = dtype
else:
assert 0, f'unsupported specified data type {dtype}'

if weight_type == 'bfloat16' and not is_bf16_supported():
-        logger.warn('data type fallback to float16 since '
-                    'torch.cuda.is_bf16_supported is False')
+        logger.warning('data type fallback to float16 since '
+                       'torch.cuda.is_bf16_supported is False')
weight_type = 'float16'
config.model_config.model_arch = model_arch
config.model_config.weight_type = weight_type
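
The converter's bf16 fallback follows a common probe-and-downgrade pattern. A rough standalone sketch using `torch.cuda.is_bf16_supported()` (which the warning text itself references); the real `is_bf16_supported()` helper in lmdeploy may perform additional checks:

```python
import torch

def pick_weight_type(requested: str) -> str:
    """Fall back to float16 when bfloat16 is requested but unsupported."""
    if requested == 'bfloat16' and not (torch.cuda.is_available()
                                        and torch.cuda.is_bf16_supported()):
        print('data type fallback to float16 since '
              'torch.cuda.is_bf16_supported is False')
        return 'float16'
    return requested

print(pick_weight_type('bfloat16'))   # 'float16' on pre-Ampere GPUs or CPU-only hosts
```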