Fix exception handler for proxy server #2901

Merged · 13 commits · Dec 26, 2024
1 change: 1 addition & 0 deletions lmdeploy/cli/serve.py
@@ -238,6 +238,7 @@ def add_parser_proxy():
help='the strategy to dispatch requests to nodes')
ArgumentHelper.api_keys(parser)
ArgumentHelper.ssl(parser)
+        ArgumentHelper.log_level(parser)

@staticmethod
def gradio(args):
2 changes: 1 addition & 1 deletion lmdeploy/model.py
@@ -1921,5 +1921,5 @@ def best_match_model(query: str) -> Optional[str]:
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
-    logger.warn(f'Did not find a chat template matching {query}.')
+    logger.warning(f'Did not find a chat template matching {query}.')
return 'base'
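
Note: `Logger.warn` is a deprecated alias of `Logger.warning` in Python's standard logging module, which is what this change (and the similar ones below) addresses. A minimal, standalone check of that behaviour:

```python
import logging
import warnings

logger = logging.getLogger('demo')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    logger.warn('old-style call')        # deprecated alias
    print(caught[0].category)            # <class 'DeprecationWarning'>

logger.warning('preferred spelling')     # no deprecation warning
```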
4 changes: 2 additions & 2 deletions lmdeploy/serve/async_engine.py
@@ -505,8 +505,8 @@ async def generate(
if gen_config.stop_token_ids is None:
gen_config.stop_token_ids = self.stop_words
if not gen_config.do_sample:
-            logger.warn(f'GenerationConfig: {gen_config}')
-            logger.warn(
+            logger.warning(f'GenerationConfig: {gen_config}')
+            logger.warning(
'Since v0.6.0, lmdeploy add `do_sample` in '
'GenerationConfig. It defaults to False, meaning greedy '
'decoding. Please set `do_sample=True` if sampling '
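
The restored warning explains that, since v0.6.0, `do_sample` in `GenerationConfig` defaults to `False` (greedy decoding). A usage sketch of switching sampling back on; the model name and sampling values are illustrative only:

```python
from lmdeploy import GenerationConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat')      # any supported model path
gen_config = GenerationConfig(do_sample=True,        # enable sampling explicitly
                              top_p=0.8,
                              temperature=0.7)
response = pipe(['Hi, please introduce yourself'], gen_config=gen_config)
print(response[0].text)
```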
2 changes: 1 addition & 1 deletion lmdeploy/serve/proxy/constants.py
@@ -2,7 +2,7 @@

import enum

-LATENCY_DEEQUE_LEN = 15
+LATENCY_DEQUE_LEN = 15
API_TIMEOUT_LEN = 100


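
`LATENCY_DEQUE_LEN` (the typo `LATENCY_DEEQUE_LEN` is fixed here) bounds a `collections.deque`, so only the most recent latency samples are retained per node. A minimal sketch of that behaviour, independent of the proxy code:

```python
from collections import deque

LATENCY_DEQUE_LEN = 15  # same value as in constants.py

latency = deque(maxlen=LATENCY_DEQUE_LEN)
for sample_ms in range(100):
    latency.append(sample_ms)   # older samples are evicted automatically

print(len(latency))   # 15
print(latency[0])     # 85 -- only the newest 15 samples remain
```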
37 changes: 26 additions & 11 deletions lmdeploy/serve/proxy/proxy.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
import copy
import json
import os
@@ -18,15 +19,15 @@
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
+from requests.exceptions import RequestException

from lmdeploy.serve.openai.api_server import (check_api_key,
create_error_response)
from lmdeploy.serve.openai.protocol import ( # noqa: E501
ChatCompletionRequest, CompletionRequest, ModelCard, ModelList,
ModelPermission)
-from lmdeploy.serve.proxy.constants import (API_TIMEOUT_LEN,
-                                            LATENCY_DEEQUE_LEN, ErrorCodes,
-                                            Strategy, err_msg)
+from lmdeploy.serve.proxy.constants import (API_TIMEOUT_LEN, LATENCY_DEQUE_LEN,
+                                            ErrorCodes, Strategy, err_msg)
from lmdeploy.utils import get_logger

logger = get_logger('lmdeploy')
@@ -36,7 +37,7 @@ class Status(BaseModel):
"""Status protocol consists of models' information."""
models: Optional[List[str]] = Field(default=[], examples=[[]])
unfinished: int = 0
-    latency: Deque = Field(default=deque(maxlen=LATENCY_DEEQUE_LEN),
+    latency: Deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN),
examples=[[]])
speed: Optional[int] = Field(default=None, examples=[None])

@@ -87,6 +88,9 @@ def __init__(self,
with open(self.config_path, 'r') as config_file:
self.nodes = yaml.safe_load(config_file)['nodes']
for url, status in self.nodes.items():
+                latency = deque(status.get('latency', []),
+                                maxlen=LATENCY_DEQUE_LEN)
+                status['latency'] = latency
status = Status(**status)
self.nodes[url] = status
self.heart_beat_thread = threading.Thread(target=heart_beat_controller,
@@ -99,7 +103,7 @@ def update_config_file(self):
nodes = copy.deepcopy(self.nodes)
for url, status in nodes.items():
nodes[url] = status.model_dump()
-            nodes[url]['latency'] = list(status.latency)
+            nodes[url]['latency'] = list(status.latency)[-LATENCY_DEQUE_LEN:]
with open(self.config_path, 'w') as config_file: # update cfg yml
yaml.dump(dict(nodes=nodes), config_file)

@@ -251,7 +255,7 @@ def handle_unavailable_model(self, model_name):
Args:
model_name (str): the model in the request.
"""
-        logger.info(f'no model name: {model_name}')
+        logger.warning(f'no model name: {model_name}')
ret = {
'error_code': ErrorCodes.MODEL_NOT_FOUND,
'text': err_msg[ErrorCodes.MODEL_NOT_FOUND],
@@ -260,9 +264,9 @@

def handle_api_timeout(self, node_url):
"""Handle the api time out."""
-        logger.info(f'api timeout: {node_url}')
+        logger.warning(f'api timeout: {node_url}')
ret = {
-            'error_code': ErrorCodes.API_TIMEOUT,
+            'error_code': ErrorCodes.API_TIMEOUT.value,
'text': err_msg[ErrorCodes.API_TIMEOUT],
}
return json.dumps(ret).encode() + b'\n'
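
Switching to `ErrorCodes.API_TIMEOUT.value` matters because `json.dumps` cannot encode a plain `enum.Enum` member. A minimal illustration; the `ErrorCodes` class and its value below are assumed stand-ins, not the real constants:

```python
import enum
import json

class ErrorCodes(enum.Enum):      # assumed stand-in for the proxy's ErrorCodes
    API_TIMEOUT = 50104           # illustrative value only

try:
    json.dumps({'error_code': ErrorCodes.API_TIMEOUT})
except TypeError as exc:
    print(exc)                    # Object of type ErrorCodes is not JSON serializable

print(json.dumps({'error_code': ErrorCodes.API_TIMEOUT.value}))   # {"error_code": 50104}
```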
@@ -286,7 +290,8 @@ def stream_generate(self, request: Dict, node_url: str, node_path: str):
delimiter=b'\n'):
if chunk:
yield chunk + b'\n\n'
-        except requests.exceptions.RequestException as e:  # noqa
+        except (Exception, GeneratorExit, RequestException) as e:  # noqa
+            # exception happened, reduce unfinished num
yield self.handle_api_timeout(node_url)

async def generate(self, request: Dict, node_url: str, node_path: str):
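
The widened `except` clause in `stream_generate` also covers `GeneratorExit`, which is raised inside a streaming generator when the consumer disconnects and the generator is closed. A self-contained sketch of that pattern (not the proxy's actual networking code):

```python
def stream_chunks(chunks):
    """Yield chunks, but still run cleanup if the consumer goes away."""
    try:
        for chunk in chunks:
            yield chunk
    except GeneratorExit:
        # Raised at the suspended yield when .close() is called (client disconnect).
        print('consumer disconnected, cleaning up')
        raise
    except Exception as exc:
        # Upstream failure: emit an error payload instead of crashing the stream.
        print(f'stream failed: {exc}')
        yield b'{"error_code": 500}\n'

gen = stream_chunks([b'a', b'b', b'c'])
print(next(gen))   # b'a'
gen.close()        # triggers GeneratorExit inside the generator
```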
@@ -304,7 +309,7 @@ async def generate(self, request: Dict, node_url: str, node_path: str):
json=request,
timeout=API_TIMEOUT_LEN)
return response.text
-        except requests.exceptions.RequestException as e:  # noqa
+        except (Exception, GeneratorExit, RequestException, asyncio.CancelledError) as e:  # noqa # yapf: disable
return self.handle_api_timeout(node_url)

def pre_call(self, node_url):
@@ -381,7 +386,11 @@ def add_node(node: Node, raw_request: Request = None):
RPM or other metric. All the values of nodes should be the same metric.
"""
try:
-        node_manager.add(node.url, node.status)
+        res = node_manager.add(node.url, node.status)
+        if res is not None:
+            logger.error(f'add node {node.url} failed, {res}')
+            return res
+        logger.info(f'add node {node.url} successfully')
return 'Added successfully'
except: # noqa
return 'Failed to add, please check the input url.'
@@ -392,8 +401,10 @@ def remove_node(node_url: str):
"""Show available models."""
try:
node_manager.remove(node_url)
+        logger.info(f'delete node {node_url} successfully')
return 'Deleted successfully'
except: # noqa
+        logger.error(f'delete node {node_url} failed.')
return 'Failed to delete, please check the input url.'


@@ -439,6 +450,7 @@ async def chat_completions_v1(request: ChatCompletionRequest,
if not node_url:
return node_manager.handle_unavailable_model(request.model)

+    logger.info(f'A request is dispatched to {node_url}')
request_dict = request.model_dump()
start = node_manager.pre_call(node_url)
if request.stream is True:
@@ -497,6 +509,7 @@ async def completions_v1(request: CompletionRequest,
if not node_url:
return node_manager.handle_unavailable_model(request.model)

+    logger.info(f'A request is dispatched to {node_url}')
request_dict = request.model_dump()
start = node_manager.pre_call(node_url)
if request.stream is True:
@@ -517,6 +530,7 @@ def proxy(server_name: str = '0.0.0.0',
'min_observed_latency'] = 'min_expected_latency',
api_keys: Optional[Union[List[str], str]] = None,
ssl: bool = False,
+          log_level: str = 'INFO',
**kwargs):
"""To launch the proxy server.

@@ -540,6 +554,7 @@
if ssl:
ssl_keyfile = os.environ['SSL_KEYFILE']
ssl_certfile = os.environ['SSL_CERTFILE']
+    logger.setLevel(log_level)
uvicorn.run(app=app,
host=server_name,
port=server_port,
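
With `log_level` now threaded through both the CLI (`ArgumentHelper.log_level(parser)`) and the `proxy()` entry point, the proxy's verbosity can be chosen at launch. A hedged usage sketch calling the Python entry point directly; `server_port=8000` is just an example value:

```python
from lmdeploy.serve.proxy.proxy import proxy

# Launch the proxy with DEBUG logging; logger.setLevel(log_level) is applied
# right before uvicorn.run() in the diff above.
proxy(server_name='0.0.0.0', server_port=8000, log_level='DEBUG')
```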
9 changes: 5 additions & 4 deletions lmdeploy/turbomind/deploy/converter.py
@@ -129,16 +129,17 @@ def get_output_model_registered_name_and_config(model_path: str,
] else 'float16'
elif dtype in ['float16', 'bfloat16']:
if weight_type == 'int4':
-            logger.warn(f'The model {model_path} is a quantized model, so the '
-                        f'specified data type {dtype} is ignored')
+            logger.warning(
+                f'The model {model_path} is a quantized model, so the '
+                f'specified data type {dtype} is ignored')
else:
weight_type = dtype
else:
assert 0, f'unsupported specified data type {dtype}'

if weight_type == 'bfloat16' and not is_bf16_supported():
-        logger.warn('data type fallback to float16 since '
-                    'torch.cuda.is_bf16_supported is False')
+        logger.warning('data type fallback to float16 since '
+                       'torch.cuda.is_bf16_supported is False')
weight_type = 'float16'
config.model_config.model_arch = model_arch
config.model_config.weight_type = weight_type
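
The converter's bf16 fallback follows a common probe-and-downgrade pattern. A rough standalone sketch using `torch.cuda.is_bf16_supported()` (which the warning text itself references); the real `is_bf16_supported()` helper in lmdeploy may perform additional checks:

```python
import torch

def pick_weight_type(requested: str) -> str:
    """Fall back to float16 when bfloat16 is requested but unsupported."""
    if requested == 'bfloat16' and not (torch.cuda.is_available()
                                        and torch.cuda.is_bf16_supported()):
        print('data type fallback to float16 since '
              'torch.cuda.is_bf16_supported is False')
        return 'float16'
    return requested

print(pick_weight_type('bfloat16'))   # 'float16' on pre-Ampere GPUs or CPU-only hosts
```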