Skip to content

Commit

Permalink
✨ Feature: Add features to support setting different rate limits for …
Browse files Browse the repository at this point in the history
…different models at the user level.
  • Loading branch information
yym68686 committed Nov 25, 2024
1 parent 89affc8 commit 8e3a5c1
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 5 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ api_keys:
# When SCHEDULING_ALGORITHM is random, use random polling load balancing, randomly request the channel of the model with a request.
# When SCHEDULING_ALGORITHM is round_robin, use polling load balancing, request the channel of the model used by the user in order.
AUTO_RETRY: true # Whether to automatically retry, automatically retry the next provider, true for automatic retry, false for no automatic retry, default is true. Also supports setting a number, indicating the number of retries.
RATE_LIMIT: 2/min # Supports rate limiting, maximum number of requests per minute, can be set to an integer, such as 2/min, 2 times per minute, 5/hour, 5 times per hour, 10/day, 10 times per day, 10/month, 10 times per month, 10/year, 10 times per year. Default is 60/min, optional
# RATE_LIMIT: 2/min,10/day # Supports multiple frequency constraints
rate_limit: 15/min # Supports rate limiting; each API key can make up to 15 requests per minute. Optional; the default is 999999/min. Multiple frequency constraints are supported: 15/min,10/day
# rate_limit: # You can set different frequency limits for each model
# gemini-1.5-pro: 3/min
# gemini-1.5-flash: 2/min
# default: 4/min # If a model has no frequency limit of its own, the `default` frequency limit is used
ENABLE_MODERATION: true # Whether to enable message moderation, true for enable, false for disable, default is false, when enabled, it will moderate the user's message, if inappropriate messages are found, an error message will be returned.

# Channel-level weighted load balancing configuration example
Expand Down
7 changes: 5 additions & 2 deletions README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ api_keys:
# 当 SCHEDULING_ALGORITHM 为 random 时,使用随机轮训负载均衡,随机请求拥有请求的模型的渠道。
# 当 SCHEDULING_ALGORITHM 为 round_robin 时,使用轮训负载均衡,按照顺序请求用户使用的模型的渠道。
AUTO_RETRY: true # 是否自动重试,自动重试下一个提供商,true 为自动重试,false 为不自动重试,默认为 true。也可以设置为数字,表示重试次数。
RATE_LIMIT: 2/min # 支持限流,每分钟最多请求次数,可以设置为整数,如 2/min,2 次每分钟、5/hour,5 次每小时、10/day,10 次每天,10/month,10 次每月,10/year,10 次每年。默认60/min,选填
# RATE_LIMIT: 2/min,10/day 支持多个频率约束条件
rate_limit: 15/min # 支持限流,每分钟最多请求次数,可以设置为整数,如 2/min,2 次每分钟、5/hour,5 次每小时、10/day,10 次每天,10/month,10 次每月,10/year,10 次每年。默认999999/min,选填。支持多个频率约束条件:15/min,10/day
# rate_limit: # 可以为每个模型设置不同的频率限制
# gemini-1.5-pro: 3/min
# gemini-1.5-flash: 2/min
# default: 4/min # 如果模型没有设置频率限制,使用 default 的频率限制
ENABLE_MODERATION: true # 是否开启消息道德审查,true 为开启,false 为不开启,默认为 false,当开启后,会对用户的消息进行道德审查,如果发现不当的消息,会返回错误信息。

# 渠道级加权负载均衡配置示例
Expand Down
19 changes: 19 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
error_handling_wrapper,
rate_limiter,
provider_api_circular_list,
ThreadSafeCircularList,
)

from collections import defaultdict
Expand Down Expand Up @@ -488,6 +489,15 @@ async def dispatch(self, request: Request, call_next):
model = request_model.model
current_info["model"] = model

final_api_key = app.state.api_list[api_index]
try:
await app.state.user_api_keys_rate_limit[final_api_key].next(model)
except Exception as e:
return JSONResponse(
status_code=429,
content={"error": "Too many requests"}
)

moderated_content = None
if request_model.request_type == "chat":
moderated_content = request_model.get_last_text_message()
Expand Down Expand Up @@ -666,6 +676,15 @@ async def ensure_config(request: Request, call_next):
# logger.warning("Config not found, attempting to reload")
app.state.config, app.state.api_keys_db, app.state.api_list = await load_config(app)

if app.state.api_list:
app.state.user_api_keys_rate_limit = defaultdict(ThreadSafeCircularList)
for api_index, api_key in enumerate(app.state.api_list):
app.state.user_api_keys_rate_limit[api_key] = ThreadSafeCircularList(
[api_key],
safe_get(app.state.config, 'api_keys', api_index, "preferences", "rate_limit", default={"default": "999999/min"}),
"round_robin"
)

for item in app.state.api_keys_db:
if item.get("role") == "admin":
app.state.admin_api_key = item.get("api")
Expand Down
2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async def get_user_rate_limit(app, api_index: int = None):
# 这里应该实现根据 token 获取用户速率限制的逻辑
# 示例: 返回 (次数, 秒数)
config = app.state.config
raw_rate_limit = safe_get(config, 'api_keys', api_index, "preferences", "RATE_LIMIT")
raw_rate_limit = safe_get(config, 'api_keys', api_index, "preferences", "rate_limit")
# print("raw_rate_limit", raw_rate_limit)
# print("not api_index or not raw_rate_limit", api_index == None, not raw_rate_limit, api_index == None or not raw_rate_limit, api_index, raw_rate_limit)

Expand Down

0 comments on commit 8e3a5c1

Please sign in to comment.