From 3e607e9f7cbfac4bdd7db602c6a7aadfabc4b56e Mon Sep 17 00:00:00 2001
From: Jensen Zhang <jingxuan.n.zhang@gmail.com>
Date: Thu, 12 Mar 2026 05:49:58 +0000
Subject: [PATCH 1/3] feat(token): add consumed mode for quota management

- Add 'consumed_mode_enabled' config option (opt-in, default false)
- Track local consumption separately from API quota
- consumed increases in both modes; quota deduction differs by mode
- Prefer tokens with lower consumed amount in pool selection
- UI dynamically switches between 'remaining quota' and 'consumed' display
- Disable quota editing in edit modal when consumed mode is enabled
- Maintain backward compatibility with existing configs
---
 _public/static/admin/js/config.js     |   3 +-
 _public/static/admin/js/token.js      |  76 ++++++++++++++--
 _public/static/admin/pages/token.html |   2 +-
 _public/static/i18n/locales/en.json   |   3 +
 _public/static/i18n/locales/zh.json   |   3 +
 app/api/v1/admin/token.py             |   9 +-
 app/services/token/manager.py         |  34 ++++++-
 app/services/token/models.py          |  89 ++++++++++++++++---
 app/services/token/pool.py            | 123 +++++++++++++++++++-------
 config.defaults.toml                  |   2 +
 10 files changed, 290 insertions(+), 54 deletions(-)

diff --git a/_public/static/admin/js/config.js b/_public/static/admin/js/config.js
index 84c7111e6..28589b385 100644
--- a/_public/static/admin/js/config.js
+++ b/_public/static/admin/js/config.js
@@ -150,7 +150,8 @@ const LOCALE_MAP = {
     "fail_threshold": { title: "失败阈值", desc: "单个 Token 连续失败多少次后被标记为不可用。" },
     "save_delay_ms": { title: "保存延迟", desc: "Token 变更合并写入的延迟（毫秒）。" },
     "usage_flush_interval_sec": { title: "用量落库间隔", desc: "用量类字段写入数据库的最小间隔（秒）。" },
-    "reload_interval_sec": { title: "同步间隔", desc: "多 worker 场景下 Token 状态刷新间隔（秒）。" }
+    "reload_interval_sec": { title: "同步间隔", desc: "多 worker 场景下 Token 状态刷新间隔（秒）。" },
+    "consumed_mode_enabled": { title: "启用消耗模式", desc: "启用新额度管理逻辑：使用本地消耗记录而非 API 返回值，支持更均衡的负载分配。（试验性功能，默认关闭）" }
   },
 
 
diff --git a/_public/static/admin/js/token.js b/_public/static/admin/js/token.js
index abe2a7e31..fa47f7af9 100644
--- a/_public/static/admin/js/token.js
+++ b/_public/static/admin/js/token.js
@@ -1,4 +1,5 @@
 let apiKey = '';
+let consumedModeEnabled = false;
 let allTokens = {};
 let flatTokens = [];
 let isBatchProcessing = false;
@@ -123,9 +124,11 @@ async function loadData() {
     });
     if (res.ok) {
       const data = await res.json();
-      allTokens = data;
-      processTokens(data);
-      updateStats(data);
+      allTokens = data.tokens;
+      consumedModeEnabled = data.consumed_mode_enabled || false;
+      updateQuotaHeader();
+      processTokens(data.tokens);
+      updateStats(data.tokens);
       renderTable();
     } else if (res.status === 401) {
       logout();
@@ -151,6 +154,7 @@ function processTokens(data) {
             token: t.token,
             status: t.status || 'active',
             quota: t.quota || 0,
+            consumed: t.consumed || 0,
             note: t.note || '',
             fail_count: t.fail_count || 0,
             use_count: t.use_count || 0,
@@ -168,6 +172,19 @@ function processTokens(data) {
   });
 }
 
+/**
+ * Point the quota column header (#th-quota) at the label for the active mode:
+ * the "consumed" key when consumedModeEnabled is set, the "quota" key otherwise.
+ * The data-i18n attribute is updated together with the visible text.
+ */
+function updateQuotaHeader() {
+  const header = document.getElementById('th-quota');
+  if (!header) return;
+  const labelKey = consumedModeEnabled ? 'token.tableQuotaConsumed' : 'token.tableQuota';
+  header.textContent = t(labelKey);
+  header.dataset.i18n = labelKey;
+}
+
 function updateStats(data) {
   // Logic same as before, simplified reuse if possible, but let's re-run on flatTokens
   let totalTokens = flatTokens.length;
@@ -197,14 +214,31 @@ function updateStats(data) {
   });
 
   const imageQuota = Math.floor(chatQuota / 2);
+  const totalConsumed = flatTokens.reduce((sum, item) => sum + (item.consumed || 0), 0);
 
+  // Token-count cards are identical in both modes
   setText('stat-total', totalTokens.toLocaleString());
   setText('stat-active', activeTokens.toLocaleString());
   setText('stat-cooling', coolingTokens.toLocaleString());
   setText('stat-invalid', invalidTokens.toLocaleString());
 
-  setText('stat-chat-quota', chatQuota.toLocaleString());
-  setText('stat-image-quota', imageQuota.toLocaleString());
+  // Quota cards show consumed totals in consumed mode, remaining quota otherwise.
+  // The captions are set in both branches so that a mode change between reloads
+  // does not leave "consumed" captions over remaining-quota numbers.
+  const chatLabel = document.querySelector('[data-i18n="token.statChatQuota"]');
+  const imageLabel = document.querySelector('[data-i18n="token.statImageQuota"]');
+  if (consumedModeEnabled) {
+    setText('stat-chat-quota', totalConsumed.toLocaleString());
+    setText('stat-image-quota', Math.floor(totalConsumed / 2).toLocaleString());
+    if (chatLabel) chatLabel.textContent = t('token.statChatConsumed');
+    if (imageLabel) imageLabel.textContent = t('token.statImageConsumed');
+  } else {
+    setText('stat-chat-quota', chatQuota.toLocaleString());
+    setText('stat-image-quota', imageQuota.toLocaleString());
+    if (chatLabel) chatLabel.textContent = t('token.statChatQuota');
+    if (imageLabel) imageLabel.textContent = t('token.statImageQuota');
+  }
+
   setText('stat-total-calls', totalCalls.toLocaleString());
 
   updateTabCounts({
@@ -293,7 +323,16 @@ function renderTable() {
     // Quota (Center)
     const tdQuota = document.createElement('td');
     tdQuota.className = 'text-center font-mono text-xs';
-    tdQuota.innerText = item.quota;
+    // 根据配置决定显示消耗还是剩余
+    if (consumedModeEnabled) {
+      tdQuota.innerText = item.consumed;
+      tdQuota.title = t('token.tableQuotaConsumed');
+    } else {
+      tdQuota.innerText = item.quota;
+      tdQuota.title = t('token.tableQuota');
+    }
+
+
 
     // Note (Left)
     const tdNote = document.createElement('td');
@@ -503,6 +542,22 @@ function openEditModal(index) {
     byId('edit-pool').value = item.pool;
     byId('edit-quota').value = item.quota;
     byId('edit-note').value = item.note;
+
+    // Quota is read-only while consumed mode is enabled; otherwise it stays editable.
+    // NOTE(review): the label switches to "consumed" but the field still shows item.quota — confirm intended.
+    const quotaInput = byId('edit-quota');
+    const quotaInputParent = quotaInput?.closest('div');
+    const quotaLabel = quotaInputParent?.previousElementSibling;
+    if (consumedModeEnabled) {
+      quotaInput.disabled = true;
+      quotaInput.classList.add('bg-gray-100', 'text-gray-400');
+      if (quotaLabel) quotaLabel.textContent = t('token.tableQuotaConsumed');
+    } else {
+      quotaInput.disabled = false;
+      quotaInput.classList.remove('bg-gray-100', 'text-gray-400');
+      if (quotaLabel) quotaLabel.textContent = t('token.editQuota');
+    }
+
     document.querySelector('#edit-modal h3').innerText = t('token.editTitle');
   } else {
     // New Token
@@ -518,6 +574,14 @@ function openEditModal(index) {
     byId('edit-quota').value = getDefaultQuotaForPool('ssoBasic');
     byId('edit-note').value = '';
     document.querySelector('#edit-modal h3').innerText = t('token.addTitle');
+
+    // 新建 Token 时启用 quota 编辑
+    const newQuotaInput = byId('edit-quota');
+    const newQuotaInputParent = newQuotaInput?.closest('div');
+    const newQuotaLabel = newQuotaInputParent?.previousElementSibling;
+    newQuotaInput.disabled = false;
+    newQuotaInput.classList.remove('bg-gray-100', 'text-gray-400');
+    if (newQuotaLabel) newQuotaLabel.textContent = t('token.editQuota');
   }
 
   openModal('edit-modal');
diff --git a/_public/static/admin/pages/token.html b/_public/static/admin/pages/token.html
index c4843d801..50f321d04 100644
--- a/_public/static/admin/pages/token.html
+++ b/_public/static/admin/pages/token.html
@@ -167,7 +167,7 @@ <h2 class="text-2xl font-semibold tracking-tight" data-i18n="token.title">Token
                 <th class="w-56 text-left" data-i18n="token.tableToken">Token</th>
                 <th class="w-24" data-i18n="token.tableType">类型</th>
                 <th class="w-24" data-i18n="token.tableStatus">状态</th>
-                <th class="w-20" data-i18n="token.tableQuota">额度</th>
+                <th class="w-20" id="th-quota" data-i18n="token.tableQuota">额度</th>
                 <th class="text-left" data-i18n="token.tableNote">备注</th>
                 <th class="w-44 text-center" data-i18n="token.tableActions">操作</th>
               </tr>
diff --git a/_public/static/i18n/locales/en.json b/_public/static/i18n/locales/en.json
index d417fc3f2..77eb65003 100644
--- a/_public/static/i18n/locales/en.json
+++ b/_public/static/i18n/locales/en.json
@@ -244,6 +244,8 @@
     "statInvalid": "Invalid",
     "statChatQuota": "Chat Remaining",
     "statImageQuota": "Image Remaining",
+    "statChatConsumed": "Chat Consumed",
+    "statImageConsumed": "Image Consumed",
     "statVideoQuota": "Video Remaining",
     "statVideoUnavailable": "N/A",
     "statTotalCalls": "Total Calls",
@@ -258,6 +260,7 @@
     "tableType": "Type",
     "tableStatus": "Status",
     "tableQuota": "Quota",
+    "tableQuotaConsumed": "Consumed",
     "tableNote": "Note",
     "tableActions": "Actions",
     "refreshStatus": "Refresh status",
diff --git a/_public/static/i18n/locales/zh.json b/_public/static/i18n/locales/zh.json
index 1cd4df3e4..00a5f7b3a 100644
--- a/_public/static/i18n/locales/zh.json
+++ b/_public/static/i18n/locales/zh.json
@@ -244,6 +244,8 @@
     "statInvalid": "Token 失效",
     "statChatQuota": "Chat 剩余",
     "statImageQuota": "Image 剩余",
+    "statChatConsumed": "Chat 已消耗",
+    "statImageConsumed": "Image 已消耗",
     "statVideoQuota": "Video 剩余",
     "statVideoUnavailable": "无法统计",
     "statTotalCalls": "总调用次数",
@@ -258,6 +260,7 @@
     "tableType": "类型",
     "tableStatus": "状态",
     "tableQuota": "额度",
+    "tableQuotaConsumed": "已消耗",
     "tableNote": "备注",
     "tableActions": "操作",
     "refreshStatus": "刷新状态",
diff --git a/app/api/v1/admin/token.py b/app/api/v1/admin/token.py
index d417ee887..ebe914052 100644
--- a/app/api/v1/admin/token.py
+++ b/app/api/v1/admin/token.py
@@ -48,7 +48,14 @@ async def get_tokens():
     """获取所有 Token"""
     storage = get_storage()
     tokens = await storage.load_tokens()
-    return tokens or {}
+    # 获取消耗模式配置
+    from app.core.config import get_config
+    consumed_mode = get_config("token.consumed_mode_enabled", False)
+    
+    return {
+        "tokens": tokens or {},
+        "consumed_mode_enabled": consumed_mode
+    }
 
 
 @router.post("/tokens", dependencies=[Depends(verify_app_key)])
diff --git a/app/services/token/manager.py b/app/services/token/manager.py
index eb668edb6..3fa9bbe98 100644
--- a/app/services/token/manager.py
+++ b/app/services/token/manager.py
@@ -471,7 +471,18 @@ async def consume(
             token = pool.get(raw_token)
             if token:
                 old_status = token.status
-                consumed = token.consume(effort)
+                # 检查是否启用消耗模式
+                consumed_mode = False
+                try:
+                    from app.core.config import get_config
+                    consumed_mode = get_config("token.consumed_mode_enabled", False)
+                except Exception:
+                    pass
+
+                if consumed_mode:
+                    consumed = token.consume_with_consumed(effort)
+                else:
+                    consumed = token.consume(effort)
                 logger.debug(
                     f"Token {raw_token[:10]}...: consumed {consumed} quota, use_count={token.use_count}"
                 )
@@ -936,6 +947,25 @@ async def _refresh_one(item: tuple[str, TokenInfo]) -> dict:
                     old_status = token_info.status
 
-                    token_info.update_quota(new_quota)
+                    # Pick the quota-sync path by mode; the sync call now lives only inside the branches below
+                    consumed_mode = False
+                    try:
+                        from app.core.config import get_config
+                        consumed_mode = get_config("token.consumed_mode_enabled", False)
+                    except Exception:
+                        pass
+
+                    if consumed_mode:
+                        # Consumed 模式：使用新逻辑
+                        token_info.update_quota_with_consumed(new_quota)
+                        # 刷新成功后重置消耗记录
+                        if new_quota > 0:
+                            token_info.consumed = 0
+                    else:
+                        # 默认模式：使用旧逻辑
+                        token_info.update_quota(new_quota)
+                        # 刷新成功后如果 quota > 0，清除冷却状态
+                        if new_quota > 0:
+                            token_info.status = TokenStatus.ACTIVE
                     token_info.mark_synced()
 
                     window_size = self._extract_window_size_seconds(result)
diff --git a/app/services/token/models.py b/app/services/token/models.py
index 0e8b3b5f7..f256586be 100644
--- a/app/services/token/models.py
+++ b/app/services/token/models.py
@@ -51,6 +51,10 @@ class TokenInfo(BaseModel):
     status: TokenStatus = TokenStatus.ACTIVE
     quota: int = BASIC__DEFAULT_QUOTA
 
+    # 消耗记录（本地累加，不依赖 API 返回值）
+    # 仅在 consumed_mode_enabled=true 时使用
+    consumed: int = 0
+
     # 统计
     created_at: int = Field(
         default_factory=lambda: int(datetime.now().timestamp() * 1000)
@@ -106,29 +110,33 @@ def _normalize_token(cls, value):
         return token
 
     def is_available(self) -> bool:
-        """检查是否可用（状态正常且配额 > 0）"""
+        """检查是否可用（状态正常且未达到冷却阈值）"""
+        # 兼容旧数据：没有 consumed 字段时回退到 quota 判断
+        if self.consumed > 0:
+            return self.status == TokenStatus.ACTIVE
         return self.status == TokenStatus.ACTIVE and self.quota > 0
 
     def consume(self, effort: EffortType = EffortType.LOW) -> int:
         """
-        消耗配额
+        消耗配额（默认：扣减 quota）
 
         Args:
-            effort: LOW 扣 1 配额并计 1 次，HIGH 扣 4 配额并计 4 次
+            effort: LOW 计 1 次，HIGH 计 4 次
 
         Returns:
             实际扣除的配额
         """
         cost = EFFORT_COST[effort]
+
+        # 默认行为：扣减 quota
         actual_cost = min(cost, self.quota)
 
         self.last_used_at = int(datetime.now().timestamp() * 1000)
-        self.use_count += actual_cost  # 使用 actual_cost 避免配额不足时过度计数
+        self.consumed += cost  # 无论是否开启消耗模式，都记录消耗
+        self.use_count += actual_cost
         self.quota = max(0, self.quota - actual_cost)
 
-        # 注意：不在这里清零 fail_count，只有 record_success() 才清零
-        # 这样可以避免失败后调用 consume 导致失败计数被重置
-
+        # 默认行为：quota 耗尽时标记冷却
         if self.quota == 0:
             self.status = TokenStatus.COOLING
         elif self.status == TokenStatus.COOLING:
@@ -137,9 +145,34 @@ def consume(self, effort: EffortType = EffortType.LOW) -> int:
 
         return actual_cost
 
+    def consume_with_consumed(self, effort: EffortType = EffortType.LOW) -> int:
+        """
+        Record one use in consumed mode: add the effort cost to `consumed`; `quota` is not touched.
+
+        manager.py selects this method when token.consumed_mode_enabled is true.
+
+        Args:
+            effort: effort level of the request; its cost is looked up in EFFORT_COST
+
+        Returns:
+            The cost added to `consumed` (use_count itself grows by exactly 1 per call).
+        """
+        cost = EFFORT_COST[effort]
+
+        self.consumed += cost  # accumulate the locally tracked consumption
+        self.last_used_at = int(datetime.now().timestamp() * 1000)
+        self.use_count += 1
+
+        # No quota-based cooling here. NOTE(review): any call reactivates a COOLING token — confirm intended
+        if self.status == TokenStatus.COOLING:
+            # Only COOLING is flipped back to ACTIVE; other statuses (e.g. EXPIRED) are left alone
+            self.status = TokenStatus.ACTIVE
+
+        return cost
+
     def update_quota(self, new_quota: int):
         """
-        更新配额（用于 API 同步）
+        更新配额（用于 API 同步 - 默认模式）
 
         Args:
             new_quota: 新的配额值
@@ -154,6 +187,19 @@ def update_quota(self, new_quota: int):
         ]:
             self.status = TokenStatus.ACTIVE
 
+    def update_quota_with_consumed(self, new_quota: int) -> None:
+        """
+        Store an API-synced quota value for consumed mode.
+
+        manager.py calls this on the consumed-mode branch of the quota refresh.
+
+        Args:
+            new_quota: quota value to store; negative values are clamped to 0
+        """
+        self.quota = max(0, new_quota)
+
+        # Status is left unchanged here; per the design note, cooling comes from rate-limit checks or 429s
+
     def reset(self, default_quota: Optional[int] = None):
         """重置配额到默认值"""
         quota = BASIC__DEFAULT_QUOTA if default_quota is None else default_quota
@@ -161,6 +207,8 @@ def reset(self, default_quota: Optional[int] = None):
         self.status = TokenStatus.ACTIVE
         self.fail_count = 0
         self.last_fail_reason = None
+        # 重置消耗记录
+        self.consumed = 0
 
     def record_fail(
         self,
@@ -182,7 +230,7 @@ def record_fail(
             self.status = TokenStatus.EXPIRED
 
     def record_success(self, is_usage: bool = True):
-        """记录成功，清空失败计数并根据配额更新状态"""
+        """记录成功，清空失败计数"""
         self.fail_count = 0
         self.last_fail_at = None
         self.last_fail_reason = None
@@ -191,11 +239,6 @@ def record_success(self, is_usage: bool = True):
             self.use_count += 1
             self.last_used_at = int(datetime.now().timestamp() * 1000)
 
-        if self.quota == 0:
-            self.status = TokenStatus.COOLING
-        else:
-            self.status = TokenStatus.ACTIVE
-
     def need_refresh(self, interval_hours: int = 8) -> bool:
         """检查是否需要刷新配额"""
         if self.status != TokenStatus.COOLING:
@@ -212,6 +255,22 @@ def mark_synced(self):
         """标记已同步"""
         self.last_sync_at = int(datetime.now().timestamp() * 1000)
 
+    def should_cool_down(self, remaining_tokens: int, threshold: int = 10) -> bool:
+        """
+        Put the token into COOLING when the reported remaining quota is at or below the threshold.
+
+        Args:
+            remaining_tokens: remaining quota reported by the API (per the original note: the Rate Limits response)
+            threshold: cooling threshold, default 10 (the comparison is inclusive)
+
+        Returns:
+            True if this call set the status to COOLING, False if nothing was changed.
+        """
+        if remaining_tokens <= threshold:
+            self.status = TokenStatus.COOLING
+            return True
+        return False
+
 
 class TokenPoolStats(BaseModel):
     """Token 池统计"""
@@ -223,6 +282,8 @@ class TokenPoolStats(BaseModel):
     cooling: int = 0
     total_quota: int = 0
     avg_quota: float = 0.0
+    total_consumed: int = 0
+    avg_consumed: float = 0.0
 
 
 __all__ = [
diff --git a/app/services/token/pool.py b/app/services/token/pool.py
index ec43c75fd..23d87a43c 100644
--- a/app/services/token/pool.py
+++ b/app/services/token/pool.py
@@ -4,6 +4,7 @@
 from typing import Dict, List, Optional, Iterator, Set
 
 from app.services.token.models import TokenInfo, TokenStatus, TokenPoolStats
+from app.core.config import get_config
 
 
 class TokenPool:
@@ -28,43 +29,103 @@ def get(self, token_str: str) -> Optional[TokenInfo]:
         """获取 Token"""
         return self._tokens.get(token_str)
 
-    def select(self, exclude: set = None, prefer_tags: Optional[Set[str]] = None) -> Optional[TokenInfo]:
+    def _is_consumed_mode(self) -> bool:
+        """Return the token.consumed_mode_enabled config value; False if the lookup raises."""
+        try:
+            return get_config("token.consumed_mode_enabled", False)
+        except Exception:
+            return False
+
+    def select(
+        self, exclude: set = None, prefer_tags: Optional[Set[str]] = None
+    ) -> Optional[TokenInfo]:
         """
         选择一个可用 Token
-        策略:
-        1. 选择 active 状态且有配额的 token
-        2. 优先选择剩余额度最多的
-        3. 如果额度相同，随机选择（避免并发冲突）
+
+        默认模式（consumed_mode_enabled=false）:
+            1. 选择 active 状态且 quota > 0 的 token
+            2. 优先选择剩余额度最多的
+            3. 如果额度相同，随机选择
+
+        Consumed 模式（consumed_mode_enabled=true）:
+            1. 选择 active 状态的 token
+            2. 优先选择消耗次数（consumed）最少的
+            3. 如果 consumed 相同，随机选择
 
         Args:
             exclude: 需要排除的 token 字符串集合
             prefer_tags: 优先选择包含这些 tag 的 token（若存在则仅在其子集中选择）
         """
-        # 选择 token
-        available = [
-            t
-            for t in self._tokens.values()
-            if t.status == TokenStatus.ACTIVE and t.quota > 0
-            and (not exclude or t.token not in exclude)
-        ]
-
-        if not available:
-            return None
-
-        # 优先选带指定标签的 token（若存在）
-        if prefer_tags:
-            preferred = [t for t in available if prefer_tags.issubset(set(t.tags or []))]
-            if preferred:
-                available = preferred
-
-        # 找到最大额度
-        max_quota = max(t.quota for t in available)
-
-        # 筛选最大额度
-        candidates = [t for t in available if t.quota == max_quota]
-
-        # 随机选择
-        return random.choice(candidates)
+        consumed_mode = self._is_consumed_mode()
+
+        if consumed_mode:
+            # ===== Consumed 模式（新逻辑）=====
+            available = [
+                t
+                for t in self._tokens.values()
+                if t.status == TokenStatus.ACTIVE
+                and (not exclude or t.token not in exclude)
+            ]
+
+            if not available:
+                return None
+
+            # 优先选带指定标签的 token（若存在）
+            if prefer_tags:
+                preferred = [
+                    t for t in available if prefer_tags.issubset(set(t.tags or []))
+                ]
+                if preferred:
+                    available = preferred
+
+            # 分离新旧数据：consumed > 0 为新逻辑，consumed == 0 需要兼容旧逻辑
+            new_logic_tokens = [t for t in available if t.consumed > 0]
+            old_logic_tokens = [t for t in available if t.consumed == 0]
+
+            # 旧数据需要检查 quota > 0
+            old_logic_tokens = [t for t in old_logic_tokens if t.quota > 0]
+
+            # 优先使用新逻辑的 Token（按 consumed 排序）
+            if new_logic_tokens:
+                available = new_logic_tokens
+            elif old_logic_tokens:
+                available = old_logic_tokens
+            else:
+                return None
+
+            # 找到最小消耗（优先选择消耗少的）
+            min_consumed = min(t.consumed for t in available)
+            candidates = [t for t in available if t.consumed == min_consumed]
+            return random.choice(candidates)
+        else:
+            # ===== 默认模式（旧逻辑）=====
+            available = [
+                t
+                for t in self._tokens.values()
+                if t.status == TokenStatus.ACTIVE
+                and t.quota > 0
+                and (not exclude or t.token not in exclude)
+            ]
+
+            if not available:
+                return None
+
+            # 优先选带指定标签的 token（若存在）
+            if prefer_tags:
+                preferred = [
+                    t for t in available if prefer_tags.issubset(set(t.tags or []))
+                ]
+                if preferred:
+                    available = preferred
+
+            # 找到最大额度
+            max_quota = max(t.quota for t in available)
+
+            # 筛选最大额度
+            candidates = [t for t in available if t.quota == max_quota]
+
+            # 随机选择
+            return random.choice(candidates)
 
     def count(self) -> int:
         """Token 数量"""
@@ -80,6 +141,7 @@ def get_stats(self) -> TokenPoolStats:
 
         for token in self._tokens.values():
             stats.total_quota += token.quota
+            stats.total_consumed += token.consumed
 
             if token.status == TokenStatus.ACTIVE:
                 stats.active += 1
@@ -92,6 +154,7 @@ def get_stats(self) -> TokenPoolStats:
 
         if stats.total > 0:
             stats.avg_quota = stats.total_quota / stats.total
+            stats.avg_consumed = stats.total_consumed / stats.total
 
         return stats
 
diff --git a/config.defaults.toml b/config.defaults.toml
index c9e318593..55d83a426 100644
--- a/config.defaults.toml
+++ b/config.defaults.toml
@@ -90,6 +90,8 @@ save_delay_ms = 500
 usage_flush_interval_sec = 5
 # 多 worker 状态同步间隔（秒）
 reload_interval_sec = 30
+# 启用消耗模式（试验性功能，默认关闭）
+consumed_mode_enabled = false
 
 # ==================== 缓存管理 ====================
 [cache]

From 2ea3d9f2fd2ffcffbc459e9dbdc32a1e616e5991 Mon Sep 17 00:00:00 2001
From: Jensen Zhang <jingxuan.n.zhang@gmail.com>
Date: Thu, 12 Mar 2026 08:24:54 +0000
Subject: [PATCH 2/3] fix(token): reset consumed when token enters cooling
 state

- Reset consumed to 0 when quota reaches 0 in default mode (consume())
- Reset consumed to 0 when 429 rate limit is triggered (mark_rate_limited())
- Remove consumed reset on API sync, only restore active status
- This ensures consumed represents 'usage count per cooling cycle'
---
 app/services/token/manager.py | 11 +++++------
 app/services/token/models.py  |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/app/services/token/manager.py b/app/services/token/manager.py
index 3fa9bbe98..0bd2cc456 100644
--- a/app/services/token/manager.py
+++ b/app/services/token/manager.py
@@ -673,6 +673,7 @@ async def mark_rate_limited(self, token_str: str) -> bool:
                 old_quota = token.quota
                 token.quota = 0
                 token.status = TokenStatus.COOLING
+                token.consumed = 0  # 进入冷却时重置本轮消耗
                 logger.warning(
                     f"Token {raw_token[:10]}...: marked as rate limited "
                     f"(quota {old_quota} -> 0, status -> cooling)"
@@ -959,15 +960,13 @@ async def _refresh_one(item: tuple[str, TokenInfo]) -> dict:
                     if consumed_mode:
                         # Consumed 模式：使用新逻辑
                         token_info.update_quota_with_consumed(new_quota)
-                        # 刷新成功后重置消耗记录
-                        if new_quota > 0:
-                            token_info.consumed = 0
                     else:
                         # 默认模式：使用旧逻辑
                         token_info.update_quota(new_quota)
-                        # 刷新成功后如果 quota > 0，清除冷却状态
-                        if new_quota > 0:
-                            token_info.status = TokenStatus.ACTIVE
+
+                    # 刷新成功后如果 quota > 0，恢复活跃状态
+                    if new_quota > 0:
+                        token_info.status = TokenStatus.ACTIVE
                     token_info.mark_synced()
 
                     window_size = self._extract_window_size_seconds(result)
diff --git a/app/services/token/models.py b/app/services/token/models.py
index f256586be..775958f59 100644
--- a/app/services/token/models.py
+++ b/app/services/token/models.py
@@ -136,9 +136,10 @@ def consume(self, effort: EffortType = EffortType.LOW) -> int:
         self.use_count += actual_cost
         self.quota = max(0, self.quota - actual_cost)
 
-        # 默认行为：quota 耗尽时标记冷却
+        # 默认行为：quota 耗尽时标记冷却，并重置消耗记录
         if self.quota == 0:
             self.status = TokenStatus.COOLING
+            self.consumed = 0  # 进入冷却时重置本轮消耗
         elif self.status == TokenStatus.COOLING:
             # 只从 COOLING 恢复，不从 EXPIRED 恢复
             self.status = TokenStatus.ACTIVE

From c577a2a01d8cad9d58541a711ff9621e86062b3e Mon Sep 17 00:00:00 2001
From: Jensen Zhang <jingxuan.n.zhang@gmail.com>
Date: Fri, 13 Mar 2026 16:06:00 +0800
Subject: [PATCH 3/3] fix(token): correct load balancing logic in consumed mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

在 consumed 模式下，移除错误的『新旧 token 分离逻辑』，改为直接在所有
available token 中选择 consumed 最少的。

问题根因:
- 原逻辑将 consumed > 0 的 token 标记为『新逻辑』并优先选择
- 导致一旦某个 token 被使用过，其他 consumed=0 的 token 就永久被排除
- 造成单个 token 过载，其他 token 闲置的问题

修复方案:
- 删除 new_logic_tokens 和 old_logic_tokens 的分离逻辑
- 直接在所有 available token 中按 consumed 排序，选择最少的
- 确保所有 token 都能参与负载均衡，实现真正的消耗均衡

测试验证:
- 5 个 token，10 次请求，每个 token 恰好被选择 2 次
- 负载均衡差异为 0，符合预期
---
 app/services/token/pool.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/app/services/token/pool.py b/app/services/token/pool.py
index 23d87a43c..c118b0a2a 100644
--- a/app/services/token/pool.py
+++ b/app/services/token/pool.py
@@ -78,25 +78,12 @@ def select(
                 if preferred:
                     available = preferred
 
-            # 分离新旧数据：consumed > 0 为新逻辑，consumed == 0 需要兼容旧逻辑
-            new_logic_tokens = [t for t in available if t.consumed > 0]
-            old_logic_tokens = [t for t in available if t.consumed == 0]
-
-            # 旧数据需要检查 quota > 0
-            old_logic_tokens = [t for t in old_logic_tokens if t.quota > 0]
-
-            # 优先使用新逻辑的 Token（按 consumed 排序）
-            if new_logic_tokens:
-                available = new_logic_tokens
-            elif old_logic_tokens:
-                available = old_logic_tokens
-            else:
-                return None
-
             # 找到最小消耗（优先选择消耗少的）
             min_consumed = min(t.consumed for t in available)
             candidates = [t for t in available if t.consumed == min_consumed]
             return random.choice(candidates)
+
+
         else:
             # ===== 默认模式（旧逻辑）=====
             available = [