Merge pull request #523 from touwaeriol/feat/antigravity-improvements

feat: Antigravity improvements and scope-to-model rate limiting refactor

Merge pull request #523 from touwaeriol/feat/antigravity-improvements
feat: Antigravity improvements and scope-to-model rate limiting refactor
149e4267 · Wesley Liddick · GitHub · 5fa93ebd · 9a479d1b · 149e4267
Unverified commit 149e4267, authored Feb 09, 2026 by Wesley Liddick; committed by GitHub on Feb 09, 2026.
--- a/backend/cmd/server/wire_gen.go
+++ b/backend/cmd/server/wire_gen.go
@@ -154,7 +154,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
 	identityService := service.NewIdentityService(identityCache)
 	deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
 	claudeTokenProvider := service.NewClaudeTokenProvider(accountRepository, geminiTokenCache, oAuthService)
-	gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, userGroupRateRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService, claudeTokenProvider, sessionLimitCache)
+	digestSessionStore := service.NewDigestSessionStore()
+	gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, userGroupRateRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService, claudeTokenProvider, sessionLimitCache, digestSessionStore)
 	openAITokenProvider := service.NewOpenAITokenProvider(accountRepository, geminiTokenCache, openAIOAuthService)
 	openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService, openAITokenProvider)
 	geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, schedulerSnapshotService, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)

--- a/backend/go.mod
+++ b/backend/go.mod
@@ -103,6 +103,7 @@ require (
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
+	github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect

--- a/backend/go.sum
+++ b/backend/go.sum
@@ -213,6 +213,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
 github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=

--- a/backend/internal/handler/dto/types.go
+++ b/backend/internal/handler/dto/types.go
@@ -2,11 +2,6 @@ package dto

 import "time"

-type ScopeRateLimitInfo struct {
-	ResetAt      time.Time `json:"reset_at"`
-	RemainingSec int64     `json:"remaining_sec"`
-}
-
 type User struct {
 	ID            int64     `json:"id"`
 	Email         string    `json:"email"`
@@ -129,9 +124,6 @@ type Account struct {
 	RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
 	OverloadUntil    *time.Time `json:"overload_until"`

-	// Antigravity scope 级限流状态（从 extra 提取）
-	ScopeRateLimits map[string]ScopeRateLimitInfo `json:"scope_rate_limits,omitempty"`
-
 	TempUnschedulableUntil  *time.Time `json:"temp_unschedulable_until"`
 	TempUnschedulableReason string     `json:"temp_unschedulable_reason"`


--- a/backend/internal/handler/gateway_handler.go
+++ b/backend/internal/handler/gateway_handler.go
@@ -13,6 +13,7 @@ import (
 	"time"

 	"github.com/Wei-Shaw/sub2api/internal/config"
+	"github.com/Wei-Shaw/sub2api/internal/domain"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
@@ -114,7 +115,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {

 	setOpsRequestContext(c, "", false, body)

-	parsedReq, err := service.ParseGatewayRequest(body)
+	parsedReq, err := service.ParseGatewayRequest(body, domain.PlatformAnthropic)
 	if err != nil {
 		h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
 		return
@@ -203,6 +204,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 	}

 	// 计算粘性会话hash
+	parsedReq.SessionContext = &service.SessionContext{
+		ClientIP:  ip.GetClientIP(c),
+		UserAgent: c.GetHeader("User-Agent"),
+		APIKeyID:  apiKey.ID,
+	}
 	sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)

 	// 获取平台：优先使用强制平台（/antigravity 路由，中间件已设置 request.Context），否则使用分组平台
@@ -335,7 +341,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 				if errors.As(err, &failoverErr) {
 					failedAccountIDs[account.ID] = struct{}{}
 					lastFailoverErr = failoverErr
-					if failoverErr.ForceCacheBilling {
+					if needForceCacheBilling(hasBoundSession, failoverErr) {
 						forceCacheBilling = true
 					}
 					if switchCount >= maxAccountSwitches {
@@ -344,6 +350,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 					}
 					switchCount++
 					log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+					if account.Platform == service.PlatformAntigravity {
+						if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+							return
+						}
+					}
 					continue
 				}
 				// 错误响应已在Forward中处理，这里只记录日志
@@ -530,7 +541,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 				if errors.As(err, &failoverErr) {
 					failedAccountIDs[account.ID] = struct{}{}
 					lastFailoverErr = failoverErr
-					if failoverErr.ForceCacheBilling {
+					if needForceCacheBilling(hasBoundSession, failoverErr) {
 						forceCacheBilling = true
 					}
 					if switchCount >= maxAccountSwitches {
@@ -539,6 +550,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 					}
 					switchCount++
 					log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+					if account.Platform == service.PlatformAntigravity {
+						if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+							return
+						}
+					}
 					continue
 				}
 				// 错误响应已在Forward中处理，这里只记录日志
@@ -801,6 +817,27 @@ func (h *GatewayHandler) handleConcurrencyError(c *gin.Context, err error, slotT
 		fmt.Sprintf("Concurrency limit exceeded for %s, please retry later", slotType), streamStarted)
 }

+// needForceCacheBilling reports whether cache billing must be forced on failover: true when the request
+// has a bound sticky session or failoverErr is non-nil with ForceCacheBilling set (input_tokens are then billed as cache_read).
+func needForceCacheBilling(hasBoundSession bool, failoverErr *service.UpstreamFailoverError) bool {
+	return hasBoundSession || (failoverErr != nil && failoverErr.ForceCacheBilling)
+}
+
+// sleepFailoverDelay waits a linearly increasing delay before an account switch: 0s for the 1st switch, 1s for the 2nd, 2s for the 3rd, ...
+// It returns true once the delay has elapsed (or no delay is needed) and false if ctx is done first.
+func sleepFailoverDelay(ctx context.Context, switchCount int) bool {
+	delay := time.Duration(switchCount-1) * time.Second
+	if delay <= 0 { // switchCount <= 1: nothing to wait for
+		return true
+	}
+	select {
+	case <-ctx.Done(): // caller's context ended before the delay elapsed
+		return false
+	case <-time.After(delay):
+		return true
+	}
+}
+
 func (h *GatewayHandler) handleFailoverExhausted(c *gin.Context, failoverErr *service.UpstreamFailoverError, platform string, streamStarted bool) {
 	statusCode := failoverErr.StatusCode
 	responseBody := failoverErr.ResponseBody
@@ -934,7 +971,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {

 	setOpsRequestContext(c, "", false, body)

-	parsedReq, err := service.ParseGatewayRequest(body)
+	parsedReq, err := service.ParseGatewayRequest(body, domain.PlatformAnthropic)
 	if err != nil {
 		h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
 		return
@@ -962,6 +999,11 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
 	}

 	// 计算粘性会话 hash
+	parsedReq.SessionContext = &service.SessionContext{
+		ClientIP:  ip.GetClientIP(c),
+		UserAgent: c.GetHeader("User-Agent"),
+		APIKeyID:  apiKey.ID,
+	}
 	sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)

 	// 选择支持该模型的账号

--- a/backend/internal/handler/gemini_v1beta_handler.go
+++ b/backend/internal/handler/gemini_v1beta_handler.go
@@ -14,6 +14,7 @@ import (
 	"strings"
 	"time"

+	"github.com/Wei-Shaw/sub2api/internal/domain"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
@@ -30,13 +31,6 @@ import (
 // 匹配格式: /Users/xxx/.gemini/tmp/[64位十六进制哈希]
 var geminiCLITmpDirRegex = regexp.MustCompile(`/\.gemini/tmp/([A-Fa-f0-9]{64})`)

-func isGeminiCLIRequest(c *gin.Context, body []byte) bool {
-	if strings.TrimSpace(c.GetHeader("x-gemini-api-privileged-user-id")) != "" {
-		return true
-	}
-	return geminiCLITmpDirRegex.Match(body)
-}
-
 // GeminiV1BetaListModels proxies:
 // GET /v1beta/models
 func (h *GatewayHandler) GeminiV1BetaListModels(c *gin.Context) {
@@ -239,7 +233,14 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 	sessionHash := extractGeminiCLISessionHash(c, body)
 	if sessionHash == "" {
 		// Fallback: 使用通用的会话哈希生成逻辑（适用于其他客户端）
-		parsedReq, _ := service.ParseGatewayRequest(body)
+		parsedReq, _ := service.ParseGatewayRequest(body, domain.PlatformGemini)
+		if parsedReq != nil {
+			parsedReq.SessionContext = &service.SessionContext{
+				ClientIP:  ip.GetClientIP(c),
+				UserAgent: c.GetHeader("User-Agent"),
+				APIKeyID:  apiKey.ID,
+			}
+		}
 		sessionHash = h.gatewayService.GenerateSessionHash(parsedReq)
 	}
 	sessionKey := sessionHash
@@ -258,6 +259,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 	var geminiDigestChain string
 	var geminiPrefixHash string
 	var geminiSessionUUID string
+	var matchedDigestChain string
 	useDigestFallback := sessionBoundAccountID == 0

 	if useDigestFallback {
@@ -284,13 +286,14 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				)

 				// 查找会话
-				foundUUID, foundAccountID, found := h.gatewayService.FindGeminiSession(
+				foundUUID, foundAccountID, foundMatchedChain, found := h.gatewayService.FindGeminiSession(
 					c.Request.Context(),
 					derefGroupID(apiKey.GroupID),
 					geminiPrefixHash,
 					geminiDigestChain,
 				)
 				if found {
+					matchedDigestChain = foundMatchedChain
 					sessionBoundAccountID = foundAccountID
 					geminiSessionUUID = foundUUID
 					log.Printf("[Gemini] Digest fallback matched: uuid=%s, accountID=%d, chain=%s",
@@ -316,7 +319,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {

 	// 判断是否真的绑定了粘性会话：有 sessionKey 且已经绑定到某个账号
 	hasBoundSession := sessionKey != "" && sessionBoundAccountID > 0
-	isCLI := isGeminiCLIRequest(c, body)
 	cleanedForUnknownBinding := false

 	maxAccountSwitches := h.maxAccountSwitchesGemini
@@ -344,10 +346,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 			log.Printf("[Gemini] Sticky session account switched: %d -> %d, cleaning thoughtSignature", sessionBoundAccountID, account.ID)
 			body = service.CleanGeminiNativeThoughtSignatures(body)
 			sessionBoundAccountID = account.ID
-		} else if sessionKey != "" && sessionBoundAccountID == 0 && isCLI && !cleanedForUnknownBinding && bytes.Contains(body, []byte(`"thoughtSignature"`)) {
-			// 无缓存绑定但请求里已有 thoughtSignature：常见于缓存丢失/TTL 过期后，CLI 继续携带旧签名。
+		} else if sessionKey != "" && sessionBoundAccountID == 0 && !cleanedForUnknownBinding && bytes.Contains(body, []byte(`"thoughtSignature"`)) {
+			// 无缓存绑定但请求里已有 thoughtSignature：常见于缓存丢失/TTL 过期后，客户端继续携带旧签名。
 			// 为避免第一次转发就 400，这里做一次确定性清理，让新账号重新生成签名链路。
-			log.Printf("[Gemini] Sticky session binding missing for CLI request, cleaning thoughtSignature proactively")
+			log.Printf("[Gemini] Sticky session binding missing, cleaning thoughtSignature proactively")
 			body = service.CleanGeminiNativeThoughtSignatures(body)
 			cleanedForUnknownBinding = true
 			sessionBoundAccountID = account.ID
@@ -422,7 +424,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 			var failoverErr *service.UpstreamFailoverError
 			if errors.As(err, &failoverErr) {
 				failedAccountIDs[account.ID] = struct{}{}
-				if failoverErr.ForceCacheBilling {
+				if needForceCacheBilling(hasBoundSession, failoverErr) {
 					forceCacheBilling = true
 				}
 				if switchCount >= maxAccountSwitches {
@@ -433,6 +435,11 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				lastFailoverErr = failoverErr
 				switchCount++
 				log.Printf("Gemini account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+				if account.Platform == service.PlatformAntigravity {
+					if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+						return
+					}
+				}
 				continue
 			}
 			// ForwardNative already wrote the response
@@ -453,6 +460,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				geminiDigestChain,
 				geminiSessionUUID,
 				account.ID,
+				matchedDigestChain,
 			); err != nil {
 				log.Printf("[Gemini] Failed to save digest session: %v", err)
 			}

--- a/backend/internal/repository/account_repo.go
+++ b/backend/internal/repository/account_repo.go
@@ -798,53 +798,6 @@ func (r *accountRepository) SetRateLimited(ctx context.Context, id int64, resetA
 	return nil
 }

-func (r *accountRepository) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope service.AntigravityQuotaScope, resetAt time.Time) error {
-	now := time.Now().UTC()
-	payload := map[string]string{
-		"rate_limited_at":     now.Format(time.RFC3339),
-		"rate_limit_reset_at": resetAt.UTC().Format(time.RFC3339),
-	}
-	raw, err := json.Marshal(payload)
-	if err != nil {
-		return err
-	}
-
-	scopeKey := string(scope)
-	client := clientFromContext(ctx, r.client)
-	result, err := client.ExecContext(
-		ctx,
-		`UPDATE accounts SET
-			extra = jsonb_set(
-				jsonb_set(COALESCE(extra, '{}'::jsonb), '{antigravity_quota_scopes}'::text[], COALESCE(extra->'antigravity_quota_scopes', '{}'::jsonb), true),
-				ARRAY['antigravity_quota_scopes', $1]::text[],
-				$2::jsonb,
-				true
-			),
-			updated_at = NOW(),
-			last_used_at = NOW()
-		WHERE id = $3 AND deleted_at IS NULL`,
-		scopeKey,
-		raw,
-		id,
-	)
-	if err != nil {
-		return err
-	}
-
-	affected, err := result.RowsAffected()
-	if err != nil {
-		return err
-	}
-	if affected == 0 {
-		return service.ErrAccountNotFound
-	}
-
-	if err := enqueueSchedulerOutbox(ctx, r.sql, service.SchedulerOutboxEventAccountChanged, &id, nil, nil); err != nil {
-		log.Printf("[SchedulerOutbox] enqueue quota scope failed: account=%d err=%v", id, err)
-	}
-	return nil
-}
-
 func (r *accountRepository) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	if scope == "" {
 		return nil

--- a/backend/internal/repository/gateway_cache.go
+++ b/backend/internal/repository/gateway_cache.go
@@ -11,63 +11,6 @@ import (

 const stickySessionPrefix = "sticky_session:"

-// Gemini Trie Lua 脚本
-const (
-	// geminiTrieFindScript 查找最长前缀匹配的 Lua 脚本
-	// KEYS[1] = trie key
-	// ARGV[1] = digestChain (如 "u:a-m:b-u:c-m:d")
-	// ARGV[2] = TTL seconds (用于刷新)
-	// 返回: 最长匹配的 value (uuid:accountID) 或 nil
-	// 查找成功时自动刷新 TTL，防止活跃会话意外过期
-	geminiTrieFindScript = `
-local chain = ARGV[1]
-local ttl = tonumber(ARGV[2])
-local lastMatch = nil
-local path = ""
-
-for part in string.gmatch(chain, "[^-]+") do
-    path = path == "" and part or path .. "-" .. part
-    local val = redis.call('HGET', KEYS[1], path)
-    if val and val ~= "" then
-        lastMatch = val
-    end
-end
-
-if lastMatch then
-    redis.call('EXPIRE', KEYS[1], ttl)
-end
-
-return lastMatch
-`
-
-	// geminiTrieSaveScript 保存会话到 Trie 的 Lua 脚本
-	// KEYS[1] = trie key
-	// ARGV[1] = digestChain
-	// ARGV[2] = value (uuid:accountID)
-	// ARGV[3] = TTL seconds
-	geminiTrieSaveScript = `
-local chain = ARGV[1]
-local value = ARGV[2]
-local ttl = tonumber(ARGV[3])
-local path = ""
-
-for part in string.gmatch(chain, "[^-]+") do
-    path = path == "" and part or path .. "-" .. part
-end
-redis.call('HSET', KEYS[1], path, value)
-redis.call('EXPIRE', KEYS[1], ttl)
-return "OK"
-`
-)
-
-// 模型负载统计相关常量
-const (
-	modelLoadKeyPrefix     = "ag:model_load:"      // 模型调用次数 key 前缀
-	modelLastUsedKeyPrefix = "ag:model_last_used:" // 模型最后调度时间 key 前缀
-	modelLoadTTL           = 24 * time.Hour        // 调用次数 TTL（24 小时无调用后清零）
-	modelLastUsedTTL       = 24 * time.Hour        // 最后调度时间 TTL
-)
-
 type gatewayCache struct {
 	rdb *redis.Client
 }
@@ -108,171 +51,3 @@ func (c *gatewayCache) DeleteSessionAccountID(ctx context.Context, groupID int64
 	key := buildSessionKey(groupID, sessionHash)
 	return c.rdb.Del(ctx, key).Err()
 }
-
-// ============ Antigravity 模型负载统计方法 ============
-
-// modelLoadKey 构建模型调用次数 key
-// 格式: ag:model_load:{accountID}:{model}
-func modelLoadKey(accountID int64, model string) string {
-	return fmt.Sprintf("%s%d:%s", modelLoadKeyPrefix, accountID, model)
-}
-
-// modelLastUsedKey 构建模型最后调度时间 key
-// 格式: ag:model_last_used:{accountID}:{model}
-func modelLastUsedKey(accountID int64, model string) string {
-	return fmt.Sprintf("%s%d:%s", modelLastUsedKeyPrefix, accountID, model)
-}
-
-// IncrModelCallCount 增加模型调用次数并更新最后调度时间
-// 返回更新后的调用次数
-func (c *gatewayCache) IncrModelCallCount(ctx context.Context, accountID int64, model string) (int64, error) {
-	loadKey := modelLoadKey(accountID, model)
-	lastUsedKey := modelLastUsedKey(accountID, model)
-
-	pipe := c.rdb.Pipeline()
-	incrCmd := pipe.Incr(ctx, loadKey)
-	pipe.Expire(ctx, loadKey, modelLoadTTL) // 每次调用刷新 TTL
-	pipe.Set(ctx, lastUsedKey, time.Now().Unix(), modelLastUsedTTL)
-	if _, err := pipe.Exec(ctx); err != nil {
-		return 0, err
-	}
-	return incrCmd.Val(), nil
-}
-
-// GetModelLoadBatch 批量获取账号的模型负载信息
-func (c *gatewayCache) GetModelLoadBatch(ctx context.Context, accountIDs []int64, model string) (map[int64]*service.ModelLoadInfo, error) {
-	if len(accountIDs) == 0 {
-		return make(map[int64]*service.ModelLoadInfo), nil
-	}
-
-	loadCmds, lastUsedCmds := c.pipelineModelLoadGet(ctx, accountIDs, model)
-	return c.parseModelLoadResults(accountIDs, loadCmds, lastUsedCmds), nil
-}
-
-// pipelineModelLoadGet 批量获取模型负载的 Pipeline 操作
-func (c *gatewayCache) pipelineModelLoadGet(
-	ctx context.Context,
-	accountIDs []int64,
-	model string,
-) (map[int64]*redis.StringCmd, map[int64]*redis.StringCmd) {
-	pipe := c.rdb.Pipeline()
-	loadCmds := make(map[int64]*redis.StringCmd, len(accountIDs))
-	lastUsedCmds := make(map[int64]*redis.StringCmd, len(accountIDs))
-
-	for _, id := range accountIDs {
-		loadCmds[id] = pipe.Get(ctx, modelLoadKey(id, model))
-		lastUsedCmds[id] = pipe.Get(ctx, modelLastUsedKey(id, model))
-	}
-	_, _ = pipe.Exec(ctx) // 忽略错误，key 不存在是正常的
-	return loadCmds, lastUsedCmds
-}
-
-// parseModelLoadResults 解析 Pipeline 结果
-func (c *gatewayCache) parseModelLoadResults(
-	accountIDs []int64,
-	loadCmds map[int64]*redis.StringCmd,
-	lastUsedCmds map[int64]*redis.StringCmd,
-) map[int64]*service.ModelLoadInfo {
-	result := make(map[int64]*service.ModelLoadInfo, len(accountIDs))
-	for _, id := range accountIDs {
-		result[id] = &service.ModelLoadInfo{
-			CallCount:  getInt64OrZero(loadCmds[id]),
-			LastUsedAt: getTimeOrZero(lastUsedCmds[id]),
-		}
-	}
-	return result
-}
-
-// getInt64OrZero 从 StringCmd 获取 int64 值，失败返回 0
-func getInt64OrZero(cmd *redis.StringCmd) int64 {
-	val, _ := cmd.Int64()
-	return val
-}
-
-// getTimeOrZero 从 StringCmd 获取 time.Time，失败返回零值
-func getTimeOrZero(cmd *redis.StringCmd) time.Time {
-	val, err := cmd.Int64()
-	if err != nil {
-		return time.Time{}
-	}
-	return time.Unix(val, 0)
-}
-
-// ============ Gemini 会话 Fallback 方法 (Trie 实现) ============
-
-// FindGeminiSession 查找 Gemini 会话（使用 Trie + Lua 脚本实现 O(L) 查询）
-// 返回最长匹配的会话信息，匹配成功时自动刷新 TTL
-func (c *gatewayCache) FindGeminiSession(ctx context.Context, groupID int64, prefixHash, digestChain string) (uuid string, accountID int64, found bool) {
-	if digestChain == "" {
-		return "", 0, false
-	}
-
-	trieKey := service.BuildGeminiTrieKey(groupID, prefixHash)
-	ttlSeconds := int(service.GeminiSessionTTL().Seconds())
-
-	// 使用 Lua 脚本在 Redis 端执行 Trie 查找，O(L) 次 HGET，1 次网络往返
-	// 查找成功时自动刷新 TTL，防止活跃会话意外过期
-	result, err := c.rdb.Eval(ctx, geminiTrieFindScript, []string{trieKey}, digestChain, ttlSeconds).Result()
-	if err != nil || result == nil {
-		return "", 0, false
-	}
-
-	value, ok := result.(string)
-	if !ok || value == "" {
-		return "", 0, false
-	}
-
-	uuid, accountID, ok = service.ParseGeminiSessionValue(value)
-	return uuid, accountID, ok
-}
-
-// SaveGeminiSession 保存 Gemini 会话（使用 Trie + Lua 脚本）
-func (c *gatewayCache) SaveGeminiSession(ctx context.Context, groupID int64, prefixHash, digestChain, uuid string, accountID int64) error {
-	if digestChain == "" {
-		return nil
-	}
-
-	trieKey := service.BuildGeminiTrieKey(groupID, prefixHash)
-	value := service.FormatGeminiSessionValue(uuid, accountID)
-	ttlSeconds := int(service.GeminiSessionTTL().Seconds())
-
-	return c.rdb.Eval(ctx, geminiTrieSaveScript, []string{trieKey}, digestChain, value, ttlSeconds).Err()
-}
-
-// ============ Anthropic 会话 Fallback 方法 (复用 Trie 实现) ============
-
-// FindAnthropicSession 查找 Anthropic 会话（复用 Gemini Trie Lua 脚本）
-func (c *gatewayCache) FindAnthropicSession(ctx context.Context, groupID int64, prefixHash, digestChain string) (uuid string, accountID int64, found bool) {
-	if digestChain == "" {
-		return "", 0, false
-	}
-
-	trieKey := service.BuildAnthropicTrieKey(groupID, prefixHash)
-	ttlSeconds := int(service.AnthropicSessionTTL().Seconds())
-
-	result, err := c.rdb.Eval(ctx, geminiTrieFindScript, []string{trieKey}, digestChain, ttlSeconds).Result()
-	if err != nil || result == nil {
-		return "", 0, false
-	}
-
-	value, ok := result.(string)
-	if !ok || value == "" {
-		return "", 0, false
-	}
-
-	uuid, accountID, ok = service.ParseGeminiSessionValue(value)
-	return uuid, accountID, ok
-}
-
-// SaveAnthropicSession 保存 Anthropic 会话（复用 Gemini Trie Lua 脚本）
-func (c *gatewayCache) SaveAnthropicSession(ctx context.Context, groupID int64, prefixHash, digestChain, uuid string, accountID int64) error {
-	if digestChain == "" {
-		return nil
-	}
-
-	trieKey := service.BuildAnthropicTrieKey(groupID, prefixHash)
-	value := service.FormatGeminiSessionValue(uuid, accountID)
-	ttlSeconds := int(service.AnthropicSessionTTL().Seconds())
-
-	return c.rdb.Eval(ctx, geminiTrieSaveScript, []string{trieKey}, digestChain, value, ttlSeconds).Err()
-}
--- a/backend/internal/repository/gateway_cache_integration_test.go
+++ b/backend/internal/repository/gateway_cache_integration_test.go
@@ -104,157 +104,6 @@ func (s *GatewayCacheSuite) TestGetSessionAccountID_CorruptedValue() {
 	require.False(s.T(), errors.Is(err, redis.Nil), "expected parsing error, not redis.Nil")
 }

-// ============ Gemini Trie 会话测试 ============
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_SaveAndFind() {
-	groupID := int64(1)
-	prefixHash := "testprefix"
-	digestChain := "u:hash1-m:hash2-u:hash3"
-	uuid := "test-uuid-123"
-	accountID := int64(42)
-
-	// 保存会话
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, digestChain, uuid, accountID)
-	require.NoError(s.T(), err, "SaveGeminiSession")
-
-	// 精确匹配查找
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, digestChain)
-	require.True(s.T(), found, "should find exact match")
-	require.Equal(s.T(), uuid, foundUUID)
-	require.Equal(s.T(), accountID, foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_PrefixMatch() {
-	groupID := int64(1)
-	prefixHash := "prefixmatch"
-	shortChain := "u:a-m:b"
-	longChain := "u:a-m:b-u:c-m:d"
-	uuid := "uuid-prefix"
-	accountID := int64(100)
-
-	// 保存短链
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, shortChain, uuid, accountID)
-	require.NoError(s.T(), err)
-
-	// 用长链查找，应该匹配到短链（前缀匹配）
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, longChain)
-	require.True(s.T(), found, "should find prefix match")
-	require.Equal(s.T(), uuid, foundUUID)
-	require.Equal(s.T(), accountID, foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_LongestPrefixMatch() {
-	groupID := int64(1)
-	prefixHash := "longestmatch"
-
-	// 保存多个不同长度的链
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a", "uuid-short", 1)
-	require.NoError(s.T(), err)
-	err = s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b", "uuid-medium", 2)
-	require.NoError(s.T(), err)
-	err = s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:c", "uuid-long", 3)
-	require.NoError(s.T(), err)
-
-	// 查找更长的链，应该匹配到最长的前缀
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:c-m:d-u:e")
-	require.True(s.T(), found, "should find longest prefix match")
-	require.Equal(s.T(), "uuid-long", foundUUID)
-	require.Equal(s.T(), int64(3), foundAccountID)
-
-	// 查找中等长度的链
-	foundUUID, foundAccountID, found = s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:x")
-	require.True(s.T(), found)
-	require.Equal(s.T(), "uuid-medium", foundUUID)
-	require.Equal(s.T(), int64(2), foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_NoMatch() {
-	groupID := int64(1)
-	prefixHash := "nomatch"
-	digestChain := "u:a-m:b"
-
-	// 保存一个会话
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, digestChain, "uuid", 1)
-	require.NoError(s.T(), err)
-
-	// 用不同的链查找，应该找不到
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:x-m:y")
-	require.False(s.T(), found, "should not find non-matching chain")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_DifferentPrefixHash() {
-	groupID := int64(1)
-	digestChain := "u:a-m:b"
-
-	// 保存到 prefixHash1
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, "prefix1", digestChain, "uuid1", 1)
-	require.NoError(s.T(), err)
-
-	// 用 prefixHash2 查找，应该找不到（不同用户/客户端隔离）
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, "prefix2", digestChain)
-	require.False(s.T(), found, "different prefixHash should be isolated")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_DifferentGroupID() {
-	prefixHash := "sameprefix"
-	digestChain := "u:a-m:b"
-
-	// 保存到 groupID 1
-	err := s.cache.SaveGeminiSession(s.ctx, 1, prefixHash, digestChain, "uuid1", 1)
-	require.NoError(s.T(), err)
-
-	// 用 groupID 2 查找，应该找不到（分组隔离）
-	_, _, found := s.cache.FindGeminiSession(s.ctx, 2, prefixHash, digestChain)
-	require.False(s.T(), found, "different groupID should be isolated")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_EmptyDigestChain() {
-	groupID := int64(1)
-	prefixHash := "emptytest"
-
-	// 空链不应该保存
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "", "uuid", 1)
-	require.NoError(s.T(), err, "empty chain should not error")
-
-	// 空链查找应该返回 false
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "")
-	require.False(s.T(), found, "empty chain should not match")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_MultipleSessions() {
-	groupID := int64(1)
-	prefixHash := "multisession"
-
-	// 保存多个不同会话（模拟 1000 个并发会话的场景）
-	sessions := []struct {
-		chain     string
-		uuid      string
-		accountID int64
-	}{
-		{"u:session1", "uuid-1", 1},
-		{"u:session2-m:reply2", "uuid-2", 2},
-		{"u:session3-m:reply3-u:msg3", "uuid-3", 3},
-	}
-
-	for _, sess := range sessions {
-		err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, sess.chain, sess.uuid, sess.accountID)
-		require.NoError(s.T(), err)
-	}
-
-	// 验证每个会话都能正确查找
-	for _, sess := range sessions {
-		foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, sess.chain)
-		require.True(s.T(), found, "should find session: %s", sess.chain)
-		require.Equal(s.T(), sess.uuid, foundUUID)
-		require.Equal(s.T(), sess.accountID, foundAccountID)
-	}
-
-	// 验证继续对话的场景
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:session2-m:reply2-u:newmsg")
-	require.True(s.T(), found)
-	require.Equal(s.T(), "uuid-2", foundUUID)
-	require.Equal(s.T(), int64(2), foundAccountID)
-}

 func TestGatewayCacheSuite(t *testing.T) {
 	suite.Run(t, new(GatewayCacheSuite))

--- a/backend/internal/repository/gateway_cache_model_load_integration_test.go
+++ b/backend/internal/repository/gateway_cache_model_load_integration_test.go
-//go:build integration
-
-package repository
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/require"
-	"github.com/stretchr/testify/suite"
-)
-
-// ============ Gateway Cache 模型负载统计集成测试 ============
-
-type GatewayCacheModelLoadSuite struct {
-	suite.Suite
-}
-
-func TestGatewayCacheModelLoadSuite(t *testing.T) {
-	suite.Run(t, new(GatewayCacheModelLoadSuite))
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_Basic() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(123)
-	model := "claude-sonnet-4-20250514"
-
-	// 首次调用应返回 1
-	count1, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	// 第二次调用应返回 2
-	count2, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(2), count2)
-
-	// 第三次调用应返回 3
-	count3, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(3), count3)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_DifferentModels() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(456)
-	model1 := "claude-sonnet-4-20250514"
-	model2 := "claude-opus-4-5-20251101"
-
-	// 不同模型应该独立计数
-	count1, err := cache.IncrModelCallCount(ctx, accountID, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	count2, err := cache.IncrModelCallCount(ctx, accountID, model2)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count2)
-
-	count1Again, err := cache.IncrModelCallCount(ctx, accountID, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(2), count1Again)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_DifferentAccounts() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	account1 := int64(111)
-	account2 := int64(222)
-	model := "gemini-2.5-pro"
-
-	// 不同账号应该独立计数
-	count1, err := cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	count2, err := cache.IncrModelCallCount(ctx, account2, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count2)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_Empty() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	result, err := cache.GetModelLoadBatch(ctx, []int64{}, "any-model")
-	require.NoError(t, err)
-	require.NotNil(t, result)
-	require.Empty(t, result)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_NonExistent() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	// 查询不存在的账号应返回零值
-	result, err := cache.GetModelLoadBatch(ctx, []int64{9999, 9998}, "claude-sonnet-4-20250514")
-	require.NoError(t, err)
-	require.Len(t, result, 2)
-
-	require.Equal(t, int64(0), result[9999].CallCount)
-	require.True(t, result[9999].LastUsedAt.IsZero())
-	require.Equal(t, int64(0), result[9998].CallCount)
-	require.True(t, result[9998].LastUsedAt.IsZero())
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_AfterIncrement() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(789)
-	model := "claude-sonnet-4-20250514"
-
-	// 先增加调用次数
-	beforeIncr := time.Now()
-	_, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	afterIncr := time.Now()
-
-	// 获取负载信息
-	result, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model)
-	require.NoError(t, err)
-	require.Len(t, result, 1)
-
-	loadInfo := result[accountID]
-	require.NotNil(t, loadInfo)
-	require.Equal(t, int64(3), loadInfo.CallCount)
-	require.False(t, loadInfo.LastUsedAt.IsZero())
-	// LastUsedAt 应该在 beforeIncr 和 afterIncr 之间
-	require.True(t, loadInfo.LastUsedAt.After(beforeIncr.Add(-time.Second)) || loadInfo.LastUsedAt.Equal(beforeIncr))
-	require.True(t, loadInfo.LastUsedAt.Before(afterIncr.Add(time.Second)) || loadInfo.LastUsedAt.Equal(afterIncr))
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_MultipleAccounts() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	model := "claude-opus-4-5-20251101"
-	account1 := int64(1001)
-	account2 := int64(1002)
-	account3 := int64(1003) // 不调用
-
-	// account1 调用 2 次
-	_, err := cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-
-	// account2 调用 5 次
-	for i := 0; i < 5; i++ {
-		_, err = cache.IncrModelCallCount(ctx, account2, model)
-		require.NoError(t, err)
-	}
-
-	// 批量获取
-	result, err := cache.GetModelLoadBatch(ctx, []int64{account1, account2, account3}, model)
-	require.NoError(t, err)
-	require.Len(t, result, 3)
-
-	require.Equal(t, int64(2), result[account1].CallCount)
-	require.False(t, result[account1].LastUsedAt.IsZero())
-
-	require.Equal(t, int64(5), result[account2].CallCount)
-	require.False(t, result[account2].LastUsedAt.IsZero())
-
-	require.Equal(t, int64(0), result[account3].CallCount)
-	require.True(t, result[account3].LastUsedAt.IsZero())
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_ModelIsolation() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(2001)
-	model1 := "claude-sonnet-4-20250514"
-	model2 := "gemini-2.5-pro"
-
-	// 对 model1 调用 3 次
-	for i := 0; i < 3; i++ {
-		_, err := cache.IncrModelCallCount(ctx, accountID, model1)
-		require.NoError(t, err)
-	}
-
-	// 获取 model1 的负载
-	result1, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(3), result1[accountID].CallCount)
-
-	// 获取 model2 的负载（应该为 0）
-	result2, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model2)
-	require.NoError(t, err)
-	require.Equal(t, int64(0), result2[accountID].CallCount)
-}
-
-// ============ 辅助函数测试 ============
-
-func (s *GatewayCacheModelLoadSuite) TestModelLoadKey_Format() {
-	t := s.T()
-
-	key := modelLoadKey(123, "claude-sonnet-4")
-	require.Equal(t, "ag:model_load:123:claude-sonnet-4", key)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestModelLastUsedKey_Format() {
-	t := s.T()
-
-	key := modelLastUsedKey(456, "gemini-2.5-pro")
-	require.Equal(t, "ag:model_last_used:456:gemini-2.5-pro", key)
-}
--- a/backend/internal/server/api_contract_test.go
+++ b/backend/internal/server/api_contract_test.go
@@ -1008,10 +1008,6 @@ func (s *stubAccountRepo) SetRateLimited(ctx context.Context, id int64, resetAt
 	return errors.New("not implemented")
 }

-func (s *stubAccountRepo) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope service.AntigravityQuotaScope, resetAt time.Time) error {
-	return errors.New("not implemented")
-}
-
 func (s *stubAccountRepo) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	return errors.New("not implemented")
 }

--- a/backend/internal/service/account_service.go
+++ b/backend/internal/service/account_service.go
@@ -50,7 +50,6 @@ type AccountRepository interface {
 	ListSchedulableByGroupIDAndPlatforms(ctx context.Context, groupID int64, platforms []string) ([]Account, error)

 	SetRateLimited(ctx context.Context, id int64, resetAt time.Time) error
-	SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error
 	SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error
 	SetOverloaded(ctx context.Context, id int64, until time.Time) error
 	SetTempUnschedulable(ctx context.Context, id int64, until time.Time, reason string) error

--- a/backend/internal/service/account_service_delete_test.go
+++ b/backend/internal/service/account_service_delete_test.go
@@ -143,10 +143,6 @@ func (s *accountRepoStub) SetRateLimited(ctx context.Context, id int64, resetAt
 	panic("unexpected SetRateLimited call")
 }

-func (s *accountRepoStub) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error {
-	panic("unexpected SetAntigravityQuotaScopeLimit call")
-}
-
 func (s *accountRepoStub) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	panic("unexpected SetModelRateLimit call")
 }

--- a/backend/internal/service/anthropic_session.go
+++ b/backend/internal/service/anthropic_session.go
@@ -2,7 +2,6 @@ package service

 import (
 	"encoding/json"
-	"strconv"
 	"strings"
 	"time"
 )
@@ -12,9 +11,6 @@ const (
 	// anthropicSessionTTLSeconds Anthropic 会话缓存 TTL（5 分钟）
 	anthropicSessionTTLSeconds = 300

-	// anthropicTrieKeyPrefix Anthropic Trie 会话 key 前缀
-	anthropicTrieKeyPrefix = "anthropic:trie:"
-
 	// anthropicDigestSessionKeyPrefix Anthropic 摘要 fallback 会话 key 前缀
 	anthropicDigestSessionKeyPrefix = "anthropic:digest:"
 )
@@ -68,12 +64,6 @@ func rolePrefix(role string) string {
 	}
 }

-// BuildAnthropicTrieKey 构建 Anthropic Trie Redis key
-// 格式: anthropic:trie:{groupID}:{prefixHash}
-func BuildAnthropicTrieKey(groupID int64, prefixHash string) string {
-	return anthropicTrieKeyPrefix + strconv.FormatInt(groupID, 10) + ":" + prefixHash
-}
-
 // GenerateAnthropicDigestSessionKey 生成 Anthropic 摘要 fallback 的 sessionKey
 // 组合 prefixHash 前 8 位 + uuid 前 8 位，确保不同会话产生不同的 sessionKey
 func GenerateAnthropicDigestSessionKey(prefixHash, uuid string) string {

--- a/backend/internal/service/anthropic_session_test.go
+++ b/backend/internal/service/anthropic_session_test.go
@@ -236,43 +236,6 @@ func TestBuildAnthropicDigestChain_Deterministic(t *testing.T) {
 	}
 }

-func TestBuildAnthropicTrieKey(t *testing.T) {
-	tests := []struct {
-		name       string
-		groupID    int64
-		prefixHash string
-		want       string
-	}{
-		{
-			name:       "normal",
-			groupID:    123,
-			prefixHash: "abcdef12",
-			want:       "anthropic:trie:123:abcdef12",
-		},
-		{
-			name:       "zero group",
-			groupID:    0,
-			prefixHash: "xyz",
-			want:       "anthropic:trie:0:xyz",
-		},
-		{
-			name:       "empty prefix",
-			groupID:    1,
-			prefixHash: "",
-			want:       "anthropic:trie:1:",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := BuildAnthropicTrieKey(tt.groupID, tt.prefixHash)
-			if got != tt.want {
-				t.Errorf("BuildAnthropicTrieKey(%d, %q) = %q, want %q", tt.groupID, tt.prefixHash, got, tt.want)
-			}
-		})
-	}
-}
-
 func TestGenerateAnthropicDigestSessionKey(t *testing.T) {
 	tests := []struct {
 		name       string

--- a/backend/internal/service/antigravity_gateway_service.go
+++ b/backend/internal/service/antigravity_gateway_service.go
--- a/backend/internal/service/antigravity_gateway_service_test.go
+++ b/backend/internal/service/antigravity_gateway_service_test.go
@@ -4,17 +4,42 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
+	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"testing"
 	"time"

+	"github.com/Wei-Shaw/sub2api/internal/config"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/gin-gonic/gin"
 	"github.com/stretchr/testify/require"
 )

+// antigravityFailingWriter is a gin.ResponseWriter that simulates a client disconnect.
+type antigravityFailingWriter struct {
+	gin.ResponseWriter
+	failAfter int // number of writes allowed to succeed; every write after that returns an error
+	writes    int
+}
+
+func (w *antigravityFailingWriter) Write(p []byte) (int, error) {
+	if w.writes >= w.failAfter { // write budget used up: fail without touching the wrapped writer
+		return 0, errors.New("write failed: client disconnected")
+	}
+	w.writes++ // counts every write that is forwarded to the wrapped writer
+	return w.ResponseWriter.Write(p)
+}
+
+// newAntigravityTestService builds an AntigravityGatewayService for the streaming tests; only settingService is set.
+func newAntigravityTestService(cfg *config.Config) *AntigravityGatewayService {
+	return &AntigravityGatewayService{
+		settingService: &SettingService{cfg: cfg},
+	}
+}
+
 func TestStripSignatureSensitiveBlocksFromClaudeRequest(t *testing.T) {
 	req := &antigravity.ClaudeRequest{
 		Model: "claude-sonnet-4-5",
@@ -337,8 +362,8 @@ func TestAntigravityGatewayService_Forward_StickySessionForceCacheBilling(t *tes
 	require.True(t, failoverErr.ForceCacheBilling, "ForceCacheBilling should be true for sticky session switch")
 }

-// TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling
-// 验证：ForwardGemini 粘性会话切换时，UpstreamFailoverError.ForceCacheBilling 应为 true
+// TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling verifies
+// that ForwardGemini sets ForceCacheBilling=true for sticky session switch.
 func TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 	writer := httptest.NewRecorder()
@@ -391,3 +416,438 @@ func TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling(
 	require.Equal(t, http.StatusServiceUnavailable, failoverErr.StatusCode)
 	require.True(t, failoverErr.ForceCacheBilling, "ForceCacheBilling should be true for sticky session switch")
 }
+
+// --- Streaming happy-path tests ---
+
+// TestStreamUpstreamResponse_NormalComplete verifies that when a stream completes normally,
+// the data is passed through unchanged, usage is collected, and clientDisconnect=false.
+func TestStreamUpstreamResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `event: message_start`)
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":10}}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: content_block_delta`)
+		fmt.Fprintln(pw, `data: {"type":"content_block_delta","delta":{"text":"hello"}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: message_delta`)
+		fmt.Fprintln(pw, `data: {"type":"message_delta","usage":{"output_tokens":5}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	require.Equal(t, 5, result.usage.OutputTokens, "should collect output_tokens from message_delta")
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// Verify the data was passed through to the client.
+	body := rec.Body.String()
+	require.Contains(t, body, "event: message_start")
+	require.Contains(t, body, "content_block_delta")
+	require.Contains(t, body, "message_delta")
+}
+
+// TestHandleGeminiStreamingResponse_NormalComplete verifies that a normal Gemini stream
+// is passed through to the client and that usage is collected correctly.
+func TestHandleGeminiStreamingResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// First chunk (partial content).
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":"Hello"}]}}],"usageMetadata":{"promptTokenCount":10,"candidatesTokenCount":3}}`)
+		fmt.Fprintln(pw, "")
+		// Second chunk (final content plus the complete usage).
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":" world"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":10,"candidatesTokenCount":8,"cachedContentTokenCount":2}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	// Gemini usage: promptTokenCount=10, candidatesTokenCount=8, cachedContentTokenCount=2
+	// → InputTokens=10-2=8, OutputTokens=8, CacheReadInputTokens=2
+	require.Equal(t, 8, result.usage.InputTokens)
+	require.Equal(t, 8, result.usage.OutputTokens)
+	require.Equal(t, 2, result.usage.CacheReadInputTokens)
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// Verify the data was passed through to the client.
+	body := rec.Body.String()
+	require.Contains(t, body, "Hello")
+	require.Contains(t, body, "world")
+	// No error event should be present.
+	require.NotContains(t, body, "event: error")
+}
+
+// TestHandleClaudeStreamingResponse_NormalComplete verifies that a normal Claude stream
+// (Gemini→Claude conversion) is converted and written to the client correctly.
+func TestHandleClaudeStreamingResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// v1internal wrapper format: the Gemini payload is nested under the "response" field.
+		// ProcessLine first tries to unmarshal into V1InternalResponse; the bare format would leave Response.UsageMetadata empty.
+		fmt.Fprintln(pw, `data: {"response":{"candidates":[{"content":{"parts":[{"text":"Hi there"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":3}}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	// Usage after Gemini→Claude conversion: promptTokenCount=5→InputTokens=5, candidatesTokenCount=3→OutputTokens=3
+	require.Equal(t, 5, result.usage.InputTokens)
+	require.Equal(t, 3, result.usage.OutputTokens)
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// Verify the output is in Claude SSE format (the processor converts it).
+	body := rec.Body.String()
+	require.Contains(t, body, "event: message_start", "should contain Claude message_start event")
+	require.Contains(t, body, "event: message_stop", "should contain Claude message_stop event")
+	// No error event should be present.
+	require.NotContains(t, body, "event: error")
+}
+
+// --- Streaming client-disconnect detection tests ---
+
+// TestStreamUpstreamResponse_ClientDisconnectDrainsUsage verifies that after a client
+// write fails, streamUpstreamResponse keeps reading the upstream to collect usage.
+func TestStreamUpstreamResponse_ClientDisconnectDrainsUsage(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `event: message_start`)
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":10}}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: message_delta`)
+		fmt.Fprintln(pw, `data: {"type":"message_delta","usage":{"output_tokens":20}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotNil(t, result.usage)
+	require.Equal(t, 20, result.usage.OutputTokens)
+}
+
+// TestStreamUpstreamResponse_ContextCanceled verifies that when the context is
+// canceled, usage is returned and clientDisconnect is set.
+func TestStreamUpstreamResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestStreamUpstreamResponse_Timeout verifies that the usage collected so far is
+// returned when the upstream times out.
+func TestStreamUpstreamResponse_Timeout(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{StreamDataIntervalTimeout: 1, MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pw.Close()
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect)
+}
+
+// TestStreamUpstreamResponse_TimeoutAfterClientDisconnect verifies that when the upstream
+// times out after the client has disconnected, usage is returned and clientDisconnect is set.
+func TestStreamUpstreamResponse_TimeoutAfterClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{StreamDataIntervalTimeout: 1, MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":5}}}`)
+		fmt.Fprintln(pw, "")
+		// pw is deliberately not closed here → the reader waits for the timeout.
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pw.Close()
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+}
+
+// TestHandleGeminiStreamingResponse_ClientDisconnect verifies that Gemini streaming
+// keeps draining the upstream after the client disconnects.
+func TestHandleGeminiStreamingResponse_ClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":"hi"}]}}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "write_failed")
+}
+
+// TestHandleGeminiStreamingResponse_ContextCanceled verifies that no error event is
+// injected into the response when the context is canceled.
+func TestHandleGeminiStreamingResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestHandleClaudeStreamingResponse_ClientDisconnect verifies that Claude streaming
+// keeps draining the upstream after the client disconnects.
+func TestHandleClaudeStreamingResponse_ClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// v1internal wrapper format.
+		fmt.Fprintln(pw, `data: {"response":{"candidates":[{"content":{"parts":[{"text":"hello"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":8,"candidatesTokenCount":15}}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+}
+
+// TestHandleClaudeStreamingResponse_ContextCanceled verifies that no error event is
+// injected into the response when the context is canceled.
+func TestHandleClaudeStreamingResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestExtractSSEUsage verifies that extractSSEUsage extracts usage from SSE data lines.
+func TestExtractSSEUsage(t *testing.T) {
+	svc := &AntigravityGatewayService{}
+	tests := []struct {
+		name     string
+		line     string
+		expected ClaudeUsage
+	}{
+		{
+			name:     "message_delta with output_tokens",
+			line:     `data: {"type":"message_delta","usage":{"output_tokens":42}}`,
+			expected: ClaudeUsage{OutputTokens: 42},
+		},
+		{
+			name:     "non-data line ignored",
+			line:     `event: message_start`,
+			expected: ClaudeUsage{},
+		},
+		{
+			name:     "top-level usage with all fields",
+			line:     `data: {"usage":{"input_tokens":10,"output_tokens":20,"cache_read_input_tokens":5,"cache_creation_input_tokens":3}}`,
+			expected: ClaudeUsage{InputTokens: 10, OutputTokens: 20, CacheReadInputTokens: 5, CacheCreationInputTokens: 3},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			usage := &ClaudeUsage{}
+			svc.extractSSEUsage(tt.line, usage)
+			require.Equal(t, tt.expected, *usage)
+		})
+	}
+}
+
+// TestAntigravityClientWriter verifies the disconnect detection of antigravityClientWriter.
+func TestAntigravityClientWriter(t *testing.T) {
+	t.Run("normal write succeeds", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(c.Writer, flusher, "test")
+
+		ok := cw.Write([]byte("hello"))
+		require.True(t, ok)
+		require.False(t, cw.Disconnected())
+		require.Contains(t, rec.Body.String(), "hello")
+	})
+
+	t.Run("write failure marks disconnected", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		fw := &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(fw, flusher, "test")
+
+		ok := cw.Write([]byte("hello"))
+		require.False(t, ok)
+		require.True(t, cw.Disconnected())
+	})
+
+	t.Run("subsequent writes are no-op", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		fw := &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(fw, flusher, "test")
+
+		cw.Write([]byte("first"))
+		ok := cw.Fprintf("second %d", 2)
+		require.False(t, ok)
+		require.True(t, cw.Disconnected())
+	})
+}
--- a/backend/internal/service/antigravity_quota_scope.go
+++ b/backend/internal/service/antigravity_quota_scope.go
@@ -2,63 +2,23 @@ package service

 import (
 	"context"
-	"slices"
 	"strings"
 	"time"
 )

-const antigravityQuotaScopesKey = "antigravity_quota_scopes"
-
-// AntigravityQuotaScope 表示 Antigravity 的配额域
-type AntigravityQuotaScope string
-
-const (
-	AntigravityQuotaScopeClaude      AntigravityQuotaScope = "claude"
-	AntigravityQuotaScopeGeminiText  AntigravityQuotaScope = "gemini_text"
-	AntigravityQuotaScopeGeminiImage AntigravityQuotaScope = "gemini_image"
-)
-
-// IsScopeSupported 检查给定的 scope 是否在分组支持的 scope 列表中
-func IsScopeSupported(supportedScopes []string, scope AntigravityQuotaScope) bool {
-	if len(supportedScopes) == 0 {
-		// 未配置时默认全部支持
-		return true
-	}
-	supported := slices.Contains(supportedScopes, string(scope))
-	return supported
-}
-
-// ResolveAntigravityQuotaScope 根据模型名称解析配额域（导出版本）
-func ResolveAntigravityQuotaScope(requestedModel string) (AntigravityQuotaScope, bool) {
-	return resolveAntigravityQuotaScope(requestedModel)
-}
-
-// resolveAntigravityQuotaScope 根据模型名称解析配额域
-func resolveAntigravityQuotaScope(requestedModel string) (AntigravityQuotaScope, bool) {
-	model := normalizeAntigravityModelName(requestedModel)
-	if model == "" {
-		return "", false
-	}
-	switch {
-	case strings.HasPrefix(model, "claude-"):
-		return AntigravityQuotaScopeClaude, true
-	case strings.HasPrefix(model, "gemini-"):
-		if isImageGenerationModel(model) {
-			return AntigravityQuotaScopeGeminiImage, true
-		}
-		return AntigravityQuotaScopeGeminiText, true
-	default:
-		return "", false
-	}
-}
-
 func normalizeAntigravityModelName(model string) string {
 	normalized := strings.ToLower(strings.TrimSpace(model))
 	normalized = strings.TrimPrefix(normalized, "models/")
 	return normalized
 }

-// IsSchedulableForModel 结合 Antigravity 配额域限流判断是否可调度。
+// resolveAntigravityModelKey derives the rate-limit key from the requested model name.
+// An empty string means the model name could not be resolved.
+func resolveAntigravityModelKey(requestedModel string) string {
+	return normalizeAntigravityModelName(requestedModel)
+}
+
+// IsSchedulableForModel 结合模型级限流判断是否可调度。
 // 保持旧签名以兼容既有调用方；默认使用 context.Background()。
 func (a *Account) IsSchedulableForModel(requestedModel string) bool {
 	return a.IsSchedulableForModelWithContext(context.Background(), requestedModel)
@@ -74,107 +34,20 @@ func (a *Account) IsSchedulableForModelWithContext(ctx context.Context, requeste
 	if a.isModelRateLimitedWithContext(ctx, requestedModel) {
 		return false
 	}
-	if a.Platform != PlatformAntigravity {
-		return true
-	}
-	scope, ok := resolveAntigravityQuotaScope(requestedModel)
-	if !ok {
 	return true
-	}
-	resetAt := a.antigravityQuotaScopeResetAt(scope)
-	if resetAt == nil {
-		return true
-	}
-	now := time.Now()
-	return !now.Before(*resetAt)
-}
-
-func (a *Account) antigravityQuotaScopeResetAt(scope AntigravityQuotaScope) *time.Time {
-	if a == nil || a.Extra == nil || scope == "" {
-		return nil
-	}
-	rawScopes, ok := a.Extra[antigravityQuotaScopesKey].(map[string]any)
-	if !ok {
-		return nil
-	}
-	rawScope, ok := rawScopes[string(scope)].(map[string]any)
-	if !ok {
-		return nil
-	}
-	resetAtRaw, ok := rawScope["rate_limit_reset_at"].(string)
-	if !ok || strings.TrimSpace(resetAtRaw) == "" {
-		return nil
-	}
-	resetAt, err := time.Parse(time.RFC3339, resetAtRaw)
-	if err != nil {
-		return nil
-	}
-	return &resetAt
-}
-
-var antigravityAllScopes = []AntigravityQuotaScope{
-	AntigravityQuotaScopeClaude,
-	AntigravityQuotaScopeGeminiText,
-	AntigravityQuotaScopeGeminiImage,
-}
-
-func (a *Account) GetAntigravityScopeRateLimits() map[string]int64 {
-	if a == nil || a.Platform != PlatformAntigravity {
-		return nil
-	}
-	now := time.Now()
-	result := make(map[string]int64)
-	for _, scope := range antigravityAllScopes {
-		resetAt := a.antigravityQuotaScopeResetAt(scope)
-		if resetAt != nil && now.Before(*resetAt) {
-			remainingSec := int64(time.Until(*resetAt).Seconds())
-			if remainingSec > 0 {
-				result[string(scope)] = remainingSec
-			}
-		}
-	}
-	if len(result) == 0 {
-		return nil
-	}
-	return result
 }

-// GetQuotaScopeRateLimitRemainingTime 获取模型域限流剩余时间
-// 返回 0 表示未限流或已过期
-func (a *Account) GetQuotaScopeRateLimitRemainingTime(requestedModel string) time.Duration {
-	if a == nil || a.Platform != PlatformAntigravity {
-		return 0
-	}
-	scope, ok := resolveAntigravityQuotaScope(requestedModel)
-	if !ok {
-		return 0
-	}
-	resetAt := a.antigravityQuotaScopeResetAt(scope)
-	if resetAt == nil {
-		return 0
-	}
-	if remaining := time.Until(*resetAt); remaining > 0 {
-		return remaining
-	}
-	return 0
-}
-
-// GetRateLimitRemainingTime 获取限流剩余时间（模型限流和模型域限流取最大值）
+// GetRateLimitRemainingTime 获取限流剩余时间（模型级限流）
 // 返回 0 表示未限流或已过期
 func (a *Account) GetRateLimitRemainingTime(requestedModel string) time.Duration {
 	return a.GetRateLimitRemainingTimeWithContext(context.Background(), requestedModel)
 }

-// GetRateLimitRemainingTimeWithContext 获取限流剩余时间（模型限流和模型域限流取最大值）
+// GetRateLimitRemainingTimeWithContext 获取限流剩余时间（模型级限流）
 // 返回 0 表示未限流或已过期
 func (a *Account) GetRateLimitRemainingTimeWithContext(ctx context.Context, requestedModel string) time.Duration {
 	if a == nil {
 		return 0
 	}
-	modelRemaining := a.GetModelRateLimitRemainingTimeWithContext(ctx, requestedModel)
-	scopeRemaining := a.GetQuotaScopeRateLimitRemainingTime(requestedModel)
-	if modelRemaining > scopeRemaining {
-		return modelRemaining
-	}
-	return scopeRemaining
+	return a.GetModelRateLimitRemainingTimeWithContext(ctx, requestedModel)
 }
--- a/backend/internal/service/antigravity_rate_limit_test.go
+++ b/backend/internal/service/antigravity_rate_limit_test.go
@@ -59,12 +59,6 @@ func (s *stubAntigravityUpstream) DoWithTLS(req *http.Request, proxyURL string,
 	return s.Do(req, proxyURL, accountID, accountConcurrency)
 }

-type scopeLimitCall struct {
-	accountID int64
-	scope     AntigravityQuotaScope
-	resetAt   time.Time
-}
-
 type rateLimitCall struct {
 	accountID int64
 	resetAt   time.Time
@@ -78,16 +72,10 @@ type modelRateLimitCall struct {

 type stubAntigravityAccountRepo struct {
 	AccountRepository
-	scopeCalls          []scopeLimitCall
 	rateCalls           []rateLimitCall
 	modelRateLimitCalls []modelRateLimitCall
 }

-func (s *stubAntigravityAccountRepo) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error {
-	s.scopeCalls = append(s.scopeCalls, scopeLimitCall{accountID: id, scope: scope, resetAt: resetAt})
-	return nil
-}
-
 func (s *stubAntigravityAccountRepo) SetRateLimited(ctx context.Context, id int64, resetAt time.Time) error {
 	s.rateCalls = append(s.rateCalls, rateLimitCall{accountID: id, resetAt: resetAt})
 	return nil
@@ -131,10 +119,9 @@ func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) {
 		accessToken:    "token",
 		action:         "generateContent",
 		body:           []byte(`{"input":"test"}`),
-		quotaScope:     AntigravityQuotaScopeClaude,
 		httpUpstream:   upstream,
 		requestedModel: "claude-sonnet-4-5",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			handleErrorCalled = true
 			return nil
 		},
@@ -155,23 +142,6 @@ func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) {
 	require.Equal(t, base2, available[0])
 }

-func TestAntigravityHandleUpstreamError_UsesScopeLimit(t *testing.T) {
-	// 分区限流始终开启，不再支持通过环境变量关闭
-	repo := &stubAntigravityAccountRepo{}
-	svc := &AntigravityGatewayService{accountRepo: repo}
-	account := &Account{ID: 9, Name: "acc-9", Platform: PlatformAntigravity}
-
-	body := buildGeminiRateLimitBody("3s")
-	svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
-
-	require.Len(t, repo.scopeCalls, 1)
-	require.Empty(t, repo.rateCalls)
-	call := repo.scopeCalls[0]
-	require.Equal(t, account.ID, call.accountID)
-	require.Equal(t, AntigravityQuotaScopeClaude, call.scope)
-	require.WithinDuration(t, time.Now().Add(3*time.Second), call.resetAt, 2*time.Second)
-}
-
 // TestHandleUpstreamError_429_ModelRateLimit 测试 429 模型限流场景
 func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 	repo := &stubAntigravityAccountRepo{}
@@ -189,7 +159,7 @@ func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, "claude-sonnet-4-5", 0, "", false)

 	// 应该触发模型限流
 	require.NotNil(t, result)
@@ -200,22 +170,22 @@ func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 	require.Equal(t, "claude-sonnet-4-5", repo.modelRateLimitCalls[0].modelKey)
 }

-// TestHandleUpstreamError_429_NonModelRateLimit 测试 429 非模型限流场景（走 scope 限流）
+// TestHandleUpstreamError_429_NonModelRateLimit tests the 429 non-model rate-limit scenario (falls back to model-level rate limiting).
 func TestHandleUpstreamError_429_NonModelRateLimit(t *testing.T) {
 	repo := &stubAntigravityAccountRepo{}
 	svc := &AntigravityGatewayService{accountRepo: repo}
 	account := &Account{ID: 2, Name: "acc-2", Platform: PlatformAntigravity}

-	// 429 + 普通限流响应（无 RATE_LIMIT_EXCEEDED reason）→ scope 限流
+	// 429 + ordinary rate-limit response (no RATE_LIMIT_EXCEEDED reason) → falls back to model-level rate limiting
 	body := buildGeminiRateLimitBody("5s")

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, "claude-sonnet-4-5", 0, "", false)

-	// 不应该触发模型限流，应该走 scope 限流
+	// handleModelRateLimit does not handle this response (there is no RATE_LIMIT_EXCEEDED),
+	// but the 429 fallback logic uses requestedModel to set a model-level rate limit
 	require.Nil(t, result)
-	require.Empty(t, repo.modelRateLimitCalls)
-	require.Len(t, repo.scopeCalls, 1)
-	require.Equal(t, AntigravityQuotaScopeClaude, repo.scopeCalls[0].scope)
+	require.Len(t, repo.modelRateLimitCalls, 1)
+	require.Equal(t, "claude-sonnet-4-5", repo.modelRateLimitCalls[0].modelKey)
 }

 // TestHandleUpstreamError_503_ModelRateLimit 测试 503 模型限流场景
@@ -235,7 +205,7 @@ func TestHandleUpstreamError_503_ModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 应该触发模型限流
 	require.NotNil(t, result)
@@ -263,12 +233,11 @@ func TestHandleUpstreamError_503_NonModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 503 非模型限流不应该做任何处理
 	require.Nil(t, result)
 	require.Empty(t, repo.modelRateLimitCalls, "503 non-model rate limit should not trigger model rate limit")
-	require.Empty(t, repo.scopeCalls, "503 non-model rate limit should not trigger scope rate limit")
 	require.Empty(t, repo.rateCalls, "503 non-model rate limit should not trigger account rate limit")
 }

@@ -281,12 +250,11 @@ func TestHandleUpstreamError_503_EmptyBody(t *testing.T) {
 	// 503 + 空响应体 → 不做任何处理
 	body := []byte(`{}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 503 空响应不应该做任何处理
 	require.Nil(t, result)
 	require.Empty(t, repo.modelRateLimitCalls)
-	require.Empty(t, repo.scopeCalls)
 	require.Empty(t, repo.rateCalls)
 }

@@ -307,15 +275,7 @@ func TestAccountIsSchedulableForModel_AntigravityRateLimits(t *testing.T) {
 	require.False(t, account.IsSchedulableForModel("gemini-3-flash"))

 	account.RateLimitResetAt = nil
-	account.Extra = map[string]any{
-		antigravityQuotaScopesKey: map[string]any{
-			"claude": map[string]any{
-				"rate_limit_reset_at": future.Format(time.RFC3339),
-			},
-		},
-	}
-
-	require.False(t, account.IsSchedulableForModel("claude-sonnet-4-5"))
+	require.True(t, account.IsSchedulableForModel("claude-sonnet-4-5"))
 	require.True(t, account.IsSchedulableForModel("gemini-3-flash"))
 }

@@ -635,6 +595,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 7 * time.Second,
 			modelName:               "gemini-pro",
 		},
 		{
@@ -652,6 +613,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 39 * time.Second,
 			modelName:               "gemini-3-pro-high",
 		},
 		{
@@ -669,6 +631,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 30 * time.Second,
 			modelName:               "gemini-2.5-flash",
 		},
 		{
@@ -686,6 +649,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 30 * time.Second,
 			modelName:               "claude-sonnet-4-5",
 		},
 	}
@@ -704,6 +668,11 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 					t.Errorf("wait = %v, want >= %v", wait, tt.minWait)
 				}
 			}
+			if shouldRateLimit && tt.minWait > 0 {
+				if wait < tt.minWait {
+					t.Errorf("rate limit wait = %v, want >= %v", wait, tt.minWait)
+				}
+			}
 			if (shouldRetry || shouldRateLimit) && model != tt.modelName {
 				t.Errorf("modelName = %q, want %q", model, tt.modelName)
 			}
@@ -832,7 +801,7 @@ func TestAntigravityRetryLoop_PreCheck_SwitchesWhenRateLimited(t *testing.T) {
 		requestedModel:  "claude-sonnet-4-5",
 		httpUpstream:    upstream,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})
@@ -875,7 +844,7 @@ func TestAntigravityRetryLoop_PreCheck_SwitchesWhenRemainingLong(t *testing.T) {
 		requestedModel:  "claude-sonnet-4-5",
 		httpUpstream:    upstream,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})

--- a/backend/internal/service/antigravity_smart_retry_test.go
+++ b/backend/internal/service/antigravity_smart_retry_test.go
@@ -75,7 +75,7 @@ func TestHandleSmartRetry_URLLevelRateLimit(t *testing.T) {
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -127,7 +127,7 @@ func TestHandleSmartRetry_LongDelay_ReturnsSwitchError(t *testing.T) {
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -194,7 +194,7 @@ func TestHandleSmartRetry_ShortDelay_SmartRetrySuccess(t *testing.T) {
 		action:       "generateContent",
 		body:         []byte(`{"input":"test"}`),
 		httpUpstream: upstream,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -269,7 +269,7 @@ func TestHandleSmartRetry_ShortDelay_SmartRetryFailed_ReturnsSwitchError(t *test
 		httpUpstream:    upstream,
 		accountRepo:     repo,
 		isStickySession: false,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -331,7 +331,7 @@ func TestHandleSmartRetry_503_ModelCapacityExhausted_ReturnsSwitchError(t *testi
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -387,7 +387,7 @@ func TestHandleSmartRetry_NonAntigravityAccount_ContinuesDefaultLogic(t *testing
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -436,7 +436,7 @@ func TestHandleSmartRetry_NonModelRateLimit_ContinuesDefaultLogic(t *testing.T)
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -487,7 +487,7 @@ func TestHandleSmartRetry_ExactlyAtThreshold_ReturnsSwitchError(t *testing.T) {
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
 		accountRepo: repo,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -548,7 +548,7 @@ func TestAntigravityRetryLoop_HandleSmartRetry_SwitchError_Propagates(t *testing
 		httpUpstream:    upstream,
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})
@@ -604,7 +604,7 @@ func TestHandleSmartRetry_NetworkError_ExhaustsRetry(t *testing.T) {
 		body:         []byte(`{"input":"test"}`),
 		httpUpstream: upstream,
 		accountRepo:  repo,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -662,7 +662,7 @@ func TestHandleSmartRetry_NoRetryDelay_UsesDefaultRateLimit(t *testing.T) {
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -754,7 +754,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_FailedRetry_ClearsSession(t *
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-abc",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -842,7 +842,7 @@ func TestHandleSmartRetry_ShortDelay_NonStickySession_FailedRetry_NoDeleteSessio
 		isStickySession: false,
 		groupID:         42,
 		sessionHash:     "", // 非粘性会话，sessionHash 为空
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -918,7 +918,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_FailedRetry_NilCache_NoPanic(
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-nil-cache",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -983,7 +983,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_SuccessRetry_NoDeleteSession(
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-success",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1043,7 +1043,7 @@ func TestHandleSmartRetry_LongDelay_StickySession_NoDeleteInHandleSmartRetry(t *
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-long-delay",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1108,7 +1108,7 @@ func TestHandleSmartRetry_ShortDelay_NetworkError_StickySession_ClearsSession(t
 		isStickySession: true,
 		groupID:         99,
 		sessionHash:     "sticky-net-error",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1188,7 +1188,7 @@ func TestHandleSmartRetry_ShortDelay_503_StickySession_FailedRetry_ClearsSession
 		isStickySession: true,
 		groupID:         77,
 		sessionHash:     "sticky-503-short",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1278,7 +1278,7 @@ func TestAntigravityRetryLoop_SmartRetryFailed_StickySession_SwitchErrorPropagat
 		isStickySession: true,
 		groupID:         55,
 		sessionHash:     "sticky-loop-test",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})