Merge pull request #523 from touwaeriol/feat/antigravity-improvements

feat: Antigravity improvements and scope-to-model rate limiting refactor

Merge pull request #523 from touwaeriol/feat/antigravity-improvements
feat: Antigravity improvements and scope-to-model rate limiting refactor
149e4267 · Wesley Liddick · GitHub · 5fa93ebd · 9a479d1b · 149e4267
Unverified Commit 149e4267 authored Feb 09, 2026 by Wesley Liddick Committed by GitHub Feb 09, 2026
--- a/backend/cmd/server/wire_gen.go
+++ b/backend/cmd/server/wire_gen.go
@@ -154,7 +154,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
 	identityService := service.NewIdentityService(identityCache)
 	deferredService := service.ProvideDeferredService(accountRepository, timingWheelService)
 	claudeTokenProvider := service.NewClaudeTokenProvider(accountRepository, geminiTokenCache, oAuthService)
-	gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, userGroupRateRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService, claudeTokenProvider, sessionLimitCache)
+	digestSessionStore := service.NewDigestSessionStore()
+	gatewayService := service.NewGatewayService(accountRepository, groupRepository, usageLogRepository, userRepository, userSubscriptionRepository, userGroupRateRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, identityService, httpUpstream, deferredService, claudeTokenProvider, sessionLimitCache, digestSessionStore)
 	openAITokenProvider := service.NewOpenAITokenProvider(accountRepository, geminiTokenCache, openAIOAuthService)
 	openAIGatewayService := service.NewOpenAIGatewayService(accountRepository, usageLogRepository, userRepository, userSubscriptionRepository, gatewayCache, configConfig, schedulerSnapshotService, concurrencyService, billingService, rateLimitService, billingCacheService, httpUpstream, deferredService, openAITokenProvider)
 	geminiMessagesCompatService := service.NewGeminiMessagesCompatService(accountRepository, groupRepository, gatewayCache, schedulerSnapshotService, geminiTokenProvider, rateLimitService, httpUpstream, antigravityGatewayService, configConfig)

--- a/backend/go.mod
+++ b/backend/go.mod
@@ -103,6 +103,7 @@ require (
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
+	github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect

--- a/backend/go.sum
+++ b/backend/go.sum
@@ -213,6 +213,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
 github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=

--- a/backend/internal/handler/dto/types.go
+++ b/backend/internal/handler/dto/types.go
@@ -2,11 +2,6 @@ package dto

 import "time"

-type ScopeRateLimitInfo struct {
-	ResetAt      time.Time `json:"reset_at"`
-	RemainingSec int64     `json:"remaining_sec"`
-}
-
 type User struct {
 	ID            int64     `json:"id"`
 	Email         string    `json:"email"`
@@ -129,9 +124,6 @@ type Account struct {
 	RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
 	OverloadUntil    *time.Time `json:"overload_until"`

-	// Antigravity scope 级限流状态（从 extra 提取）
-	ScopeRateLimits map[string]ScopeRateLimitInfo `json:"scope_rate_limits,omitempty"`
-
 	TempUnschedulableUntil  *time.Time `json:"temp_unschedulable_until"`
 	TempUnschedulableReason string     `json:"temp_unschedulable_reason"`


--- a/backend/internal/handler/gateway_handler.go
+++ b/backend/internal/handler/gateway_handler.go
@@ -13,6 +13,7 @@ import (
 	"time"

 	"github.com/Wei-Shaw/sub2api/internal/config"
+	"github.com/Wei-Shaw/sub2api/internal/domain"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/claude"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
@@ -114,7 +115,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {

 	setOpsRequestContext(c, "", false, body)

-	parsedReq, err := service.ParseGatewayRequest(body)
+	parsedReq, err := service.ParseGatewayRequest(body, domain.PlatformAnthropic)
 	if err != nil {
 		h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
 		return
@@ -203,6 +204,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 	}

 	// 计算粘性会话hash
+	parsedReq.SessionContext = &service.SessionContext{
+		ClientIP:  ip.GetClientIP(c),
+		UserAgent: c.GetHeader("User-Agent"),
+		APIKeyID:  apiKey.ID,
+	}
 	sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)

 	// 获取平台：优先使用强制平台（/antigravity 路由，中间件已设置 request.Context），否则使用分组平台
@@ -335,7 +341,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 				if errors.As(err, &failoverErr) {
 					failedAccountIDs[account.ID] = struct{}{}
 					lastFailoverErr = failoverErr
-					if failoverErr.ForceCacheBilling {
+					if needForceCacheBilling(hasBoundSession, failoverErr) {
 						forceCacheBilling = true
 					}
 					if switchCount >= maxAccountSwitches {
@@ -344,6 +350,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 					}
 					switchCount++
 					log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+					if account.Platform == service.PlatformAntigravity {
+						if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+							return
+						}
+					}
 					continue
 				}
 				// 错误响应已在Forward中处理，这里只记录日志
@@ -530,7 +541,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 				if errors.As(err, &failoverErr) {
 					failedAccountIDs[account.ID] = struct{}{}
 					lastFailoverErr = failoverErr
-					if failoverErr.ForceCacheBilling {
+					if needForceCacheBilling(hasBoundSession, failoverErr) {
 						forceCacheBilling = true
 					}
 					if switchCount >= maxAccountSwitches {
@@ -539,6 +550,11 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
 					}
 					switchCount++
 					log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+					if account.Platform == service.PlatformAntigravity {
+						if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+							return
+						}
+					}
 					continue
 				}
 				// 错误响应已在Forward中处理，这里只记录日志
@@ -801,6 +817,27 @@ func (h *GatewayHandler) handleConcurrencyError(c *gin.Context, err error, slotT
 		fmt.Sprintf("Concurrency limit exceeded for %s, please retry later", slotType), streamStarted)
 }

+// needForceCacheBilling 判断 failover 时是否需要强制缓存计费
+// 粘性会话切换账号、或上游明确标记时，将 input_tokens 转为 cache_read 计费
+func needForceCacheBilling(hasBoundSession bool, failoverErr *service.UpstreamFailoverError) bool {
+	return hasBoundSession || (failoverErr != nil && failoverErr.ForceCacheBilling)
+}
+
+// sleepFailoverDelay 账号切换线性递增延时：第1次0s、第2次1s、第3次2s…
+// 返回 false 表示 context 已取消。
+func sleepFailoverDelay(ctx context.Context, switchCount int) bool {
+	delay := time.Duration(switchCount-1) * time.Second
+	if delay <= 0 {
+		return true
+	}
+	select {
+	case <-ctx.Done():
+		return false
+	case <-time.After(delay):
+		return true
+	}
+}
+
 func (h *GatewayHandler) handleFailoverExhausted(c *gin.Context, failoverErr *service.UpstreamFailoverError, platform string, streamStarted bool) {
 	statusCode := failoverErr.StatusCode
 	responseBody := failoverErr.ResponseBody
@@ -934,7 +971,7 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {

 	setOpsRequestContext(c, "", false, body)

-	parsedReq, err := service.ParseGatewayRequest(body)
+	parsedReq, err := service.ParseGatewayRequest(body, domain.PlatformAnthropic)
 	if err != nil {
 		h.errorResponse(c, http.StatusBadRequest, "invalid_request_error", "Failed to parse request body")
 		return
@@ -962,6 +999,11 @@ func (h *GatewayHandler) CountTokens(c *gin.Context) {
 	}

 	// 计算粘性会话 hash
+	parsedReq.SessionContext = &service.SessionContext{
+		ClientIP:  ip.GetClientIP(c),
+		UserAgent: c.GetHeader("User-Agent"),
+		APIKeyID:  apiKey.ID,
+	}
 	sessionHash := h.gatewayService.GenerateSessionHash(parsedReq)

 	// 选择支持该模型的账号

--- a/backend/internal/handler/gemini_v1beta_handler.go
+++ b/backend/internal/handler/gemini_v1beta_handler.go
@@ -14,6 +14,7 @@ import (
 	"strings"
 	"time"

+	"github.com/Wei-Shaw/sub2api/internal/domain"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/gemini"
@@ -30,13 +31,6 @@ import (
 // 匹配格式: /Users/xxx/.gemini/tmp/[64位十六进制哈希]
 var geminiCLITmpDirRegex = regexp.MustCompile(`/\.gemini/tmp/([A-Fa-f0-9]{64})`)

-func isGeminiCLIRequest(c *gin.Context, body []byte) bool {
-	if strings.TrimSpace(c.GetHeader("x-gemini-api-privileged-user-id")) != "" {
-		return true
-	}
-	return geminiCLITmpDirRegex.Match(body)
-}
-
 // GeminiV1BetaListModels proxies:
 // GET /v1beta/models
 func (h *GatewayHandler) GeminiV1BetaListModels(c *gin.Context) {
@@ -239,7 +233,14 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 	sessionHash := extractGeminiCLISessionHash(c, body)
 	if sessionHash == "" {
 		// Fallback: 使用通用的会话哈希生成逻辑（适用于其他客户端）
-		parsedReq, _ := service.ParseGatewayRequest(body)
+		parsedReq, _ := service.ParseGatewayRequest(body, domain.PlatformGemini)
+		if parsedReq != nil {
+			parsedReq.SessionContext = &service.SessionContext{
+				ClientIP:  ip.GetClientIP(c),
+				UserAgent: c.GetHeader("User-Agent"),
+				APIKeyID:  apiKey.ID,
+			}
+		}
 		sessionHash = h.gatewayService.GenerateSessionHash(parsedReq)
 	}
 	sessionKey := sessionHash
@@ -258,6 +259,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 	var geminiDigestChain string
 	var geminiPrefixHash string
 	var geminiSessionUUID string
+	var matchedDigestChain string
 	useDigestFallback := sessionBoundAccountID == 0

 	if useDigestFallback {
@@ -284,13 +286,14 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				)

 				// 查找会话
-				foundUUID, foundAccountID, found := h.gatewayService.FindGeminiSession(
+				foundUUID, foundAccountID, foundMatchedChain, found := h.gatewayService.FindGeminiSession(
 					c.Request.Context(),
 					derefGroupID(apiKey.GroupID),
 					geminiPrefixHash,
 					geminiDigestChain,
 				)
 				if found {
+					matchedDigestChain = foundMatchedChain
 					sessionBoundAccountID = foundAccountID
 					geminiSessionUUID = foundUUID
 					log.Printf("[Gemini] Digest fallback matched: uuid=%s, accountID=%d, chain=%s",
@@ -316,7 +319,6 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {

 	// 判断是否真的绑定了粘性会话：有 sessionKey 且已经绑定到某个账号
 	hasBoundSession := sessionKey != "" && sessionBoundAccountID > 0
-	isCLI := isGeminiCLIRequest(c, body)
 	cleanedForUnknownBinding := false

 	maxAccountSwitches := h.maxAccountSwitchesGemini
@@ -344,10 +346,10 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 			log.Printf("[Gemini] Sticky session account switched: %d -> %d, cleaning thoughtSignature", sessionBoundAccountID, account.ID)
 			body = service.CleanGeminiNativeThoughtSignatures(body)
 			sessionBoundAccountID = account.ID
-		} else if sessionKey != "" && sessionBoundAccountID == 0 && isCLI && !cleanedForUnknownBinding && bytes.Contains(body, []byte(`"thoughtSignature"`)) {
-			// 无缓存绑定但请求里已有 thoughtSignature：常见于缓存丢失/TTL 过期后，CLI 继续携带旧签名。
+		} else if sessionKey != "" && sessionBoundAccountID == 0 && !cleanedForUnknownBinding && bytes.Contains(body, []byte(`"thoughtSignature"`)) {
+			// 无缓存绑定但请求里已有 thoughtSignature：常见于缓存丢失/TTL 过期后，客户端继续携带旧签名。
 			// 为避免第一次转发就 400，这里做一次确定性清理，让新账号重新生成签名链路。
-			log.Printf("[Gemini] Sticky session binding missing for CLI request, cleaning thoughtSignature proactively")
+			log.Printf("[Gemini] Sticky session binding missing, cleaning thoughtSignature proactively")
 			body = service.CleanGeminiNativeThoughtSignatures(body)
 			cleanedForUnknownBinding = true
 			sessionBoundAccountID = account.ID
@@ -422,7 +424,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 			var failoverErr *service.UpstreamFailoverError
 			if errors.As(err, &failoverErr) {
 				failedAccountIDs[account.ID] = struct{}{}
-				if failoverErr.ForceCacheBilling {
+				if needForceCacheBilling(hasBoundSession, failoverErr) {
 					forceCacheBilling = true
 				}
 				if switchCount >= maxAccountSwitches {
@@ -433,6 +435,11 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				lastFailoverErr = failoverErr
 				switchCount++
 				log.Printf("Gemini account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
+				if account.Platform == service.PlatformAntigravity {
+					if !sleepFailoverDelay(c.Request.Context(), switchCount) {
+						return
+					}
+				}
 				continue
 			}
 			// ForwardNative already wrote the response
@@ -453,6 +460,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
 				geminiDigestChain,
 				geminiSessionUUID,
 				account.ID,
+				matchedDigestChain,
 			); err != nil {
 				log.Printf("[Gemini] Failed to save digest session: %v", err)
 			}

--- a/backend/internal/repository/account_repo.go
+++ b/backend/internal/repository/account_repo.go
@@ -798,53 +798,6 @@ func (r *accountRepository) SetRateLimited(ctx context.Context, id int64, resetA
 	return nil
 }

-func (r *accountRepository) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope service.AntigravityQuotaScope, resetAt time.Time) error {
-	now := time.Now().UTC()
-	payload := map[string]string{
-		"rate_limited_at":     now.Format(time.RFC3339),
-		"rate_limit_reset_at": resetAt.UTC().Format(time.RFC3339),
-	}
-	raw, err := json.Marshal(payload)
-	if err != nil {
-		return err
-	}
-
-	scopeKey := string(scope)
-	client := clientFromContext(ctx, r.client)
-	result, err := client.ExecContext(
-		ctx,
-		`UPDATE accounts SET
-			extra = jsonb_set(
-				jsonb_set(COALESCE(extra, '{}'::jsonb), '{antigravity_quota_scopes}'::text[], COALESCE(extra->'antigravity_quota_scopes', '{}'::jsonb), true),
-				ARRAY['antigravity_quota_scopes', $1]::text[],
-				$2::jsonb,
-				true
-			),
-			updated_at = NOW(),
-			last_used_at = NOW()
-		WHERE id = $3 AND deleted_at IS NULL`,
-		scopeKey,
-		raw,
-		id,
-	)
-	if err != nil {
-		return err
-	}
-
-	affected, err := result.RowsAffected()
-	if err != nil {
-		return err
-	}
-	if affected == 0 {
-		return service.ErrAccountNotFound
-	}
-
-	if err := enqueueSchedulerOutbox(ctx, r.sql, service.SchedulerOutboxEventAccountChanged, &id, nil, nil); err != nil {
-		log.Printf("[SchedulerOutbox] enqueue quota scope failed: account=%d err=%v", id, err)
-	}
-	return nil
-}
-
 func (r *accountRepository) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	if scope == "" {
 		return nil

--- a/backend/internal/repository/gateway_cache.go
+++ b/backend/internal/repository/gateway_cache.go
@@ -11,63 +11,6 @@ import (

 const stickySessionPrefix = "sticky_session:"

-// Gemini Trie Lua 脚本
-const (
-	// geminiTrieFindScript 查找最长前缀匹配的 Lua 脚本
-	// KEYS[1] = trie key
-	// ARGV[1] = digestChain (如 "u:a-m:b-u:c-m:d")
-	// ARGV[2] = TTL seconds (用于刷新)
-	// 返回: 最长匹配的 value (uuid:accountID) 或 nil
-	// 查找成功时自动刷新 TTL，防止活跃会话意外过期
-	geminiTrieFindScript = `
-local chain = ARGV[1]
-local ttl = tonumber(ARGV[2])
-local lastMatch = nil
-local path = ""
-
-for part in string.gmatch(chain, "[^-]+") do
-    path = path == "" and part or path .. "-" .. part
-    local val = redis.call('HGET', KEYS[1], path)
-    if val and val ~= "" then
-        lastMatch = val
-    end
-end
-
-if lastMatch then
-    redis.call('EXPIRE', KEYS[1], ttl)
-end
-
-return lastMatch
-`
-
-	// geminiTrieSaveScript 保存会话到 Trie 的 Lua 脚本
-	// KEYS[1] = trie key
-	// ARGV[1] = digestChain
-	// ARGV[2] = value (uuid:accountID)
-	// ARGV[3] = TTL seconds
-	geminiTrieSaveScript = `
-local chain = ARGV[1]
-local value = ARGV[2]
-local ttl = tonumber(ARGV[3])
-local path = ""
-
-for part in string.gmatch(chain, "[^-]+") do
-    path = path == "" and part or path .. "-" .. part
-end
-redis.call('HSET', KEYS[1], path, value)
-redis.call('EXPIRE', KEYS[1], ttl)
-return "OK"
-`
-)
-
-// 模型负载统计相关常量
-const (
-	modelLoadKeyPrefix     = "ag:model_load:"      // 模型调用次数 key 前缀
-	modelLastUsedKeyPrefix = "ag:model_last_used:" // 模型最后调度时间 key 前缀
-	modelLoadTTL           = 24 * time.Hour        // 调用次数 TTL（24 小时无调用后清零）
-	modelLastUsedTTL       = 24 * time.Hour        // 最后调度时间 TTL
-)
-
 type gatewayCache struct {
 	rdb *redis.Client
 }
@@ -108,171 +51,3 @@ func (c *gatewayCache) DeleteSessionAccountID(ctx context.Context, groupID int64
 	key := buildSessionKey(groupID, sessionHash)
 	return c.rdb.Del(ctx, key).Err()
 }
-
-// ============ Antigravity 模型负载统计方法 ============
-
-// modelLoadKey 构建模型调用次数 key
-// 格式: ag:model_load:{accountID}:{model}
-func modelLoadKey(accountID int64, model string) string {
-	return fmt.Sprintf("%s%d:%s", modelLoadKeyPrefix, accountID, model)
-}
-
-// modelLastUsedKey 构建模型最后调度时间 key
-// 格式: ag:model_last_used:{accountID}:{model}
-func modelLastUsedKey(accountID int64, model string) string {
-	return fmt.Sprintf("%s%d:%s", modelLastUsedKeyPrefix, accountID, model)
-}
-
-// IncrModelCallCount 增加模型调用次数并更新最后调度时间
-// 返回更新后的调用次数
-func (c *gatewayCache) IncrModelCallCount(ctx context.Context, accountID int64, model string) (int64, error) {
-	loadKey := modelLoadKey(accountID, model)
-	lastUsedKey := modelLastUsedKey(accountID, model)
-
-	pipe := c.rdb.Pipeline()
-	incrCmd := pipe.Incr(ctx, loadKey)
-	pipe.Expire(ctx, loadKey, modelLoadTTL) // 每次调用刷新 TTL
-	pipe.Set(ctx, lastUsedKey, time.Now().Unix(), modelLastUsedTTL)
-	if _, err := pipe.Exec(ctx); err != nil {
-		return 0, err
-	}
-	return incrCmd.Val(), nil
-}
-
-// GetModelLoadBatch 批量获取账号的模型负载信息
-func (c *gatewayCache) GetModelLoadBatch(ctx context.Context, accountIDs []int64, model string) (map[int64]*service.ModelLoadInfo, error) {
-	if len(accountIDs) == 0 {
-		return make(map[int64]*service.ModelLoadInfo), nil
-	}
-
-	loadCmds, lastUsedCmds := c.pipelineModelLoadGet(ctx, accountIDs, model)
-	return c.parseModelLoadResults(accountIDs, loadCmds, lastUsedCmds), nil
-}
-
-// pipelineModelLoadGet 批量获取模型负载的 Pipeline 操作
-func (c *gatewayCache) pipelineModelLoadGet(
-	ctx context.Context,
-	accountIDs []int64,
-	model string,
-) (map[int64]*redis.StringCmd, map[int64]*redis.StringCmd) {
-	pipe := c.rdb.Pipeline()
-	loadCmds := make(map[int64]*redis.StringCmd, len(accountIDs))
-	lastUsedCmds := make(map[int64]*redis.StringCmd, len(accountIDs))
-
-	for _, id := range accountIDs {
-		loadCmds[id] = pipe.Get(ctx, modelLoadKey(id, model))
-		lastUsedCmds[id] = pipe.Get(ctx, modelLastUsedKey(id, model))
-	}
-	_, _ = pipe.Exec(ctx) // 忽略错误，key 不存在是正常的
-	return loadCmds, lastUsedCmds
-}
-
-// parseModelLoadResults 解析 Pipeline 结果
-func (c *gatewayCache) parseModelLoadResults(
-	accountIDs []int64,
-	loadCmds map[int64]*redis.StringCmd,
-	lastUsedCmds map[int64]*redis.StringCmd,
-) map[int64]*service.ModelLoadInfo {
-	result := make(map[int64]*service.ModelLoadInfo, len(accountIDs))
-	for _, id := range accountIDs {
-		result[id] = &service.ModelLoadInfo{
-			CallCount:  getInt64OrZero(loadCmds[id]),
-			LastUsedAt: getTimeOrZero(lastUsedCmds[id]),
-		}
-	}
-	return result
-}
-
-// getInt64OrZero 从 StringCmd 获取 int64 值，失败返回 0
-func getInt64OrZero(cmd *redis.StringCmd) int64 {
-	val, _ := cmd.Int64()
-	return val
-}
-
-// getTimeOrZero 从 StringCmd 获取 time.Time，失败返回零值
-func getTimeOrZero(cmd *redis.StringCmd) time.Time {
-	val, err := cmd.Int64()
-	if err != nil {
-		return time.Time{}
-	}
-	return time.Unix(val, 0)
-}
-
-// ============ Gemini 会话 Fallback 方法 (Trie 实现) ============
-
-// FindGeminiSession 查找 Gemini 会话（使用 Trie + Lua 脚本实现 O(L) 查询）
-// 返回最长匹配的会话信息，匹配成功时自动刷新 TTL
-func (c *gatewayCache) FindGeminiSession(ctx context.Context, groupID int64, prefixHash, digestChain string) (uuid string, accountID int64, found bool) {
-	if digestChain == "" {
-		return "", 0, false
-	}
-
-	trieKey := service.BuildGeminiTrieKey(groupID, prefixHash)
-	ttlSeconds := int(service.GeminiSessionTTL().Seconds())
-
-	// 使用 Lua 脚本在 Redis 端执行 Trie 查找，O(L) 次 HGET，1 次网络往返
-	// 查找成功时自动刷新 TTL，防止活跃会话意外过期
-	result, err := c.rdb.Eval(ctx, geminiTrieFindScript, []string{trieKey}, digestChain, ttlSeconds).Result()
-	if err != nil || result == nil {
-		return "", 0, false
-	}
-
-	value, ok := result.(string)
-	if !ok || value == "" {
-		return "", 0, false
-	}
-
-	uuid, accountID, ok = service.ParseGeminiSessionValue(value)
-	return uuid, accountID, ok
-}
-
-// SaveGeminiSession 保存 Gemini 会话（使用 Trie + Lua 脚本）
-func (c *gatewayCache) SaveGeminiSession(ctx context.Context, groupID int64, prefixHash, digestChain, uuid string, accountID int64) error {
-	if digestChain == "" {
-		return nil
-	}
-
-	trieKey := service.BuildGeminiTrieKey(groupID, prefixHash)
-	value := service.FormatGeminiSessionValue(uuid, accountID)
-	ttlSeconds := int(service.GeminiSessionTTL().Seconds())
-
-	return c.rdb.Eval(ctx, geminiTrieSaveScript, []string{trieKey}, digestChain, value, ttlSeconds).Err()
-}
-
-// ============ Anthropic 会话 Fallback 方法 (复用 Trie 实现) ============
-
-// FindAnthropicSession 查找 Anthropic 会话（复用 Gemini Trie Lua 脚本）
-func (c *gatewayCache) FindAnthropicSession(ctx context.Context, groupID int64, prefixHash, digestChain string) (uuid string, accountID int64, found bool) {
-	if digestChain == "" {
-		return "", 0, false
-	}
-
-	trieKey := service.BuildAnthropicTrieKey(groupID, prefixHash)
-	ttlSeconds := int(service.AnthropicSessionTTL().Seconds())
-
-	result, err := c.rdb.Eval(ctx, geminiTrieFindScript, []string{trieKey}, digestChain, ttlSeconds).Result()
-	if err != nil || result == nil {
-		return "", 0, false
-	}
-
-	value, ok := result.(string)
-	if !ok || value == "" {
-		return "", 0, false
-	}
-
-	uuid, accountID, ok = service.ParseGeminiSessionValue(value)
-	return uuid, accountID, ok
-}
-
-// SaveAnthropicSession 保存 Anthropic 会话（复用 Gemini Trie Lua 脚本）
-func (c *gatewayCache) SaveAnthropicSession(ctx context.Context, groupID int64, prefixHash, digestChain, uuid string, accountID int64) error {
-	if digestChain == "" {
-		return nil
-	}
-
-	trieKey := service.BuildAnthropicTrieKey(groupID, prefixHash)
-	value := service.FormatGeminiSessionValue(uuid, accountID)
-	ttlSeconds := int(service.AnthropicSessionTTL().Seconds())
-
-	return c.rdb.Eval(ctx, geminiTrieSaveScript, []string{trieKey}, digestChain, value, ttlSeconds).Err()
-}
--- a/backend/internal/repository/gateway_cache_integration_test.go
+++ b/backend/internal/repository/gateway_cache_integration_test.go
@@ -104,157 +104,6 @@ func (s *GatewayCacheSuite) TestGetSessionAccountID_CorruptedValue() {
 	require.False(s.T(), errors.Is(err, redis.Nil), "expected parsing error, not redis.Nil")
 }

-// ============ Gemini Trie 会话测试 ============
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_SaveAndFind() {
-	groupID := int64(1)
-	prefixHash := "testprefix"
-	digestChain := "u:hash1-m:hash2-u:hash3"
-	uuid := "test-uuid-123"
-	accountID := int64(42)
-
-	// 保存会话
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, digestChain, uuid, accountID)
-	require.NoError(s.T(), err, "SaveGeminiSession")
-
-	// 精确匹配查找
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, digestChain)
-	require.True(s.T(), found, "should find exact match")
-	require.Equal(s.T(), uuid, foundUUID)
-	require.Equal(s.T(), accountID, foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_PrefixMatch() {
-	groupID := int64(1)
-	prefixHash := "prefixmatch"
-	shortChain := "u:a-m:b"
-	longChain := "u:a-m:b-u:c-m:d"
-	uuid := "uuid-prefix"
-	accountID := int64(100)
-
-	// 保存短链
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, shortChain, uuid, accountID)
-	require.NoError(s.T(), err)
-
-	// 用长链查找，应该匹配到短链（前缀匹配）
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, longChain)
-	require.True(s.T(), found, "should find prefix match")
-	require.Equal(s.T(), uuid, foundUUID)
-	require.Equal(s.T(), accountID, foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_LongestPrefixMatch() {
-	groupID := int64(1)
-	prefixHash := "longestmatch"
-
-	// 保存多个不同长度的链
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a", "uuid-short", 1)
-	require.NoError(s.T(), err)
-	err = s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b", "uuid-medium", 2)
-	require.NoError(s.T(), err)
-	err = s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:c", "uuid-long", 3)
-	require.NoError(s.T(), err)
-
-	// 查找更长的链，应该匹配到最长的前缀
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:c-m:d-u:e")
-	require.True(s.T(), found, "should find longest prefix match")
-	require.Equal(s.T(), "uuid-long", foundUUID)
-	require.Equal(s.T(), int64(3), foundAccountID)
-
-	// 查找中等长度的链
-	foundUUID, foundAccountID, found = s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:a-m:b-u:x")
-	require.True(s.T(), found)
-	require.Equal(s.T(), "uuid-medium", foundUUID)
-	require.Equal(s.T(), int64(2), foundAccountID)
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_NoMatch() {
-	groupID := int64(1)
-	prefixHash := "nomatch"
-	digestChain := "u:a-m:b"
-
-	// 保存一个会话
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, digestChain, "uuid", 1)
-	require.NoError(s.T(), err)
-
-	// 用不同的链查找，应该找不到
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:x-m:y")
-	require.False(s.T(), found, "should not find non-matching chain")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_DifferentPrefixHash() {
-	groupID := int64(1)
-	digestChain := "u:a-m:b"
-
-	// 保存到 prefixHash1
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, "prefix1", digestChain, "uuid1", 1)
-	require.NoError(s.T(), err)
-
-	// 用 prefixHash2 查找，应该找不到（不同用户/客户端隔离）
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, "prefix2", digestChain)
-	require.False(s.T(), found, "different prefixHash should be isolated")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_DifferentGroupID() {
-	prefixHash := "sameprefix"
-	digestChain := "u:a-m:b"
-
-	// 保存到 groupID 1
-	err := s.cache.SaveGeminiSession(s.ctx, 1, prefixHash, digestChain, "uuid1", 1)
-	require.NoError(s.T(), err)
-
-	// 用 groupID 2 查找，应该找不到（分组隔离）
-	_, _, found := s.cache.FindGeminiSession(s.ctx, 2, prefixHash, digestChain)
-	require.False(s.T(), found, "different groupID should be isolated")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_EmptyDigestChain() {
-	groupID := int64(1)
-	prefixHash := "emptytest"
-
-	// 空链不应该保存
-	err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, "", "uuid", 1)
-	require.NoError(s.T(), err, "empty chain should not error")
-
-	// 空链查找应该返回 false
-	_, _, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "")
-	require.False(s.T(), found, "empty chain should not match")
-}
-
-func (s *GatewayCacheSuite) TestGeminiSessionTrie_MultipleSessions() {
-	groupID := int64(1)
-	prefixHash := "multisession"
-
-	// 保存多个不同会话（模拟 1000 个并发会话的场景）
-	sessions := []struct {
-		chain     string
-		uuid      string
-		accountID int64
-	}{
-		{"u:session1", "uuid-1", 1},
-		{"u:session2-m:reply2", "uuid-2", 2},
-		{"u:session3-m:reply3-u:msg3", "uuid-3", 3},
-	}
-
-	for _, sess := range sessions {
-		err := s.cache.SaveGeminiSession(s.ctx, groupID, prefixHash, sess.chain, sess.uuid, sess.accountID)
-		require.NoError(s.T(), err)
-	}
-
-	// 验证每个会话都能正确查找
-	for _, sess := range sessions {
-		foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, sess.chain)
-		require.True(s.T(), found, "should find session: %s", sess.chain)
-		require.Equal(s.T(), sess.uuid, foundUUID)
-		require.Equal(s.T(), sess.accountID, foundAccountID)
-	}
-
-	// 验证继续对话的场景
-	foundUUID, foundAccountID, found := s.cache.FindGeminiSession(s.ctx, groupID, prefixHash, "u:session2-m:reply2-u:newmsg")
-	require.True(s.T(), found)
-	require.Equal(s.T(), "uuid-2", foundUUID)
-	require.Equal(s.T(), int64(2), foundAccountID)
-}

 func TestGatewayCacheSuite(t *testing.T) {
 	suite.Run(t, new(GatewayCacheSuite))

--- a/backend/internal/repository/gateway_cache_model_load_integration_test.go
+++ b/backend/internal/repository/gateway_cache_model_load_integration_test.go
-//go:build integration
-
-package repository
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/require"
-	"github.com/stretchr/testify/suite"
-)
-
-// ============ Gateway Cache 模型负载统计集成测试 ============
-
-type GatewayCacheModelLoadSuite struct {
-	suite.Suite
-}
-
-func TestGatewayCacheModelLoadSuite(t *testing.T) {
-	suite.Run(t, new(GatewayCacheModelLoadSuite))
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_Basic() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(123)
-	model := "claude-sonnet-4-20250514"
-
-	// 首次调用应返回 1
-	count1, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	// 第二次调用应返回 2
-	count2, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(2), count2)
-
-	// 第三次调用应返回 3
-	count3, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(3), count3)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_DifferentModels() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(456)
-	model1 := "claude-sonnet-4-20250514"
-	model2 := "claude-opus-4-5-20251101"
-
-	// 不同模型应该独立计数
-	count1, err := cache.IncrModelCallCount(ctx, accountID, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	count2, err := cache.IncrModelCallCount(ctx, accountID, model2)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count2)
-
-	count1Again, err := cache.IncrModelCallCount(ctx, accountID, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(2), count1Again)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestIncrModelCallCount_DifferentAccounts() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	account1 := int64(111)
-	account2 := int64(222)
-	model := "gemini-2.5-pro"
-
-	// 不同账号应该独立计数
-	count1, err := cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count1)
-
-	count2, err := cache.IncrModelCallCount(ctx, account2, model)
-	require.NoError(t, err)
-	require.Equal(t, int64(1), count2)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_Empty() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	result, err := cache.GetModelLoadBatch(ctx, []int64{}, "any-model")
-	require.NoError(t, err)
-	require.NotNil(t, result)
-	require.Empty(t, result)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_NonExistent() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	// 查询不存在的账号应返回零值
-	result, err := cache.GetModelLoadBatch(ctx, []int64{9999, 9998}, "claude-sonnet-4-20250514")
-	require.NoError(t, err)
-	require.Len(t, result, 2)
-
-	require.Equal(t, int64(0), result[9999].CallCount)
-	require.True(t, result[9999].LastUsedAt.IsZero())
-	require.Equal(t, int64(0), result[9998].CallCount)
-	require.True(t, result[9998].LastUsedAt.IsZero())
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_AfterIncrement() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(789)
-	model := "claude-sonnet-4-20250514"
-
-	// 先增加调用次数
-	beforeIncr := time.Now()
-	_, err := cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, accountID, model)
-	require.NoError(t, err)
-	afterIncr := time.Now()
-
-	// 获取负载信息
-	result, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model)
-	require.NoError(t, err)
-	require.Len(t, result, 1)
-
-	loadInfo := result[accountID]
-	require.NotNil(t, loadInfo)
-	require.Equal(t, int64(3), loadInfo.CallCount)
-	require.False(t, loadInfo.LastUsedAt.IsZero())
-	// LastUsedAt 应该在 beforeIncr 和 afterIncr 之间
-	require.True(t, loadInfo.LastUsedAt.After(beforeIncr.Add(-time.Second)) || loadInfo.LastUsedAt.Equal(beforeIncr))
-	require.True(t, loadInfo.LastUsedAt.Before(afterIncr.Add(time.Second)) || loadInfo.LastUsedAt.Equal(afterIncr))
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_MultipleAccounts() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	model := "claude-opus-4-5-20251101"
-	account1 := int64(1001)
-	account2 := int64(1002)
-	account3 := int64(1003) // 不调用
-
-	// account1 调用 2 次
-	_, err := cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-	_, err = cache.IncrModelCallCount(ctx, account1, model)
-	require.NoError(t, err)
-
-	// account2 调用 5 次
-	for i := 0; i < 5; i++ {
-		_, err = cache.IncrModelCallCount(ctx, account2, model)
-		require.NoError(t, err)
-	}
-
-	// 批量获取
-	result, err := cache.GetModelLoadBatch(ctx, []int64{account1, account2, account3}, model)
-	require.NoError(t, err)
-	require.Len(t, result, 3)
-
-	require.Equal(t, int64(2), result[account1].CallCount)
-	require.False(t, result[account1].LastUsedAt.IsZero())
-
-	require.Equal(t, int64(5), result[account2].CallCount)
-	require.False(t, result[account2].LastUsedAt.IsZero())
-
-	require.Equal(t, int64(0), result[account3].CallCount)
-	require.True(t, result[account3].LastUsedAt.IsZero())
-}
-
-func (s *GatewayCacheModelLoadSuite) TestGetModelLoadBatch_ModelIsolation() {
-	t := s.T()
-	rdb := testRedis(t)
-	cache := &gatewayCache{rdb: rdb}
-	ctx := context.Background()
-
-	accountID := int64(2001)
-	model1 := "claude-sonnet-4-20250514"
-	model2 := "gemini-2.5-pro"
-
-	// 对 model1 调用 3 次
-	for i := 0; i < 3; i++ {
-		_, err := cache.IncrModelCallCount(ctx, accountID, model1)
-		require.NoError(t, err)
-	}
-
-	// 获取 model1 的负载
-	result1, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model1)
-	require.NoError(t, err)
-	require.Equal(t, int64(3), result1[accountID].CallCount)
-
-	// 获取 model2 的负载（应该为 0）
-	result2, err := cache.GetModelLoadBatch(ctx, []int64{accountID}, model2)
-	require.NoError(t, err)
-	require.Equal(t, int64(0), result2[accountID].CallCount)
-}
-
-// ============ 辅助函数测试 ============
-
-func (s *GatewayCacheModelLoadSuite) TestModelLoadKey_Format() {
-	t := s.T()
-
-	key := modelLoadKey(123, "claude-sonnet-4")
-	require.Equal(t, "ag:model_load:123:claude-sonnet-4", key)
-}
-
-func (s *GatewayCacheModelLoadSuite) TestModelLastUsedKey_Format() {
-	t := s.T()
-
-	key := modelLastUsedKey(456, "gemini-2.5-pro")
-	require.Equal(t, "ag:model_last_used:456:gemini-2.5-pro", key)
-}
--- a/backend/internal/server/api_contract_test.go
+++ b/backend/internal/server/api_contract_test.go
@@ -1008,10 +1008,6 @@ func (s *stubAccountRepo) SetRateLimited(ctx context.Context, id int64, resetAt
 	return errors.New("not implemented")
 }

-func (s *stubAccountRepo) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope service.AntigravityQuotaScope, resetAt time.Time) error {
-	return errors.New("not implemented")
-}
-
 func (s *stubAccountRepo) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	return errors.New("not implemented")
 }

--- a/backend/internal/service/account_service.go
+++ b/backend/internal/service/account_service.go
@@ -50,7 +50,6 @@ type AccountRepository interface {
 	ListSchedulableByGroupIDAndPlatforms(ctx context.Context, groupID int64, platforms []string) ([]Account, error)

 	SetRateLimited(ctx context.Context, id int64, resetAt time.Time) error
-	SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error
 	SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error
 	SetOverloaded(ctx context.Context, id int64, until time.Time) error
 	SetTempUnschedulable(ctx context.Context, id int64, until time.Time, reason string) error

--- a/backend/internal/service/account_service_delete_test.go
+++ b/backend/internal/service/account_service_delete_test.go
@@ -143,10 +143,6 @@ func (s *accountRepoStub) SetRateLimited(ctx context.Context, id int64, resetAt
 	panic("unexpected SetRateLimited call")
 }

-func (s *accountRepoStub) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error {
-	panic("unexpected SetAntigravityQuotaScopeLimit call")
-}
-
 func (s *accountRepoStub) SetModelRateLimit(ctx context.Context, id int64, scope string, resetAt time.Time) error {
 	panic("unexpected SetModelRateLimit call")
 }

--- a/backend/internal/service/anthropic_session.go
+++ b/backend/internal/service/anthropic_session.go
@@ -2,7 +2,6 @@ package service

 import (
 	"encoding/json"
-	"strconv"
 	"strings"
 	"time"
 )
@@ -12,9 +11,6 @@ const (
 	// anthropicSessionTTLSeconds Anthropic 会话缓存 TTL（5 分钟）
 	anthropicSessionTTLSeconds = 300

-	// anthropicTrieKeyPrefix Anthropic Trie 会话 key 前缀
-	anthropicTrieKeyPrefix = "anthropic:trie:"
-
 	// anthropicDigestSessionKeyPrefix Anthropic 摘要 fallback 会话 key 前缀
 	anthropicDigestSessionKeyPrefix = "anthropic:digest:"
 )
@@ -68,12 +64,6 @@ func rolePrefix(role string) string {
 	}
 }

-// BuildAnthropicTrieKey 构建 Anthropic Trie Redis key
-// 格式: anthropic:trie:{groupID}:{prefixHash}
-func BuildAnthropicTrieKey(groupID int64, prefixHash string) string {
-	return anthropicTrieKeyPrefix + strconv.FormatInt(groupID, 10) + ":" + prefixHash
-}
-
 // GenerateAnthropicDigestSessionKey 生成 Anthropic 摘要 fallback 的 sessionKey
 // 组合 prefixHash 前 8 位 + uuid 前 8 位，确保不同会话产生不同的 sessionKey
 func GenerateAnthropicDigestSessionKey(prefixHash, uuid string) string {

--- a/backend/internal/service/anthropic_session_test.go
+++ b/backend/internal/service/anthropic_session_test.go
@@ -236,43 +236,6 @@ func TestBuildAnthropicDigestChain_Deterministic(t *testing.T) {
 	}
 }

-func TestBuildAnthropicTrieKey(t *testing.T) {
-	tests := []struct {
-		name       string
-		groupID    int64
-		prefixHash string
-		want       string
-	}{
-		{
-			name:       "normal",
-			groupID:    123,
-			prefixHash: "abcdef12",
-			want:       "anthropic:trie:123:abcdef12",
-		},
-		{
-			name:       "zero group",
-			groupID:    0,
-			prefixHash: "xyz",
-			want:       "anthropic:trie:0:xyz",
-		},
-		{
-			name:       "empty prefix",
-			groupID:    1,
-			prefixHash: "",
-			want:       "anthropic:trie:1:",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := BuildAnthropicTrieKey(tt.groupID, tt.prefixHash)
-			if got != tt.want {
-				t.Errorf("BuildAnthropicTrieKey(%d, %q) = %q, want %q", tt.groupID, tt.prefixHash, got, tt.want)
-			}
-		})
-	}
-}
-
 func TestGenerateAnthropicDigestSessionKey(t *testing.T) {
 	tests := []struct {
 		name       string

--- a/backend/internal/service/antigravity_gateway_service.go
+++ b/backend/internal/service/antigravity_gateway_service.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"log/slog"
 	mathrand "math/rand"
 	"net"
 	"net/http"
@@ -100,12 +101,11 @@ type antigravityRetryLoopParams struct {
 	accessToken     string
 	action          string
 	body            []byte
-	quotaScope      AntigravityQuotaScope
 	c               *gin.Context
 	httpUpstream    HTTPUpstream
 	settingService  *SettingService
 	accountRepo     AccountRepository // 用于智能重试的模型级别限流
-	handleError     func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult
+	handleError     func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult
 	requestedModel  string // 用于限流检查的原始请求模型
 	isStickySession bool   // 是否为粘性会话（用于账号切换时的缓存计费判断）
 	groupID         int64  // 用于模型级限流时清除粘性会话
@@ -148,13 +148,17 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam

 	// 情况1: retryDelay >= 阈值，限流模型并切换账号
 	if shouldRateLimitModel {
-		log.Printf("%s status=%d oauth_long_delay model=%s account=%d (model rate limit, switch account)",
-			p.prefix, resp.StatusCode, modelName, p.account.ID)
+		rateLimitDuration := waitDuration
+		if rateLimitDuration <= 0 {
+			rateLimitDuration = antigravityDefaultRateLimitDuration
+		}
+		log.Printf("%s status=%d oauth_long_delay model=%s account=%d upstream_retry_delay=%v body=%s (model rate limit, switch account)",
+			p.prefix, resp.StatusCode, modelName, p.account.ID, rateLimitDuration, truncateForLog(respBody, 200))

-		resetAt := time.Now().Add(antigravityDefaultRateLimitDuration)
+		resetAt := time.Now().Add(rateLimitDuration)
 		if !setModelRateLimitByModelName(p.ctx, p.accountRepo, p.account.ID, modelName, p.prefix, resp.StatusCode, resetAt, false) {
-			p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.quotaScope, p.groupID, p.sessionHash, p.isStickySession)
-			log.Printf("%s status=%d rate_limited account=%d (no scope mapping)", p.prefix, resp.StatusCode, p.account.ID)
+			p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.requestedModel, p.groupID, p.sessionHash, p.isStickySession)
+			log.Printf("%s status=%d rate_limited account=%d (no model mapping)", p.prefix, resp.StatusCode, p.account.ID)
 		} else {
 			s.updateAccountModelRateLimitInCache(p.ctx, p.account, modelName, resetAt)
 		}
@@ -190,7 +194,7 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
 			retryReq, err := antigravity.NewAPIRequestWithURL(p.ctx, baseURL, p.action, p.accessToken, p.body)
 			if err != nil {
 				log.Printf("%s status=smart_retry_request_build_failed error=%v", p.prefix, err)
-				p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.quotaScope, p.groupID, p.sessionHash, p.isStickySession)
+				p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.requestedModel, p.groupID, p.sessionHash, p.isStickySession)
 				return &smartRetryResult{
 					action: smartRetryActionBreakWithResp,
 					resp: &http.Response{
@@ -233,16 +237,24 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
 		}

 		// 所有重试都失败，限流当前模型并切换账号
-		log.Printf("%s status=%d smart_retry_exhausted attempts=%d model=%s account=%d (switch account)",
-			p.prefix, resp.StatusCode, antigravitySmartRetryMaxAttempts, modelName, p.account.ID)
+		rateLimitDuration := waitDuration
+		if rateLimitDuration <= 0 {
+			rateLimitDuration = antigravityDefaultRateLimitDuration
+		}
+		retryBody := lastRetryBody
+		if retryBody == nil {
+			retryBody = respBody
+		}
+		log.Printf("%s status=%d smart_retry_exhausted attempts=%d model=%s account=%d upstream_retry_delay=%v body=%s (switch account)",
+			p.prefix, resp.StatusCode, antigravitySmartRetryMaxAttempts, modelName, p.account.ID, rateLimitDuration, truncateForLog(retryBody, 200))

-		resetAt := time.Now().Add(antigravityDefaultRateLimitDuration)
+		resetAt := time.Now().Add(rateLimitDuration)
 		if p.accountRepo != nil && modelName != "" {
 			if err := p.accountRepo.SetModelRateLimit(p.ctx, p.account.ID, modelName, resetAt); err != nil {
 				log.Printf("%s status=%d model_rate_limit_failed model=%s error=%v", p.prefix, resp.StatusCode, modelName, err)
 			} else {
 				log.Printf("%s status=%d model_rate_limited_after_smart_retry model=%s account=%d reset_in=%v",
-					p.prefix, resp.StatusCode, modelName, p.account.ID, antigravityDefaultRateLimitDuration)
+					p.prefix, resp.StatusCode, modelName, p.account.ID, rateLimitDuration)
 				s.updateAccountModelRateLimitInCache(p.ctx, p.account, modelName, resetAt)
 			}
 		}
@@ -353,87 +365,102 @@ urlFallbackLoop:
 				return nil, fmt.Errorf("upstream request failed after retries: %w", err)
 			}

-			// 429/503 限流处理：区分 URL 级别限流、智能重试和账户配额限流
-			if resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode == http.StatusServiceUnavailable {
+			// 统一处理错误响应
+			if resp.StatusCode >= 400 {
 				respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
 				_ = resp.Body.Close()

-				// 尝试智能重试处理（OAuth 账号专用）
-				smartResult := s.handleSmartRetry(p, resp, respBody, baseURL, urlIdx, availableURLs)
-				switch smartResult.action {
-				case smartRetryActionContinueURL:
-					continue urlFallbackLoop
-				case smartRetryActionBreakWithResp:
-					if smartResult.err != nil {
-						return nil, smartResult.err
+				// ★ 统一入口：自定义错误码 + 临时不可调度
+				if handled, policyErr := s.applyErrorPolicy(p, resp.StatusCode, resp.Header, respBody); handled {
+					if policyErr != nil {
+						return nil, policyErr
 					}
-					// 模型限流时返回切换账号信号
-					if smartResult.switchError != nil {
-						return nil, smartResult.switchError
+					resp = &http.Response{
+						StatusCode: resp.StatusCode,
+						Header:     resp.Header.Clone(),
+						Body:       io.NopCloser(bytes.NewReader(respBody)),
 					}
-					resp = smartResult.resp
 					break urlFallbackLoop
 				}
-				// smartRetryActionContinue: 继续默认重试逻辑

-				// 账户/模型配额限流，重试 3 次（指数退避）- 默认逻辑（非 OAuth 账号或解析失败）
-				if attempt < antigravityMaxRetries {
-					upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
-					upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
-					appendOpsUpstreamError(p.c, OpsUpstreamErrorEvent{
-						Platform:           p.account.Platform,
-						AccountID:          p.account.ID,
-						AccountName:        p.account.Name,
-						UpstreamStatusCode: resp.StatusCode,
-						UpstreamRequestID:  resp.Header.Get("x-request-id"),
-						Kind:               "retry",
-						Message:            upstreamMsg,
-						Detail:             getUpstreamDetail(respBody),
-					})
-					log.Printf("%s status=%d retry=%d/%d body=%s", p.prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 200))
-					if !sleepAntigravityBackoffWithContext(p.ctx, attempt) {
-						log.Printf("%s status=context_canceled_during_backoff", p.prefix)
-						return nil, p.ctx.Err()
+				// 429/503 限流处理：区分 URL 级别限流、智能重试和账户配额限流
+				if resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode == http.StatusServiceUnavailable {
+					// 尝试智能重试处理（OAuth 账号专用）
+					smartResult := s.handleSmartRetry(p, resp, respBody, baseURL, urlIdx, availableURLs)
+					switch smartResult.action {
+					case smartRetryActionContinueURL:
+						continue urlFallbackLoop
+					case smartRetryActionBreakWithResp:
+						if smartResult.err != nil {
+							return nil, smartResult.err
+						}
+						// 模型限流时返回切换账号信号
+						if smartResult.switchError != nil {
+							return nil, smartResult.switchError
+						}
+						resp = smartResult.resp
+						break urlFallbackLoop
+					}
+					// smartRetryActionContinue: 继续默认重试逻辑
+
+					// 账户/模型配额限流，重试 3 次（指数退避）- 默认逻辑（非 OAuth 账号或解析失败）
+					if attempt < antigravityMaxRetries {
+						upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
+						upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
+						appendOpsUpstreamError(p.c, OpsUpstreamErrorEvent{
+							Platform:           p.account.Platform,
+							AccountID:          p.account.ID,
+							AccountName:        p.account.Name,
+							UpstreamStatusCode: resp.StatusCode,
+							UpstreamRequestID:  resp.Header.Get("x-request-id"),
+							Kind:               "retry",
+							Message:            upstreamMsg,
+							Detail:             getUpstreamDetail(respBody),
+						})
+						log.Printf("%s status=%d retry=%d/%d body=%s", p.prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 200))
+						if !sleepAntigravityBackoffWithContext(p.ctx, attempt) {
+							log.Printf("%s status=context_canceled_during_backoff", p.prefix)
+							return nil, p.ctx.Err()
+						}
+						continue
 					}
-					continue
-				}

-				// 重试用尽，标记账户限流
-				p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.quotaScope, p.groupID, p.sessionHash, p.isStickySession)
-				log.Printf("%s status=%d rate_limited base_url=%s body=%s", p.prefix, resp.StatusCode, baseURL, truncateForLog(respBody, 200))
-				resp = &http.Response{
-					StatusCode: resp.StatusCode,
-					Header:     resp.Header.Clone(),
-					Body:       io.NopCloser(bytes.NewReader(respBody)),
+					// 重试用尽，标记账户限流
+					p.handleError(p.ctx, p.prefix, p.account, resp.StatusCode, resp.Header, respBody, p.requestedModel, p.groupID, p.sessionHash, p.isStickySession)
+					log.Printf("%s status=%d rate_limited base_url=%s body=%s", p.prefix, resp.StatusCode, baseURL, truncateForLog(respBody, 200))
+					resp = &http.Response{
+						StatusCode: resp.StatusCode,
+						Header:     resp.Header.Clone(),
+						Body:       io.NopCloser(bytes.NewReader(respBody)),
+					}
+					break urlFallbackLoop
 				}
-				break urlFallbackLoop
-			}
-
-			// 其他可重试错误（不包括 429 和 503，因为上面已处理）
-			if resp.StatusCode >= 400 && shouldRetryAntigravityError(resp.StatusCode) {
-				respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
-				_ = resp.Body.Close()

-				if attempt < antigravityMaxRetries {
-					upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
-					upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
-					appendOpsUpstreamError(p.c, OpsUpstreamErrorEvent{
-						Platform:           p.account.Platform,
-						AccountID:          p.account.ID,
-						AccountName:        p.account.Name,
-						UpstreamStatusCode: resp.StatusCode,
-						UpstreamRequestID:  resp.Header.Get("x-request-id"),
-						Kind:               "retry",
-						Message:            upstreamMsg,
-						Detail:             getUpstreamDetail(respBody),
-					})
-					log.Printf("%s status=%d retry=%d/%d body=%s", p.prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
-					if !sleepAntigravityBackoffWithContext(p.ctx, attempt) {
-						log.Printf("%s status=context_canceled_during_backoff", p.prefix)
-						return nil, p.ctx.Err()
+				// 其他可重试错误（500/502/504/529，不包括 429 和 503）
+				if shouldRetryAntigravityError(resp.StatusCode) {
+					if attempt < antigravityMaxRetries {
+						upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
+						upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
+						appendOpsUpstreamError(p.c, OpsUpstreamErrorEvent{
+							Platform:           p.account.Platform,
+							AccountID:          p.account.ID,
+							AccountName:        p.account.Name,
+							UpstreamStatusCode: resp.StatusCode,
+							UpstreamRequestID:  resp.Header.Get("x-request-id"),
+							Kind:               "retry",
+							Message:            upstreamMsg,
+							Detail:             getUpstreamDetail(respBody),
+						})
+						log.Printf("%s status=%d retry=%d/%d body=%s", p.prefix, resp.StatusCode, attempt, antigravityMaxRetries, truncateForLog(respBody, 500))
+						if !sleepAntigravityBackoffWithContext(p.ctx, attempt) {
+							log.Printf("%s status=context_canceled_during_backoff", p.prefix)
+							return nil, p.ctx.Err()
+						}
+						continue
 					}
-					continue
 				}
+
+				// 其他 4xx 错误或重试用尽，直接返回
 				resp = &http.Response{
 					StatusCode: resp.StatusCode,
 					Header:     resp.Header.Clone(),
@@ -442,6 +469,7 @@ urlFallbackLoop:
 				break urlFallbackLoop
 			}

+			// 成功响应（< 400）
 			break urlFallbackLoop
 		}
 	}
@@ -574,6 +602,31 @@ func (s *AntigravityGatewayService) getUpstreamErrorDetail(body []byte) string {
 	return truncateString(string(body), maxBytes)
 }

+// checkErrorPolicy nil 安全的包装
+func (s *AntigravityGatewayService) checkErrorPolicy(ctx context.Context, account *Account, statusCode int, body []byte) ErrorPolicyResult {
+	if s.rateLimitService == nil {
+		return ErrorPolicyNone
+	}
+	return s.rateLimitService.CheckErrorPolicy(ctx, account, statusCode, body)
+}
+
+// applyErrorPolicy 应用错误策略结果，返回是否应终止当前循环
+func (s *AntigravityGatewayService) applyErrorPolicy(p antigravityRetryLoopParams, statusCode int, headers http.Header, respBody []byte) (handled bool, retErr error) {
+	switch s.checkErrorPolicy(p.ctx, p.account, statusCode, respBody) {
+	case ErrorPolicySkipped:
+		return true, nil
+	case ErrorPolicyMatched:
+		_ = p.handleError(p.ctx, p.prefix, p.account, statusCode, headers, respBody,
+			p.requestedModel, p.groupID, p.sessionHash, p.isStickySession)
+		return true, nil
+	case ErrorPolicyTempUnscheduled:
+		slog.Info("temp_unschedulable_matched",
+			"prefix", p.prefix, "status_code", statusCode, "account_id", p.account.ID)
+		return true, &AntigravityAccountSwitchError{OriginalAccountID: p.account.ID, IsStickySession: p.isStickySession}
+	}
+	return false, nil
+}
+
 // mapAntigravityModel 获取映射后的模型名
 // 完全依赖映射配置：账户映射（通配符）→ 默认映射兜底（DefaultAntigravityModelMapping）
 // 注意：返回空字符串表示模型不被支持，调度时会过滤掉该账号
@@ -969,6 +1022,11 @@ func isModelNotFoundError(statusCode int, body []byte) bool {
 //	          ├─ 成功 → 正常返回
 //	          └─ 失败 → 设置模型限流 + 清除粘性绑定 → 切换账号
 func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context, account *Account, body []byte, isStickySession bool) (*ForwardResult, error) {
+	// 上游透传账号直接转发，不走 OAuth token 刷新
+	if account.Type == AccountTypeUpstream {
+		return s.ForwardUpstream(ctx, c, account, body)
+	}
+
 	startTime := time.Now()

 	sessionID := getSessionID(c)
@@ -988,11 +1046,9 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 	if mappedModel == "" {
 		return nil, s.writeClaudeError(c, http.StatusForbidden, "permission_error", fmt.Sprintf("model %s not in whitelist", claudeReq.Model))
 	}
-	loadModel := mappedModel
 	// 应用 thinking 模式自动后缀：如果 thinking 开启且目标是 claude-sonnet-4-5，自动改为 thinking 版本
 	thinkingEnabled := claudeReq.Thinking != nil && claudeReq.Thinking.Type == "enabled"
 	mappedModel = applyThinkingModelSuffix(mappedModel, thinkingEnabled)
-	quotaScope, _ := resolveAntigravityQuotaScope(originalModel)

 	// 获取 access_token
 	if s.tokenProvider == nil {
@@ -1027,11 +1083,6 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 	// 如果客户端请求非流式，在响应处理阶段会收集完整流式响应后转换返回
 	action := "streamGenerateContent"

-	// 统计模型调用次数（包括粘性会话，用于负载均衡调度）
-	if s.cache != nil {
-		_, _ = s.cache.IncrModelCallCount(ctx, account.ID, loadModel)
-	}
-
 	// 执行带重试的请求
 	result, err := s.antigravityRetryLoop(antigravityRetryLoopParams{
 		ctx:             ctx,
@@ -1041,7 +1092,6 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 		accessToken:     accessToken,
 		action:          action,
 		body:            geminiBody,
-		quotaScope:      quotaScope,
 		c:               c,
 		httpUpstream:    s.httpUpstream,
 		settingService:  s.settingService,
@@ -1122,7 +1172,6 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 					accessToken:     accessToken,
 					action:          action,
 					body:            retryGeminiBody,
-					quotaScope:      quotaScope,
 					c:               c,
 					httpUpstream:    s.httpUpstream,
 					settingService:  s.settingService,
@@ -1233,7 +1282,7 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 				}
 			}

-			s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope, 0, "", isStickySession)
+			s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, originalModel, 0, "", isStickySession)

 			if s.shouldFailoverUpstreamError(resp.StatusCode) {
 				upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(respBody))
@@ -1263,6 +1312,7 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,

 	var usage *ClaudeUsage
 	var firstTokenMs *int
+	var clientDisconnect bool
 	if claudeReq.Stream {
 		// 客户端要求流式，直接透传转换
 		streamRes, err := s.handleClaudeStreamingResponse(c, resp, startTime, originalModel)
@@ -1272,6 +1322,7 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 		}
 		usage = streamRes.usage
 		firstTokenMs = streamRes.firstTokenMs
+		clientDisconnect = streamRes.clientDisconnect
 	} else {
 		// 客户端要求非流式，收集流式响应后转换返回
 		streamRes, err := s.handleClaudeStreamToNonStreaming(c, resp, startTime, originalModel)
@@ -1284,12 +1335,13 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
 	}

 	return &ForwardResult{
-		RequestID:    requestID,
-		Usage:        *usage,
-		Model:        originalModel, // 使用原始模型用于计费和日志
-		Stream:       claudeReq.Stream,
-		Duration:     time.Since(startTime),
-		FirstTokenMs: firstTokenMs,
+		RequestID:        requestID,
+		Usage:            *usage,
+		Model:            originalModel, // 使用原始模型用于计费和日志
+		Stream:           claudeReq.Stream,
+		Duration:         time.Since(startTime),
+		FirstTokenMs:     firstTokenMs,
+		ClientDisconnect: clientDisconnect,
 	}, nil
 }

@@ -1613,7 +1665,6 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
 	if len(body) == 0 {
 		return nil, s.writeGoogleError(c, http.StatusBadRequest, "Request body is empty")
 	}
-	quotaScope, _ := resolveAntigravityQuotaScope(originalModel)

 	// 解析请求以获取 image_size（用于图片计费）
 	imageSize := s.extractImageSize(body)
@@ -1683,11 +1734,6 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
 	// 如果客户端请求非流式，在响应处理阶段会收集完整流式响应后返回
 	upstreamAction := "streamGenerateContent"

-	// 统计模型调用次数（包括粘性会话，用于负载均衡调度）
-	if s.cache != nil {
-		_, _ = s.cache.IncrModelCallCount(ctx, account.ID, mappedModel)
-	}
-
 	// 执行带重试的请求
 	result, err := s.antigravityRetryLoop(antigravityRetryLoopParams{
 		ctx:             ctx,
@@ -1697,7 +1743,6 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
 		accessToken:     accessToken,
 		action:          upstreamAction,
 		body:            wrappedBody,
-		quotaScope:      quotaScope,
 		c:               c,
 		httpUpstream:    s.httpUpstream,
 		settingService:  s.settingService,
@@ -1771,7 +1816,7 @@ func (s *AntigravityGatewayService) ForwardGemini(ctx context.Context, c *gin.Co
 		if unwrapErr != nil || len(unwrappedForOps) == 0 {
 			unwrappedForOps = respBody
 		}
-		s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, quotaScope, 0, "", isStickySession)
+		s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, originalModel, 0, "", isStickySession)
 		upstreamMsg := strings.TrimSpace(extractAntigravityErrorMessage(unwrappedForOps))
 		upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
 		upstreamDetail := s.getUpstreamErrorDetail(unwrappedForOps)
@@ -1818,6 +1863,7 @@ handleSuccess:

 	var usage *ClaudeUsage
 	var firstTokenMs *int
+	var clientDisconnect bool

 	if stream {
 		// 客户端要求流式，直接透传
@@ -1828,6 +1874,7 @@ handleSuccess:
 		}
 		usage = streamRes.usage
 		firstTokenMs = streamRes.firstTokenMs
+		clientDisconnect = streamRes.clientDisconnect
 	} else {
 		// 客户端要求非流式，收集流式响应后返回
 		streamRes, err := s.handleGeminiStreamToNonStreaming(c, resp, startTime)
@@ -1851,14 +1898,15 @@ handleSuccess:
 	}

 	return &ForwardResult{
-		RequestID:    requestID,
-		Usage:        *usage,
-		Model:        originalModel,
-		Stream:       stream,
-		Duration:     time.Since(startTime),
-		FirstTokenMs: firstTokenMs,
-		ImageCount:   imageCount,
-		ImageSize:    imageSize,
+		RequestID:        requestID,
+		Usage:            *usage,
+		Model:            originalModel,
+		Stream:           stream,
+		Duration:         time.Since(startTime),
+		FirstTokenMs:     firstTokenMs,
+		ClientDisconnect: clientDisconnect,
+		ImageCount:       imageCount,
+		ImageSize:        imageSize,
 	}, nil
 }

@@ -2067,9 +2115,9 @@ func shouldTriggerAntigravitySmartRetry(account *Account, respBody []byte) (shou
 	}

 	// retryDelay >= 阈值：直接限流模型，不重试
-	// 注意：如果上游未提供 retryDelay，parseAntigravitySmartRetryInfo 已设置为默认 5 分钟
+	// 注意：如果上游未提供 retryDelay，parseAntigravitySmartRetryInfo 已设置为默认 30s
 	if info.RetryDelay >= antigravityRateLimitThreshold {
-		return false, true, 0, info.ModelName
+		return false, true, info.RetryDelay, info.ModelName
 	}

 	// retryDelay < 阈值：智能重试
@@ -2191,10 +2239,10 @@ func (s *AntigravityGatewayService) updateAccountModelRateLimitInCache(ctx conte
 func (s *AntigravityGatewayService) handleUpstreamError(
 	ctx context.Context, prefix string, account *Account,
 	statusCode int, headers http.Header, body []byte,
-	quotaScope AntigravityQuotaScope,
+	requestedModel string,
 	groupID int64, sessionHash string, isStickySession bool,
 ) *handleModelRateLimitResult {
-	// ✨ 模型级限流处理（在原有逻辑之前）
+	// 模型级限流处理（优先）
 	result := s.handleModelRateLimit(&handleModelRateLimitParams{
 		ctx:             ctx,
 		prefix:          prefix,
@@ -2216,52 +2264,35 @@ func (s *AntigravityGatewayService) handleUpstreamError(
 		return nil
 	}

-	// ========== 原有逻辑，保持不变 ==========
-	// 429 使用 Gemini 格式解析（从 body 解析重置时间）
+	// 429：尝试解析模型级限流，解析失败时兜底为账号级限流
 	if statusCode == 429 {
-		// 调试日志遵循统一日志开关与长度限制，避免无条件记录完整上游响应体。
 		if logBody, maxBytes := s.getLogConfig(); logBody {
 			log.Printf("[Antigravity-Debug] 429 response body: %s", truncateString(string(body), maxBytes))
 		}

-		useScopeLimit := quotaScope != ""
 		resetAt := ParseGeminiRateLimitResetTime(body)
-		if resetAt == nil {
-			// 解析失败：使用默认限流时间（与临时限流保持一致）
-			// 可通过配置或环境变量覆盖
-			defaultDur := antigravityDefaultRateLimitDuration
-			if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.AntigravityFallbackCooldownMinutes > 0 {
-				defaultDur = time.Duration(s.settingService.cfg.Gateway.AntigravityFallbackCooldownMinutes) * time.Minute
-			}
-			// 秒级环境变量优先级最高
-			if override, ok := antigravityFallbackCooldownSeconds(); ok {
-				defaultDur = override
-			}
-			ra := time.Now().Add(defaultDur)
-			if useScopeLimit {
-				log.Printf("%s status=429 rate_limited scope=%s reset_in=%v (fallback)", prefix, quotaScope, defaultDur)
-				if err := s.accountRepo.SetAntigravityQuotaScopeLimit(ctx, account.ID, quotaScope, ra); err != nil {
-					log.Printf("%s status=429 rate_limit_set_failed scope=%s error=%v", prefix, quotaScope, err)
-				}
+		defaultDur := s.getDefaultRateLimitDuration()
+
+		// 尝试解析模型 key 并设置模型级限流
+		modelKey := resolveAntigravityModelKey(requestedModel)
+		if modelKey != "" {
+			ra := s.resolveResetTime(resetAt, defaultDur)
+			if err := s.accountRepo.SetModelRateLimit(ctx, account.ID, modelKey, ra); err != nil {
+				log.Printf("%s status=429 model_rate_limit_set_failed model=%s error=%v", prefix, modelKey, err)
 			} else {
-				log.Printf("%s status=429 rate_limited account=%d reset_in=%v (fallback)", prefix, account.ID, defaultDur)
-				if err := s.accountRepo.SetRateLimited(ctx, account.ID, ra); err != nil {
-					log.Printf("%s status=429 rate_limit_set_failed account=%d error=%v", prefix, account.ID, err)
-				}
+				log.Printf("%s status=429 model_rate_limited model=%s account=%d reset_at=%v reset_in=%v",
+					prefix, modelKey, account.ID, ra.Format("15:04:05"), time.Until(ra).Truncate(time.Second))
+				s.updateAccountModelRateLimitInCache(ctx, account, modelKey, ra)
 			}
 			return nil
 		}
-		resetTime := time.Unix(*resetAt, 0)
-		if useScopeLimit {
-			log.Printf("%s status=429 rate_limited scope=%s reset_at=%v reset_in=%v", prefix, quotaScope, resetTime.Format("15:04:05"), time.Until(resetTime).Truncate(time.Second))
-			if err := s.accountRepo.SetAntigravityQuotaScopeLimit(ctx, account.ID, quotaScope, resetTime); err != nil {
-				log.Printf("%s status=429 rate_limit_set_failed scope=%s error=%v", prefix, quotaScope, err)
-			}
-		} else {
-			log.Printf("%s status=429 rate_limited account=%d reset_at=%v reset_in=%v", prefix, account.ID, resetTime.Format("15:04:05"), time.Until(resetTime).Truncate(time.Second))
-			if err := s.accountRepo.SetRateLimited(ctx, account.ID, resetTime); err != nil {
-				log.Printf("%s status=429 rate_limit_set_failed account=%d error=%v", prefix, account.ID, err)
-			}
+
+		// 无法解析模型 key，兜底为账号级限流
+		ra := s.resolveResetTime(resetAt, defaultDur)
+		log.Printf("%s status=429 rate_limited account=%d reset_at=%v reset_in=%v (fallback)",
+			prefix, account.ID, ra.Format("15:04:05"), time.Until(ra).Truncate(time.Second))
+		if err := s.accountRepo.SetRateLimited(ctx, account.ID, ra); err != nil {
+			log.Printf("%s status=429 rate_limit_set_failed account=%d error=%v", prefix, account.ID, err)
 		}
 		return nil
 	}
@@ -2276,9 +2307,90 @@ func (s *AntigravityGatewayService) handleUpstreamError(
 	return nil
 }

+// getDefaultRateLimitDuration 获取默认限流时间
+func (s *AntigravityGatewayService) getDefaultRateLimitDuration() time.Duration {
+	defaultDur := antigravityDefaultRateLimitDuration
+	if s.settingService != nil && s.settingService.cfg != nil && s.settingService.cfg.Gateway.AntigravityFallbackCooldownMinutes > 0 {
+		defaultDur = time.Duration(s.settingService.cfg.Gateway.AntigravityFallbackCooldownMinutes) * time.Minute
+	}
+	if override, ok := antigravityFallbackCooldownSeconds(); ok {
+		defaultDur = override
+	}
+	return defaultDur
+}
+
+// resolveResetTime 根据解析的重置时间或默认时长计算重置时间点
+func (s *AntigravityGatewayService) resolveResetTime(resetAt *int64, defaultDur time.Duration) time.Time {
+	if resetAt != nil {
+		return time.Unix(*resetAt, 0)
+	}
+	return time.Now().Add(defaultDur)
+}
+
 type antigravityStreamResult struct {
-	usage        *ClaudeUsage
-	firstTokenMs *int
+	usage            *ClaudeUsage
+	firstTokenMs     *int
+	clientDisconnect bool // 客户端是否在流式传输过程中断开
+}
+
+// antigravityClientWriter 封装流式响应的客户端写入，自动检测断开并标记。
+// 断开后所有写入操作变为 no-op，调用方通过 Disconnected() 判断是否继续 drain 上游。
+type antigravityClientWriter struct {
+	w            gin.ResponseWriter
+	flusher      http.Flusher
+	disconnected bool
+	prefix       string // 日志前缀，标识来源方法
+}
+
+func newAntigravityClientWriter(w gin.ResponseWriter, flusher http.Flusher, prefix string) *antigravityClientWriter {
+	return &antigravityClientWriter{w: w, flusher: flusher, prefix: prefix}
+}
+
+// Write 写入数据到客户端，写入失败时标记断开并返回 false
+func (cw *antigravityClientWriter) Write(p []byte) bool {
+	if cw.disconnected {
+		return false
+	}
+	if _, err := cw.w.Write(p); err != nil {
+		cw.markDisconnected()
+		return false
+	}
+	cw.flusher.Flush()
+	return true
+}
+
+// Fprintf 格式化写入数据到客户端，写入失败时标记断开并返回 false
+func (cw *antigravityClientWriter) Fprintf(format string, args ...any) bool {
+	if cw.disconnected {
+		return false
+	}
+	if _, err := fmt.Fprintf(cw.w, format, args...); err != nil {
+		cw.markDisconnected()
+		return false
+	}
+	cw.flusher.Flush()
+	return true
+}
+
+func (cw *antigravityClientWriter) Disconnected() bool { return cw.disconnected }
+
+func (cw *antigravityClientWriter) markDisconnected() {
+	cw.disconnected = true
+	log.Printf("Client disconnected during streaming (%s), continuing to drain upstream for billing", cw.prefix)
+}
+
+// handleStreamReadError 处理上游读取错误的通用逻辑。
+// 返回 (clientDisconnect, handled)：handled=true 表示错误已处理，调用方应返回已收集的 usage。
+func handleStreamReadError(err error, clientDisconnected bool, prefix string) (disconnect bool, handled bool) {
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		log.Printf("Context canceled during streaming (%s), returning collected usage", prefix)
+		return true, true
+	}
+	if clientDisconnected {
+		log.Printf("Upstream read error after client disconnect (%s): %v, returning collected usage", prefix, err)
+		return true, true
+	}
+	return false, false
 }

 func (s *AntigravityGatewayService) handleGeminiStreamingResponse(c *gin.Context, resp *http.Response, startTime time.Time) (*antigravityStreamResult, error) {
@@ -2354,10 +2466,12 @@ func (s *AntigravityGatewayService) handleGeminiStreamingResponse(c *gin.Context
 		intervalCh = intervalTicker.C
 	}

+	cw := newAntigravityClientWriter(c.Writer, flusher, "antigravity gemini")
+
 	// 仅发送一次错误事件，避免多次写入导致协议混乱
 	errorEventSent := false
 	sendErrorEvent := func(reason string) {
-		if errorEventSent {
+		if errorEventSent || cw.Disconnected() {
 			return
 		}
 		errorEventSent = true
@@ -2369,9 +2483,12 @@ func (s *AntigravityGatewayService) handleGeminiStreamingResponse(c *gin.Context
 		select {
 		case ev, ok := <-events:
 			if !ok {
-				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}, nil
+				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: cw.Disconnected()}, nil
 			}
 			if ev.err != nil {
+				if disconnect, handled := handleStreamReadError(ev.err, cw.Disconnected(), "antigravity gemini"); handled {
+					return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: disconnect}, nil
+				}
 				if errors.Is(ev.err, bufio.ErrTooLong) {
 					log.Printf("SSE line too long (antigravity): max_size=%d error=%v", maxLineSize, ev.err)
 					sendErrorEvent("response_too_large")
@@ -2386,11 +2503,7 @@ func (s *AntigravityGatewayService) handleGeminiStreamingResponse(c *gin.Context
 			if strings.HasPrefix(trimmed, "data:") {
 				payload := strings.TrimSpace(strings.TrimPrefix(trimmed, "data:"))
 				if payload == "" || payload == "[DONE]" {
-					if _, err := fmt.Fprintf(c.Writer, "%s\n", line); err != nil {
-						sendErrorEvent("write_failed")
-						return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}, err
-					}
-					flusher.Flush()
+					cw.Fprintf("%s\n", line)
 					continue
 				}

@@ -2426,27 +2539,22 @@ func (s *AntigravityGatewayService) handleGeminiStreamingResponse(c *gin.Context
 					firstTokenMs = &ms
 				}

-				if _, err := fmt.Fprintf(c.Writer, "data: %s\n\n", payload); err != nil {
-					sendErrorEvent("write_failed")
-					return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}, err
-				}
-				flusher.Flush()
+				cw.Fprintf("data: %s\n\n", payload)
 				continue
 			}

-			if _, err := fmt.Fprintf(c.Writer, "%s\n", line); err != nil {
-				sendErrorEvent("write_failed")
-				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}, err
-			}
-			flusher.Flush()
+			cw.Fprintf("%s\n", line)

 		case <-intervalCh:
 			lastRead := time.Unix(0, atomic.LoadInt64(&lastReadAt))
 			if time.Since(lastRead) < streamInterval {
 				continue
 			}
+			if cw.Disconnected() {
+				log.Printf("Upstream timeout after client disconnect (antigravity gemini), returning collected usage")
+				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: true}, nil
+			}
 			log.Printf("Stream data interval timeout (antigravity)")
-			// 注意：此函数没有 account 上下文，无法调用 HandleStreamTimeout
 			sendErrorEvent("stream_timeout")
 			return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
 		}
@@ -3144,10 +3252,12 @@ func (s *AntigravityGatewayService) handleClaudeStreamingResponse(c *gin.Context
 		intervalCh = intervalTicker.C
 	}

+	cw := newAntigravityClientWriter(c.Writer, flusher, "antigravity claude")
+
 	// 仅发送一次错误事件，避免多次写入导致协议混乱
 	errorEventSent := false
 	sendErrorEvent := func(reason string) {
-		if errorEventSent {
+		if errorEventSent || cw.Disconnected() {
 			return
 		}
 		errorEventSent = true
@@ -3155,19 +3265,27 @@ func (s *AntigravityGatewayService) handleClaudeStreamingResponse(c *gin.Context
 		flusher.Flush()
 	}

+	// finishUsage 是获取 processor 最终 usage 的辅助函数
+	finishUsage := func() *ClaudeUsage {
+		_, agUsage := processor.Finish()
+		return convertUsage(agUsage)
+	}
+
 	for {
 		select {
 		case ev, ok := <-events:
 			if !ok {
-				// 发送结束事件
+				// 上游完成，发送结束事件
 				finalEvents, agUsage := processor.Finish()
 				if len(finalEvents) > 0 {
-					_, _ = c.Writer.Write(finalEvents)
-					flusher.Flush()
+					cw.Write(finalEvents)
 				}
-				return &antigravityStreamResult{usage: convertUsage(agUsage), firstTokenMs: firstTokenMs}, nil
+				return &antigravityStreamResult{usage: convertUsage(agUsage), firstTokenMs: firstTokenMs, clientDisconnect: cw.Disconnected()}, nil
 			}
 			if ev.err != nil {
+				if disconnect, handled := handleStreamReadError(ev.err, cw.Disconnected(), "antigravity claude"); handled {
+					return &antigravityStreamResult{usage: finishUsage(), firstTokenMs: firstTokenMs, clientDisconnect: disconnect}, nil
+				}
 				if errors.Is(ev.err, bufio.ErrTooLong) {
 					log.Printf("SSE line too long (antigravity): max_size=%d error=%v", maxLineSize, ev.err)
 					sendErrorEvent("response_too_large")
@@ -3177,25 +3295,14 @@ func (s *AntigravityGatewayService) handleClaudeStreamingResponse(c *gin.Context
 				return nil, fmt.Errorf("stream read error: %w", ev.err)
 			}

-			line := ev.line
 			// 处理 SSE 行，转换为 Claude 格式
-			claudeEvents := processor.ProcessLine(strings.TrimRight(line, "\r\n"))
-
+			claudeEvents := processor.ProcessLine(strings.TrimRight(ev.line, "\r\n"))
 			if len(claudeEvents) > 0 {
 				if firstTokenMs == nil {
 					ms := int(time.Since(startTime).Milliseconds())
 					firstTokenMs = &ms
 				}
-
-				if _, writeErr := c.Writer.Write(claudeEvents); writeErr != nil {
-					finalEvents, agUsage := processor.Finish()
-					if len(finalEvents) > 0 {
-						_, _ = c.Writer.Write(finalEvents)
-					}
-					sendErrorEvent("write_failed")
-					return &antigravityStreamResult{usage: convertUsage(agUsage), firstTokenMs: firstTokenMs}, writeErr
-				}
-				flusher.Flush()
+				cw.Write(claudeEvents)
 			}

 		case <-intervalCh:
@@ -3203,13 +3310,15 @@ func (s *AntigravityGatewayService) handleClaudeStreamingResponse(c *gin.Context
 			if time.Since(lastRead) < streamInterval {
 				continue
 			}
+			if cw.Disconnected() {
+				log.Printf("Upstream timeout after client disconnect (antigravity claude), returning collected usage")
+				return &antigravityStreamResult{usage: finishUsage(), firstTokenMs: firstTokenMs, clientDisconnect: true}, nil
+			}
 			log.Printf("Stream data interval timeout (antigravity)")
-			// 注意：此函数没有 account 上下文，无法调用 HandleStreamTimeout
 			sendErrorEvent("stream_timeout")
 			return &antigravityStreamResult{usage: convertUsage(nil), firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
 		}
 	}
-
 }

 // extractImageSize 从 Gemini 请求中提取 image_size 参数
@@ -3348,3 +3457,288 @@ func filterEmptyPartsFromGeminiRequest(body []byte) ([]byte, error) {
 	payload["contents"] = filtered
 	return json.Marshal(payload)
 }
+
+// ForwardUpstream 使用 base_url + /v1/messages + 双 header 认证透传上游 Claude 请求
+func (s *AntigravityGatewayService) ForwardUpstream(ctx context.Context, c *gin.Context, account *Account, body []byte) (*ForwardResult, error) {
+	startTime := time.Now()
+	sessionID := getSessionID(c)
+	prefix := logPrefix(sessionID, account.Name)
+
+	// 获取上游配置
+	baseURL := strings.TrimSpace(account.GetCredential("base_url"))
+	apiKey := strings.TrimSpace(account.GetCredential("api_key"))
+	if baseURL == "" || apiKey == "" {
+		return nil, fmt.Errorf("upstream account missing base_url or api_key")
+	}
+	baseURL = strings.TrimSuffix(baseURL, "/")
+
+	// 解析请求获取模型信息
+	var claudeReq antigravity.ClaudeRequest
+	if err := json.Unmarshal(body, &claudeReq); err != nil {
+		return nil, fmt.Errorf("parse claude request: %w", err)
+	}
+	if strings.TrimSpace(claudeReq.Model) == "" {
+		return nil, fmt.Errorf("missing model")
+	}
+	originalModel := claudeReq.Model
+	billingModel := originalModel
+
+	// 构建上游请求 URL
+	upstreamURL := baseURL + "/v1/messages"
+
+	// 创建请求
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, upstreamURL, bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("create upstream request: %w", err)
+	}
+
+	// 设置请求头
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+	req.Header.Set("x-api-key", apiKey) // Claude API 兼容
+
+	// 透传 Claude 相关 headers
+	if v := c.GetHeader("anthropic-version"); v != "" {
+		req.Header.Set("anthropic-version", v)
+	}
+	if v := c.GetHeader("anthropic-beta"); v != "" {
+		req.Header.Set("anthropic-beta", v)
+	}
+
+	// 代理 URL
+	proxyURL := ""
+	if account.ProxyID != nil && account.Proxy != nil {
+		proxyURL = account.Proxy.URL()
+	}
+
+	// 发送请求
+	resp, err := s.httpUpstream.Do(req, proxyURL, account.ID, account.Concurrency)
+	if err != nil {
+		log.Printf("%s upstream request failed: %v", prefix, err)
+		return nil, fmt.Errorf("upstream request failed: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	// 处理错误响应
+	if resp.StatusCode >= 400 {
+		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 2<<20))
+
+		// 429 错误时标记账号限流
+		if resp.StatusCode == http.StatusTooManyRequests {
+			s.handleUpstreamError(ctx, prefix, account, resp.StatusCode, resp.Header, respBody, originalModel, 0, "", false)
+		}
+
+		// 透传上游错误
+		c.Header("Content-Type", resp.Header.Get("Content-Type"))
+		c.Status(resp.StatusCode)
+		_, _ = c.Writer.Write(respBody)
+
+		return &ForwardResult{
+			Model: billingModel,
+		}, nil
+	}
+
+	// 处理成功响应（流式/非流式）
+	var usage *ClaudeUsage
+	var firstTokenMs *int
+	var clientDisconnect bool
+
+	if claudeReq.Stream {
+		// 流式响应：透传
+		c.Header("Content-Type", "text/event-stream")
+		c.Header("Cache-Control", "no-cache")
+		c.Header("Connection", "keep-alive")
+		c.Header("X-Accel-Buffering", "no")
+		c.Status(http.StatusOK)
+
+		streamRes := s.streamUpstreamResponse(c, resp, startTime)
+		usage = streamRes.usage
+		firstTokenMs = streamRes.firstTokenMs
+		clientDisconnect = streamRes.clientDisconnect
+	} else {
+		// 非流式响应：直接透传
+		respBody, err := io.ReadAll(resp.Body)
+		if err != nil {
+			return nil, fmt.Errorf("read upstream response: %w", err)
+		}
+
+		// 提取 usage
+		usage = s.extractClaudeUsage(respBody)
+
+		c.Header("Content-Type", resp.Header.Get("Content-Type"))
+		c.Status(http.StatusOK)
+		_, _ = c.Writer.Write(respBody)
+	}
+
+	// 构建计费结果
+	duration := time.Since(startTime)
+	log.Printf("%s status=success duration_ms=%d", prefix, duration.Milliseconds())
+
+	return &ForwardResult{
+		Model:            billingModel,
+		Stream:           claudeReq.Stream,
+		Duration:         duration,
+		FirstTokenMs:     firstTokenMs,
+		ClientDisconnect: clientDisconnect,
+		Usage: ClaudeUsage{
+			InputTokens:              usage.InputTokens,
+			OutputTokens:             usage.OutputTokens,
+			CacheReadInputTokens:     usage.CacheReadInputTokens,
+			CacheCreationInputTokens: usage.CacheCreationInputTokens,
+		},
+	}, nil
+}
+
+// streamUpstreamResponse 透传上游 SSE 流并提取 Claude usage
+func (s *AntigravityGatewayService) streamUpstreamResponse(c *gin.Context, resp *http.Response, startTime time.Time) *antigravityStreamResult {
+	usage := &ClaudeUsage{}
+	var firstTokenMs *int
+
+	scanner := bufio.NewScanner(resp.Body)
+	maxLineSize := defaultMaxLineSize
+	if s.settingService.cfg != nil && s.settingService.cfg.Gateway.MaxLineSize > 0 {
+		maxLineSize = s.settingService.cfg.Gateway.MaxLineSize
+	}
+	scanner.Buffer(make([]byte, 64*1024), maxLineSize)
+
+	type scanEvent struct {
+		line string
+		err  error
+	}
+	events := make(chan scanEvent, 16)
+	done := make(chan struct{})
+	sendEvent := func(ev scanEvent) bool {
+		select {
+		case events <- ev:
+			return true
+		case <-done:
+			return false
+		}
+	}
+	var lastReadAt int64
+	atomic.StoreInt64(&lastReadAt, time.Now().UnixNano())
+	go func() {
+		defer close(events)
+		for scanner.Scan() {
+			atomic.StoreInt64(&lastReadAt, time.Now().UnixNano())
+			if !sendEvent(scanEvent{line: scanner.Text()}) {
+				return
+			}
+		}
+		if err := scanner.Err(); err != nil {
+			_ = sendEvent(scanEvent{err: err})
+		}
+	}()
+	defer close(done)
+
+	streamInterval := time.Duration(0)
+	if s.settingService.cfg != nil && s.settingService.cfg.Gateway.StreamDataIntervalTimeout > 0 {
+		streamInterval = time.Duration(s.settingService.cfg.Gateway.StreamDataIntervalTimeout) * time.Second
+	}
+	var intervalTicker *time.Ticker
+	if streamInterval > 0 {
+		intervalTicker = time.NewTicker(streamInterval)
+		defer intervalTicker.Stop()
+	}
+	var intervalCh <-chan time.Time
+	if intervalTicker != nil {
+		intervalCh = intervalTicker.C
+	}
+
+	flusher, _ := c.Writer.(http.Flusher)
+	cw := newAntigravityClientWriter(c.Writer, flusher, "antigravity upstream")
+
+	for {
+		select {
+		case ev, ok := <-events:
+			if !ok {
+				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: cw.Disconnected()}
+			}
+			if ev.err != nil {
+				if disconnect, handled := handleStreamReadError(ev.err, cw.Disconnected(), "antigravity upstream"); handled {
+					return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: disconnect}
+				}
+				log.Printf("Stream read error (antigravity upstream): %v", ev.err)
+				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}
+			}
+
+			line := ev.line
+
+			// 记录首 token 时间
+			if firstTokenMs == nil && len(line) > 0 {
+				ms := int(time.Since(startTime).Milliseconds())
+				firstTokenMs = &ms
+			}
+
+			// 尝试从 message_delta 或 message_stop 事件提取 usage
+			s.extractSSEUsage(line, usage)
+
+			// 透传行
+			cw.Fprintf("%s\n", line)
+
+		case <-intervalCh:
+			lastRead := time.Unix(0, atomic.LoadInt64(&lastReadAt))
+			if time.Since(lastRead) < streamInterval {
+				continue
+			}
+			if cw.Disconnected() {
+				log.Printf("Upstream timeout after client disconnect (antigravity upstream), returning collected usage")
+				return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs, clientDisconnect: true}
+			}
+			log.Printf("Stream data interval timeout (antigravity upstream)")
+			return &antigravityStreamResult{usage: usage, firstTokenMs: firstTokenMs}
+		}
+	}
+}
+
+// extractSSEUsage 从 SSE data 行中提取 Claude usage（用于流式透传场景）
+func (s *AntigravityGatewayService) extractSSEUsage(line string, usage *ClaudeUsage) {
+	if !strings.HasPrefix(line, "data: ") {
+		return
+	}
+	dataStr := strings.TrimPrefix(line, "data: ")
+	var event map[string]any
+	if json.Unmarshal([]byte(dataStr), &event) != nil {
+		return
+	}
+	u, ok := event["usage"].(map[string]any)
+	if !ok {
+		return
+	}
+	if v, ok := u["input_tokens"].(float64); ok && int(v) > 0 {
+		usage.InputTokens = int(v)
+	}
+	if v, ok := u["output_tokens"].(float64); ok && int(v) > 0 {
+		usage.OutputTokens = int(v)
+	}
+	if v, ok := u["cache_read_input_tokens"].(float64); ok && int(v) > 0 {
+		usage.CacheReadInputTokens = int(v)
+	}
+	if v, ok := u["cache_creation_input_tokens"].(float64); ok && int(v) > 0 {
+		usage.CacheCreationInputTokens = int(v)
+	}
+}
+
+// extractClaudeUsage 从非流式 Claude 响应提取 usage
+func (s *AntigravityGatewayService) extractClaudeUsage(body []byte) *ClaudeUsage {
+	usage := &ClaudeUsage{}
+	var resp map[string]any
+	if json.Unmarshal(body, &resp) != nil {
+		return usage
+	}
+	if u, ok := resp["usage"].(map[string]any); ok {
+		if v, ok := u["input_tokens"].(float64); ok {
+			usage.InputTokens = int(v)
+		}
+		if v, ok := u["output_tokens"].(float64); ok {
+			usage.OutputTokens = int(v)
+		}
+		if v, ok := u["cache_read_input_tokens"].(float64); ok {
+			usage.CacheReadInputTokens = int(v)
+		}
+		if v, ok := u["cache_creation_input_tokens"].(float64); ok {
+			usage.CacheCreationInputTokens = int(v)
+		}
+	}
+	return usage
+}
--- a/backend/internal/service/antigravity_gateway_service_test.go
+++ b/backend/internal/service/antigravity_gateway_service_test.go
@@ -4,17 +4,42 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
+	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"testing"
 	"time"

+	"github.com/Wei-Shaw/sub2api/internal/config"
 	"github.com/Wei-Shaw/sub2api/internal/pkg/antigravity"
 	"github.com/gin-gonic/gin"
 	"github.com/stretchr/testify/require"
 )

+// antigravityFailingWriter 模拟客户端断开连接的 gin.ResponseWriter
+type antigravityFailingWriter struct {
+	gin.ResponseWriter
+	failAfter int // 允许成功写入的次数，之后所有写入返回错误
+	writes    int
+}
+
+func (w *antigravityFailingWriter) Write(p []byte) (int, error) {
+	if w.writes >= w.failAfter {
+		return 0, errors.New("write failed: client disconnected")
+	}
+	w.writes++
+	return w.ResponseWriter.Write(p)
+}
+
+// newAntigravityTestService 创建用于流式测试的 AntigravityGatewayService
+func newAntigravityTestService(cfg *config.Config) *AntigravityGatewayService {
+	return &AntigravityGatewayService{
+		settingService: &SettingService{cfg: cfg},
+	}
+}
+
 func TestStripSignatureSensitiveBlocksFromClaudeRequest(t *testing.T) {
 	req := &antigravity.ClaudeRequest{
 		Model: "claude-sonnet-4-5",
@@ -337,8 +362,8 @@ func TestAntigravityGatewayService_Forward_StickySessionForceCacheBilling(t *tes
 	require.True(t, failoverErr.ForceCacheBilling, "ForceCacheBilling should be true for sticky session switch")
 }

-// TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling
-// 验证：ForwardGemini 粘性会话切换时，UpstreamFailoverError.ForceCacheBilling 应为 true
+// TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling verifies
+// that ForwardGemini sets ForceCacheBilling=true for sticky session switch.
 func TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 	writer := httptest.NewRecorder()
@@ -391,3 +416,438 @@ func TestAntigravityGatewayService_ForwardGemini_StickySessionForceCacheBilling(
 	require.Equal(t, http.StatusServiceUnavailable, failoverErr.StatusCode)
 	require.True(t, failoverErr.ForceCacheBilling, "ForceCacheBilling should be true for sticky session switch")
 }
+
+// --- 流式 happy path 测试 ---
+
+// TestStreamUpstreamResponse_NormalComplete
+// 验证：正常流式转发完成时，数据正确透传、usage 正确收集、clientDisconnect=false
+func TestStreamUpstreamResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `event: message_start`)
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":10}}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: content_block_delta`)
+		fmt.Fprintln(pw, `data: {"type":"content_block_delta","delta":{"text":"hello"}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: message_delta`)
+		fmt.Fprintln(pw, `data: {"type":"message_delta","usage":{"output_tokens":5}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	require.Equal(t, 5, result.usage.OutputTokens, "should collect output_tokens from message_delta")
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// 验证数据被透传到客户端
+	body := rec.Body.String()
+	require.Contains(t, body, "event: message_start")
+	require.Contains(t, body, "content_block_delta")
+	require.Contains(t, body, "message_delta")
+}
+
+// TestHandleGeminiStreamingResponse_NormalComplete
+// 验证：正常 Gemini 流式转发，数据正确透传、usage 正确收集
+func TestHandleGeminiStreamingResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// 第一个 chunk（部分内容）
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":"Hello"}]}}],"usageMetadata":{"promptTokenCount":10,"candidatesTokenCount":3}}`)
+		fmt.Fprintln(pw, "")
+		// 第二个 chunk（最终内容+完整 usage）
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":" world"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":10,"candidatesTokenCount":8,"cachedContentTokenCount":2}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	// Gemini usage: promptTokenCount=10, candidatesTokenCount=8, cachedContentTokenCount=2
+	// → InputTokens=10-2=8, OutputTokens=8, CacheReadInputTokens=2
+	require.Equal(t, 8, result.usage.InputTokens)
+	require.Equal(t, 8, result.usage.OutputTokens)
+	require.Equal(t, 2, result.usage.CacheReadInputTokens)
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// 验证数据被透传到客户端
+	body := rec.Body.String()
+	require.Contains(t, body, "Hello")
+	require.Contains(t, body, "world")
+	// 不应包含错误事件
+	require.NotContains(t, body, "event: error")
+}
+
+// TestHandleClaudeStreamingResponse_NormalComplete
+// 验证：正常 Claude 流式转发（Gemini→Claude 转换），数据正确转换并输出
+func TestHandleClaudeStreamingResponse_NormalComplete(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// v1internal 包装格式：Gemini 数据嵌套在 "response" 字段下
+		// ProcessLine 先尝试反序列化为 V1InternalResponse，裸格式会导致 Response.UsageMetadata 为空
+		fmt.Fprintln(pw, `data: {"response":{"candidates":[{"content":{"parts":[{"text":"Hi there"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":3}}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect, "normal completion should not set clientDisconnect")
+	require.NotNil(t, result.usage)
+	// Gemini→Claude 转换的 usage：promptTokenCount=5→InputTokens=5, candidatesTokenCount=3→OutputTokens=3
+	require.Equal(t, 5, result.usage.InputTokens)
+	require.Equal(t, 3, result.usage.OutputTokens)
+	require.NotNil(t, result.firstTokenMs, "should record first token time")
+
+	// 验证输出是 Claude SSE 格式（processor 会转换）
+	body := rec.Body.String()
+	require.Contains(t, body, "event: message_start", "should contain Claude message_start event")
+	require.Contains(t, body, "event: message_stop", "should contain Claude message_stop event")
+	// 不应包含错误事件
+	require.NotContains(t, body, "event: error")
+}
+
+// --- 流式客户端断开检测测试 ---
+
+// TestStreamUpstreamResponse_ClientDisconnectDrainsUsage
+// 验证：客户端写入失败后，streamUpstreamResponse 继续读取上游以收集 usage
+func TestStreamUpstreamResponse_ClientDisconnectDrainsUsage(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `event: message_start`)
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":10}}}`)
+		fmt.Fprintln(pw, "")
+		fmt.Fprintln(pw, `event: message_delta`)
+		fmt.Fprintln(pw, `data: {"type":"message_delta","usage":{"output_tokens":20}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotNil(t, result.usage)
+	require.Equal(t, 20, result.usage.OutputTokens)
+}
+
+// TestStreamUpstreamResponse_ContextCanceled
+// 验证：context 取消时返回 usage 且标记 clientDisconnect
+func TestStreamUpstreamResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestStreamUpstreamResponse_Timeout
+// 验证：上游超时时返回已收集的 usage
+func TestStreamUpstreamResponse_Timeout(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{StreamDataIntervalTimeout: 1, MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pw.Close()
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.False(t, result.clientDisconnect)
+}
+
+// TestStreamUpstreamResponse_TimeoutAfterClientDisconnect
+// 验证：客户端断开后上游超时，返回 usage 并标记 clientDisconnect
+func TestStreamUpstreamResponse_TimeoutAfterClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{StreamDataIntervalTimeout: 1, MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		fmt.Fprintln(pw, `data: {"type":"message_start","message":{"usage":{"input_tokens":5}}}`)
+		fmt.Fprintln(pw, "")
+		// 不关闭 pw → 等待超时
+	}()
+
+	result := svc.streamUpstreamResponse(c, resp, time.Now())
+	_ = pw.Close()
+	_ = pr.Close()
+
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+}
+
+// TestHandleGeminiStreamingResponse_ClientDisconnect
+// 验证：Gemini 流式转发中客户端断开后继续 drain 上游
+func TestHandleGeminiStreamingResponse_ClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		fmt.Fprintln(pw, `data: {"candidates":[{"content":{"parts":[{"text":"hi"}]}}],"usageMetadata":{"promptTokenCount":5,"candidatesTokenCount":10}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "write_failed")
+}
+
+// TestHandleGeminiStreamingResponse_ContextCanceled
+// 验证：context 取消时不注入错误事件
+func TestHandleGeminiStreamingResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result, err := svc.handleGeminiStreamingResponse(c, resp, time.Now())
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestHandleClaudeStreamingResponse_ClientDisconnect
+// 验证：Claude 流式转发中客户端断开后继续 drain 上游
+func TestHandleClaudeStreamingResponse_ClientDisconnect(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil)
+	c.Writer = &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+
+	pr, pw := io.Pipe()
+	resp := &http.Response{StatusCode: http.StatusOK, Body: pr, Header: http.Header{}}
+
+	go func() {
+		defer func() { _ = pw.Close() }()
+		// v1internal 包装格式
+		fmt.Fprintln(pw, `data: {"response":{"candidates":[{"content":{"parts":[{"text":"hello"}]},"finishReason":"STOP"}],"usageMetadata":{"promptTokenCount":8,"candidatesTokenCount":15}}}`)
+		fmt.Fprintln(pw, "")
+	}()
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+	_ = pr.Close()
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+}
+
+// TestHandleClaudeStreamingResponse_ContextCanceled
+// 验证：context 取消时不注入错误事件
+func TestHandleClaudeStreamingResponse_ContextCanceled(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	svc := newAntigravityTestService(&config.Config{
+		Gateway: config.GatewayConfig{MaxLineSize: defaultMaxLineSize},
+	})
+
+	rec := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(rec)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	c.Request = httptest.NewRequest(http.MethodPost, "/", nil).WithContext(ctx)
+
+	resp := &http.Response{StatusCode: http.StatusOK, Body: cancelReadCloser{}, Header: http.Header{}}
+
+	result, err := svc.handleClaudeStreamingResponse(c, resp, time.Now(), "claude-sonnet-4-5")
+
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.True(t, result.clientDisconnect)
+	require.NotContains(t, rec.Body.String(), "event: error")
+}
+
+// TestExtractSSEUsage 验证 extractSSEUsage 从 SSE data 行正确提取 usage
+func TestExtractSSEUsage(t *testing.T) {
+	svc := &AntigravityGatewayService{}
+	tests := []struct {
+		name     string
+		line     string
+		expected ClaudeUsage
+	}{
+		{
+			name:     "message_delta with output_tokens",
+			line:     `data: {"type":"message_delta","usage":{"output_tokens":42}}`,
+			expected: ClaudeUsage{OutputTokens: 42},
+		},
+		{
+			name:     "non-data line ignored",
+			line:     `event: message_start`,
+			expected: ClaudeUsage{},
+		},
+		{
+			name:     "top-level usage with all fields",
+			line:     `data: {"usage":{"input_tokens":10,"output_tokens":20,"cache_read_input_tokens":5,"cache_creation_input_tokens":3}}`,
+			expected: ClaudeUsage{InputTokens: 10, OutputTokens: 20, CacheReadInputTokens: 5, CacheCreationInputTokens: 3},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			usage := &ClaudeUsage{}
+			svc.extractSSEUsage(tt.line, usage)
+			require.Equal(t, tt.expected, *usage)
+		})
+	}
+}
+
+// TestAntigravityClientWriter 验证 antigravityClientWriter 的断开检测
+func TestAntigravityClientWriter(t *testing.T) {
+	t.Run("normal write succeeds", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(c.Writer, flusher, "test")
+
+		ok := cw.Write([]byte("hello"))
+		require.True(t, ok)
+		require.False(t, cw.Disconnected())
+		require.Contains(t, rec.Body.String(), "hello")
+	})
+
+	t.Run("write failure marks disconnected", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		fw := &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(fw, flusher, "test")
+
+		ok := cw.Write([]byte("hello"))
+		require.False(t, ok)
+		require.True(t, cw.Disconnected())
+	})
+
+	t.Run("subsequent writes are no-op", func(t *testing.T) {
+		gin.SetMode(gin.TestMode)
+		rec := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(rec)
+		fw := &antigravityFailingWriter{ResponseWriter: c.Writer, failAfter: 0}
+		flusher, _ := c.Writer.(http.Flusher)
+		cw := newAntigravityClientWriter(fw, flusher, "test")
+
+		cw.Write([]byte("first"))
+		ok := cw.Fprintf("second %d", 2)
+		require.False(t, ok)
+		require.True(t, cw.Disconnected())
+	})
+}
--- a/backend/internal/service/antigravity_quota_scope.go
+++ b/backend/internal/service/antigravity_quota_scope.go
@@ -2,63 +2,23 @@ package service

 import (
 	"context"
-	"slices"
 	"strings"
 	"time"
 )

-const antigravityQuotaScopesKey = "antigravity_quota_scopes"
-
-// AntigravityQuotaScope 表示 Antigravity 的配额域
-type AntigravityQuotaScope string
-
-const (
-	AntigravityQuotaScopeClaude      AntigravityQuotaScope = "claude"
-	AntigravityQuotaScopeGeminiText  AntigravityQuotaScope = "gemini_text"
-	AntigravityQuotaScopeGeminiImage AntigravityQuotaScope = "gemini_image"
-)
-
-// IsScopeSupported 检查给定的 scope 是否在分组支持的 scope 列表中
-func IsScopeSupported(supportedScopes []string, scope AntigravityQuotaScope) bool {
-	if len(supportedScopes) == 0 {
-		// 未配置时默认全部支持
-		return true
-	}
-	supported := slices.Contains(supportedScopes, string(scope))
-	return supported
-}
-
-// ResolveAntigravityQuotaScope 根据模型名称解析配额域（导出版本）
-func ResolveAntigravityQuotaScope(requestedModel string) (AntigravityQuotaScope, bool) {
-	return resolveAntigravityQuotaScope(requestedModel)
-}
-
-// resolveAntigravityQuotaScope 根据模型名称解析配额域
-func resolveAntigravityQuotaScope(requestedModel string) (AntigravityQuotaScope, bool) {
-	model := normalizeAntigravityModelName(requestedModel)
-	if model == "" {
-		return "", false
-	}
-	switch {
-	case strings.HasPrefix(model, "claude-"):
-		return AntigravityQuotaScopeClaude, true
-	case strings.HasPrefix(model, "gemini-"):
-		if isImageGenerationModel(model) {
-			return AntigravityQuotaScopeGeminiImage, true
-		}
-		return AntigravityQuotaScopeGeminiText, true
-	default:
-		return "", false
-	}
-}
-
 func normalizeAntigravityModelName(model string) string {
 	normalized := strings.ToLower(strings.TrimSpace(model))
 	normalized = strings.TrimPrefix(normalized, "models/")
 	return normalized
 }

-// IsSchedulableForModel 结合 Antigravity 配额域限流判断是否可调度。
+// resolveAntigravityModelKey 根据请求的模型名解析限流 key
+// 返回空字符串表示无法解析
+func resolveAntigravityModelKey(requestedModel string) string {
+	return normalizeAntigravityModelName(requestedModel)
+}
+
+// IsSchedulableForModel 结合模型级限流判断是否可调度。
 // 保持旧签名以兼容既有调用方；默认使用 context.Background()。
 func (a *Account) IsSchedulableForModel(requestedModel string) bool {
 	return a.IsSchedulableForModelWithContext(context.Background(), requestedModel)
@@ -74,107 +34,20 @@ func (a *Account) IsSchedulableForModelWithContext(ctx context.Context, requeste
 	if a.isModelRateLimitedWithContext(ctx, requestedModel) {
 		return false
 	}
-	if a.Platform != PlatformAntigravity {
-		return true
-	}
-	scope, ok := resolveAntigravityQuotaScope(requestedModel)
-	if !ok {
-		return true
-	}
-	resetAt := a.antigravityQuotaScopeResetAt(scope)
-	if resetAt == nil {
-		return true
-	}
-	now := time.Now()
-	return !now.Before(*resetAt)
+	return true
 }

-func (a *Account) antigravityQuotaScopeResetAt(scope AntigravityQuotaScope) *time.Time {
-	if a == nil || a.Extra == nil || scope == "" {
-		return nil
-	}
-	rawScopes, ok := a.Extra[antigravityQuotaScopesKey].(map[string]any)
-	if !ok {
-		return nil
-	}
-	rawScope, ok := rawScopes[string(scope)].(map[string]any)
-	if !ok {
-		return nil
-	}
-	resetAtRaw, ok := rawScope["rate_limit_reset_at"].(string)
-	if !ok || strings.TrimSpace(resetAtRaw) == "" {
-		return nil
-	}
-	resetAt, err := time.Parse(time.RFC3339, resetAtRaw)
-	if err != nil {
-		return nil
-	}
-	return &resetAt
-}
-
-var antigravityAllScopes = []AntigravityQuotaScope{
-	AntigravityQuotaScopeClaude,
-	AntigravityQuotaScopeGeminiText,
-	AntigravityQuotaScopeGeminiImage,
-}
-
-func (a *Account) GetAntigravityScopeRateLimits() map[string]int64 {
-	if a == nil || a.Platform != PlatformAntigravity {
-		return nil
-	}
-	now := time.Now()
-	result := make(map[string]int64)
-	for _, scope := range antigravityAllScopes {
-		resetAt := a.antigravityQuotaScopeResetAt(scope)
-		if resetAt != nil && now.Before(*resetAt) {
-			remainingSec := int64(time.Until(*resetAt).Seconds())
-			if remainingSec > 0 {
-				result[string(scope)] = remainingSec
-			}
-		}
-	}
-	if len(result) == 0 {
-		return nil
-	}
-	return result
-}
-
-// GetQuotaScopeRateLimitRemainingTime 获取模型域限流剩余时间
-// 返回 0 表示未限流或已过期
-func (a *Account) GetQuotaScopeRateLimitRemainingTime(requestedModel string) time.Duration {
-	if a == nil || a.Platform != PlatformAntigravity {
-		return 0
-	}
-	scope, ok := resolveAntigravityQuotaScope(requestedModel)
-	if !ok {
-		return 0
-	}
-	resetAt := a.antigravityQuotaScopeResetAt(scope)
-	if resetAt == nil {
-		return 0
-	}
-	if remaining := time.Until(*resetAt); remaining > 0 {
-		return remaining
-	}
-	return 0
-}
-
-// GetRateLimitRemainingTime 获取限流剩余时间（模型限流和模型域限流取最大值）
+// GetRateLimitRemainingTime 获取限流剩余时间（模型级限流）
 // 返回 0 表示未限流或已过期
 func (a *Account) GetRateLimitRemainingTime(requestedModel string) time.Duration {
 	return a.GetRateLimitRemainingTimeWithContext(context.Background(), requestedModel)
 }

-// GetRateLimitRemainingTimeWithContext 获取限流剩余时间（模型限流和模型域限流取最大值）
+// GetRateLimitRemainingTimeWithContext 获取限流剩余时间（模型级限流）
 // 返回 0 表示未限流或已过期
 func (a *Account) GetRateLimitRemainingTimeWithContext(ctx context.Context, requestedModel string) time.Duration {
 	if a == nil {
 		return 0
 	}
-	modelRemaining := a.GetModelRateLimitRemainingTimeWithContext(ctx, requestedModel)
-	scopeRemaining := a.GetQuotaScopeRateLimitRemainingTime(requestedModel)
-	if modelRemaining > scopeRemaining {
-		return modelRemaining
-	}
-	return scopeRemaining
+	return a.GetModelRateLimitRemainingTimeWithContext(ctx, requestedModel)
 }
--- a/backend/internal/service/antigravity_rate_limit_test.go
+++ b/backend/internal/service/antigravity_rate_limit_test.go
@@ -59,12 +59,6 @@ func (s *stubAntigravityUpstream) DoWithTLS(req *http.Request, proxyURL string,
 	return s.Do(req, proxyURL, accountID, accountConcurrency)
 }

-type scopeLimitCall struct {
-	accountID int64
-	scope     AntigravityQuotaScope
-	resetAt   time.Time
-}
-
 type rateLimitCall struct {
 	accountID int64
 	resetAt   time.Time
@@ -78,16 +72,10 @@ type modelRateLimitCall struct {

 type stubAntigravityAccountRepo struct {
 	AccountRepository
-	scopeCalls          []scopeLimitCall
 	rateCalls           []rateLimitCall
 	modelRateLimitCalls []modelRateLimitCall
 }

-func (s *stubAntigravityAccountRepo) SetAntigravityQuotaScopeLimit(ctx context.Context, id int64, scope AntigravityQuotaScope, resetAt time.Time) error {
-	s.scopeCalls = append(s.scopeCalls, scopeLimitCall{accountID: id, scope: scope, resetAt: resetAt})
-	return nil
-}
-
 func (s *stubAntigravityAccountRepo) SetRateLimited(ctx context.Context, id int64, resetAt time.Time) error {
 	s.rateCalls = append(s.rateCalls, rateLimitCall{accountID: id, resetAt: resetAt})
 	return nil
@@ -131,10 +119,9 @@ func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) {
 		accessToken:    "token",
 		action:         "generateContent",
 		body:           []byte(`{"input":"test"}`),
-		quotaScope:     AntigravityQuotaScopeClaude,
 		httpUpstream:   upstream,
 		requestedModel: "claude-sonnet-4-5",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			handleErrorCalled = true
 			return nil
 		},
@@ -155,23 +142,6 @@ func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) {
 	require.Equal(t, base2, available[0])
 }

-func TestAntigravityHandleUpstreamError_UsesScopeLimit(t *testing.T) {
-	// 分区限流始终开启，不再支持通过环境变量关闭
-	repo := &stubAntigravityAccountRepo{}
-	svc := &AntigravityGatewayService{accountRepo: repo}
-	account := &Account{ID: 9, Name: "acc-9", Platform: PlatformAntigravity}
-
-	body := buildGeminiRateLimitBody("3s")
-	svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
-
-	require.Len(t, repo.scopeCalls, 1)
-	require.Empty(t, repo.rateCalls)
-	call := repo.scopeCalls[0]
-	require.Equal(t, account.ID, call.accountID)
-	require.Equal(t, AntigravityQuotaScopeClaude, call.scope)
-	require.WithinDuration(t, time.Now().Add(3*time.Second), call.resetAt, 2*time.Second)
-}
-
 // TestHandleUpstreamError_429_ModelRateLimit 测试 429 模型限流场景
 func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 	repo := &stubAntigravityAccountRepo{}
@@ -189,7 +159,7 @@ func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, "claude-sonnet-4-5", 0, "", false)

 	// 应该触发模型限流
 	require.NotNil(t, result)
@@ -200,22 +170,22 @@ func TestHandleUpstreamError_429_ModelRateLimit(t *testing.T) {
 	require.Equal(t, "claude-sonnet-4-5", repo.modelRateLimitCalls[0].modelKey)
 }

-// TestHandleUpstreamError_429_NonModelRateLimit 测试 429 非模型限流场景（走 scope 限流）
+// TestHandleUpstreamError_429_NonModelRateLimit 测试 429 非模型限流场景（走模型级限流兜底）
 func TestHandleUpstreamError_429_NonModelRateLimit(t *testing.T) {
 	repo := &stubAntigravityAccountRepo{}
 	svc := &AntigravityGatewayService{accountRepo: repo}
 	account := &Account{ID: 2, Name: "acc-2", Platform: PlatformAntigravity}

-	// 429 + 普通限流响应（无 RATE_LIMIT_EXCEEDED reason）→ scope 限流
+	// 429 + 普通限流响应（无 RATE_LIMIT_EXCEEDED reason）→ 走模型级限流兜底
 	body := buildGeminiRateLimitBody("5s")

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, AntigravityQuotaScopeClaude, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusTooManyRequests, http.Header{}, body, "claude-sonnet-4-5", 0, "", false)

-	// 不应该触发模型限流，应该走 scope 限流
+	// handleModelRateLimit 不会处理（因为没有 RATE_LIMIT_EXCEEDED），
+	// 但 429 兜底逻辑会使用 requestedModel 设置模型级限流
 	require.Nil(t, result)
-	require.Empty(t, repo.modelRateLimitCalls)
-	require.Len(t, repo.scopeCalls, 1)
-	require.Equal(t, AntigravityQuotaScopeClaude, repo.scopeCalls[0].scope)
+	require.Len(t, repo.modelRateLimitCalls, 1)
+	require.Equal(t, "claude-sonnet-4-5", repo.modelRateLimitCalls[0].modelKey)
 }

 // TestHandleUpstreamError_503_ModelRateLimit 测试 503 模型限流场景
@@ -235,7 +205,7 @@ func TestHandleUpstreamError_503_ModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 应该触发模型限流
 	require.NotNil(t, result)
@@ -263,12 +233,11 @@ func TestHandleUpstreamError_503_NonModelRateLimit(t *testing.T) {
 		}
 	}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 503 非模型限流不应该做任何处理
 	require.Nil(t, result)
 	require.Empty(t, repo.modelRateLimitCalls, "503 non-model rate limit should not trigger model rate limit")
-	require.Empty(t, repo.scopeCalls, "503 non-model rate limit should not trigger scope rate limit")
 	require.Empty(t, repo.rateCalls, "503 non-model rate limit should not trigger account rate limit")
 }

@@ -281,12 +250,11 @@ func TestHandleUpstreamError_503_EmptyBody(t *testing.T) {
 	// 503 + 空响应体 → 不做任何处理
 	body := []byte(`{}`)

-	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, AntigravityQuotaScopeGeminiText, 0, "", false)
+	result := svc.handleUpstreamError(context.Background(), "[test]", account, http.StatusServiceUnavailable, http.Header{}, body, "gemini-3-pro-high", 0, "", false)

 	// 503 空响应不应该做任何处理
 	require.Nil(t, result)
 	require.Empty(t, repo.modelRateLimitCalls)
-	require.Empty(t, repo.scopeCalls)
 	require.Empty(t, repo.rateCalls)
 }

@@ -307,15 +275,7 @@ func TestAccountIsSchedulableForModel_AntigravityRateLimits(t *testing.T) {
 	require.False(t, account.IsSchedulableForModel("gemini-3-flash"))

 	account.RateLimitResetAt = nil
-	account.Extra = map[string]any{
-		antigravityQuotaScopesKey: map[string]any{
-			"claude": map[string]any{
-				"rate_limit_reset_at": future.Format(time.RFC3339),
-			},
-		},
-	}
-
-	require.False(t, account.IsSchedulableForModel("claude-sonnet-4-5"))
+	require.True(t, account.IsSchedulableForModel("claude-sonnet-4-5"))
 	require.True(t, account.IsSchedulableForModel("gemini-3-flash"))
 }

@@ -635,6 +595,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 7 * time.Second,
 			modelName:               "gemini-pro",
 		},
 		{
@@ -652,6 +613,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 39 * time.Second,
 			modelName:               "gemini-3-pro-high",
 		},
 		{
@@ -669,6 +631,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 30 * time.Second,
 			modelName:               "gemini-2.5-flash",
 		},
 		{
@@ -686,6 +649,7 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 			}`,
 			expectedShouldRetry:     false,
 			expectedShouldRateLimit: true,
+			minWait:                 30 * time.Second,
 			modelName:               "claude-sonnet-4-5",
 		},
 	}
@@ -704,6 +668,11 @@ func TestShouldTriggerAntigravitySmartRetry(t *testing.T) {
 					t.Errorf("wait = %v, want >= %v", wait, tt.minWait)
 				}
 			}
+			if shouldRateLimit && tt.minWait > 0 {
+				if wait < tt.minWait {
+					t.Errorf("rate limit wait = %v, want >= %v", wait, tt.minWait)
+				}
+			}
 			if (shouldRetry || shouldRateLimit) && model != tt.modelName {
 				t.Errorf("modelName = %q, want %q", model, tt.modelName)
 			}
@@ -832,7 +801,7 @@ func TestAntigravityRetryLoop_PreCheck_SwitchesWhenRateLimited(t *testing.T) {
 		requestedModel:  "claude-sonnet-4-5",
 		httpUpstream:    upstream,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})
@@ -875,7 +844,7 @@ func TestAntigravityRetryLoop_PreCheck_SwitchesWhenRemainingLong(t *testing.T) {
 		requestedModel:  "claude-sonnet-4-5",
 		httpUpstream:    upstream,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})

--- a/backend/internal/service/antigravity_smart_retry_test.go
+++ b/backend/internal/service/antigravity_smart_retry_test.go
@@ -75,7 +75,7 @@ func TestHandleSmartRetry_URLLevelRateLimit(t *testing.T) {
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -127,7 +127,7 @@ func TestHandleSmartRetry_LongDelay_ReturnsSwitchError(t *testing.T) {
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -194,7 +194,7 @@ func TestHandleSmartRetry_ShortDelay_SmartRetrySuccess(t *testing.T) {
 		action:       "generateContent",
 		body:         []byte(`{"input":"test"}`),
 		httpUpstream: upstream,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -269,7 +269,7 @@ func TestHandleSmartRetry_ShortDelay_SmartRetryFailed_ReturnsSwitchError(t *test
 		httpUpstream:    upstream,
 		accountRepo:     repo,
 		isStickySession: false,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -331,7 +331,7 @@ func TestHandleSmartRetry_503_ModelCapacityExhausted_ReturnsSwitchError(t *testi
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -387,7 +387,7 @@ func TestHandleSmartRetry_NonAntigravityAccount_ContinuesDefaultLogic(t *testing
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -436,7 +436,7 @@ func TestHandleSmartRetry_NonModelRateLimit_ContinuesDefaultLogic(t *testing.T)
 		accessToken: "token",
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -487,7 +487,7 @@ func TestHandleSmartRetry_ExactlyAtThreshold_ReturnsSwitchError(t *testing.T) {
 		action:      "generateContent",
 		body:        []byte(`{"input":"test"}`),
 		accountRepo: repo,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -548,7 +548,7 @@ func TestAntigravityRetryLoop_HandleSmartRetry_SwitchError_Propagates(t *testing
 		httpUpstream:    upstream,
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})
@@ -604,7 +604,7 @@ func TestHandleSmartRetry_NetworkError_ExhaustsRetry(t *testing.T) {
 		body:         []byte(`{"input":"test"}`),
 		httpUpstream: upstream,
 		accountRepo:  repo,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -662,7 +662,7 @@ func TestHandleSmartRetry_NoRetryDelay_UsesDefaultRateLimit(t *testing.T) {
 		body:            []byte(`{"input":"test"}`),
 		accountRepo:     repo,
 		isStickySession: true,
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -754,7 +754,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_FailedRetry_ClearsSession(t *
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-abc",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -842,7 +842,7 @@ func TestHandleSmartRetry_ShortDelay_NonStickySession_FailedRetry_NoDeleteSessio
 		isStickySession: false,
 		groupID:         42,
 		sessionHash:     "", // 非粘性会话，sessionHash 为空
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -918,7 +918,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_FailedRetry_NilCache_NoPanic(
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-nil-cache",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -983,7 +983,7 @@ func TestHandleSmartRetry_ShortDelay_StickySession_SuccessRetry_NoDeleteSession(
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-success",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1043,7 +1043,7 @@ func TestHandleSmartRetry_LongDelay_StickySession_NoDeleteInHandleSmartRetry(t *
 		isStickySession: true,
 		groupID:         42,
 		sessionHash:     "sticky-hash-long-delay",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1108,7 +1108,7 @@ func TestHandleSmartRetry_ShortDelay_NetworkError_StickySession_ClearsSession(t
 		isStickySession: true,
 		groupID:         99,
 		sessionHash:     "sticky-net-error",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1188,7 +1188,7 @@ func TestHandleSmartRetry_ShortDelay_503_StickySession_FailedRetry_ClearsSession
 		isStickySession: true,
 		groupID:         77,
 		sessionHash:     "sticky-503-short",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	}
@@ -1278,7 +1278,7 @@ func TestAntigravityRetryLoop_SmartRetryFailed_StickySession_SwitchErrorPropagat
 		isStickySession: true,
 		groupID:         55,
 		sessionHash:     "sticky-loop-test",
-		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, quotaScope AntigravityQuotaScope, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
+		handleError: func(ctx context.Context, prefix string, account *Account, statusCode int, headers http.Header, body []byte, requestedModel string, groupID int64, sessionHash string, isStickySession bool) *handleModelRateLimitResult {
 			return nil
 		},
 	})
@@ -1296,4 +1296,4 @@ func TestAntigravityRetryLoop_SmartRetryFailed_StickySession_SwitchErrorPropagat
 	require.Len(t, cache.deleteCalls, 1, "should clear sticky session in handleSmartRetry")
 	require.Equal(t, int64(55), cache.deleteCalls[0].groupID)
 	require.Equal(t, "sticky-loop-test", cache.deleteCalls[0].sessionHash)
-}
\ No newline at end of file
+}