Commit 839ab37d authored by yangjianbo's avatar yangjianbo
Browse files
parents 9dd0ef18 fd8473f2
package service
import (
"encoding/json"
"strings"
"time"
"github.com/gin-gonic/gin"
)
// Gin context keys used by Ops error logger for capturing upstream error details.
// These keys are set by gateway services and consumed by handler/ops_error_logger.go.
const (
	// OpsUpstreamStatusCodeKey holds the upstream HTTP status code (int).
	OpsUpstreamStatusCodeKey = "ops_upstream_status_code"
	// OpsUpstreamErrorMessageKey holds the trimmed upstream error message (string).
	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
	// OpsUpstreamErrorDetailKey holds the trimmed upstream error detail (string).
	OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
	// OpsUpstreamErrorsKey holds the accumulated per-request error events
	// ([]*OpsUpstreamErrorEvent) appended by appendOpsUpstreamError.
	OpsUpstreamErrorsKey = "ops_upstream_errors"
)
// setOpsUpstreamError records upstream error context (status code, message,
// detail) on the gin context for later consumption by the ops error logger.
// Non-positive status codes and blank (whitespace-only) strings are skipped,
// and a nil context is a no-op.
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
	if c == nil {
		return
	}
	if upstreamStatusCode > 0 {
		c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode)
	}
	message := strings.TrimSpace(upstreamMessage)
	if message != "" {
		c.Set(OpsUpstreamErrorMessageKey, message)
	}
	detail := strings.TrimSpace(upstreamDetail)
	if detail != "" {
		c.Set(OpsUpstreamErrorDetailKey, detail)
	}
}
// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request.
// It is stored in ops_error_logs.upstream_errors as a JSON array.
type OpsUpstreamErrorEvent struct {
	// AtUnixMs is the event time in Unix milliseconds; appendOpsUpstreamError
	// fills it with time.Now().UnixMilli() when left zero or negative.
	AtUnixMs int64 `json:"at_unix_ms,omitempty"`
	// Context
	Platform  string `json:"platform,omitempty"`
	AccountID int64  `json:"account_id,omitempty"`
	// Outcome
	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`
	// Kind: http_error | request_error | retry_exhausted | failover
	Kind string `json:"kind,omitempty"`
	// Message is sanitized via sanitizeUpstreamErrorMessage before storage.
	Message string `json:"message,omitempty"`
	Detail  string `json:"detail,omitempty"`
}
// appendOpsUpstreamError normalizes ev (default timestamp, trimmed fields,
// sanitized message) and appends it to the per-request event list stored under
// OpsUpstreamErrorsKey on the gin context. A nil context is a no-op.
func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
	if c == nil {
		return
	}
	if ev.AtUnixMs <= 0 {
		ev.AtUnixMs = time.Now().UnixMilli()
	}
	// Trim whitespace from every free-form string field.
	for _, field := range []*string{&ev.Platform, &ev.UpstreamRequestID, &ev.Kind, &ev.Message, &ev.Detail} {
		*field = strings.TrimSpace(*field)
	}
	if ev.Message != "" {
		ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
	}
	var events []*OpsUpstreamErrorEvent
	if raw, found := c.Get(OpsUpstreamErrorsKey); found {
		if prior, castOK := raw.([]*OpsUpstreamErrorEvent); castOK {
			events = prior
		}
	}
	// ev is a value parameter, so &ev already points at a private copy.
	c.Set(OpsUpstreamErrorsKey, append(events, &ev))
}
// marshalOpsUpstreamErrors serializes the event list to a JSON array string
// for storage in ops_error_logs.upstream_errors. It returns nil for an empty
// list or on any marshal failure, so callers never persist invalid JSON.
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
	if len(events) == 0 {
		return nil
	}
	encoded, err := json.Marshal(events)
	if err != nil || len(encoded) == 0 {
		// Never store an invalid/empty JSON value; nil means "no data".
		return nil
	}
	out := string(encoded)
	return &out
}
package service
import (
"context"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetWindowStats returns lightweight request/token counts for the provided window.
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing
// percentiles or peaks.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	return s.opsRepo.GetWindowStats(ctx, &OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
	})
}
...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc ...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
} }
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
if upstreamMsg != "" {
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
}
switch statusCode { switch statusCode {
case 401: case 401:
// 认证失败:停止调度,记录错误 // 认证失败:停止调度,记录错误
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") msg := "Authentication failed (401): invalid or expired credentials"
if upstreamMsg != "" {
msg = "Authentication failed (401): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 402: case 402:
// 支付要求:余额不足或计费问题,停止调度 // 支付要求:余额不足或计费问题,停止调度
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") msg := "Payment required (402): insufficient balance or billing issue"
if upstreamMsg != "" {
msg = "Payment required (402): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 403: case 403:
// 禁止访问:停止调度,记录错误 // 禁止访问:停止调度,记录错误
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") msg := "Access forbidden (403): account may be suspended or lack permissions"
if upstreamMsg != "" {
msg = "Access forbidden (403): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 429: case 429:
s.handle429(ctx, account, headers) s.handle429(ctx, account, headers)
......
...@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
} }
// LinuxDo Connect OAuth 登录(终端用户 SSO) // LinuxDo Connect OAuth 登录
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled) updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
// Ops monitoring (vNext)
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
err := s.settingRepo.SetMultiple(ctx, updates) err := s.settingRepo.SetMultiple(ctx, updates)
if err == nil && s.onUpdate != nil { if err == nil && s.onUpdate != nil {
s.onUpdate() // Invalidate cache after settings update s.onUpdate() // Invalidate cache after settings update
...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { ...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults // Identity patch defaults
SettingKeyEnableIdentityPatch: "true", SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "", SettingKeyIdentityPatchPrompt: "",
// Ops monitoring defaults (vNext)
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin ...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
} }
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
return result // Ops monitoring settings (default: enabled, fail-open)
} result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
// result.OpsMetricsIntervalSeconds = 60
// 优先级: if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 if v, err := strconv.Atoi(raw); err == nil {
// - 否则回退到 config.yaml/env 的值 if v < 60 {
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { v = 60
if s == nil || s.cfg == nil { }
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") if v > 3600 {
} v = 3600
}
effective := s.cfg.LinuxDo result.OpsMetricsIntervalSeconds = v
}
keys := []string{
SettingKeyLinuxDoConnectEnabled,
SettingKeyLinuxDoConnectClientID,
SettingKeyLinuxDoConnectClientSecret,
SettingKeyLinuxDoConnectRedirectURL,
}
settings, err := s.settingRepo.GetMultiple(ctx, keys)
if err != nil {
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
}
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
effective.Enabled = raw == "true"
}
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
effective.ClientID = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
effective.ClientSecret = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
effective.RedirectURL = strings.TrimSpace(v)
}
if !effective.Enabled {
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
}
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
if strings.TrimSpace(effective.ClientID) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
}
if strings.TrimSpace(effective.AuthorizeURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
}
if strings.TrimSpace(effective.TokenURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
}
if strings.TrimSpace(effective.UserInfoURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
}
if strings.TrimSpace(effective.RedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
}
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
} }
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { return result
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") }
}
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
}
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
}
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) func isFalseSettingValue(value string) bool {
switch method { switch strings.ToLower(strings.TrimSpace(value)) {
case "", "client_secret_post", "client_secret_basic": case "false", "0", "off", "disabled":
if strings.TrimSpace(effective.ClientSecret) == "" { return true
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
}
case "none":
if !effective.UsePKCE {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
}
default: default:
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") return false
} }
return effective, nil
} }
// getStringOrDefault 获取字符串值或默认值 // getStringOrDefault 获取字符串值或默认值
...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) ...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
} }
return value return value
} }
// GetLinuxDoConnectOAuthConfig returns the effective LinuxDo Connect
// configuration used for login.
//
// Precedence:
//   - if the corresponding system-setting key exists, it overrides the config.yaml/env value
//   - otherwise the config.yaml/env value is used as-is
//
// It returns a NotFound error when the feature is disabled, and an
// InternalServer OAUTH_CONFIG_INVALID error when any required field is
// missing or malformed.
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
	if s == nil || s.cfg == nil {
		return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
	}
	// Start from the file/env configuration; DB settings override below.
	effective := s.cfg.LinuxDo
	keys := []string{
		SettingKeyLinuxDoConnectEnabled,
		SettingKeyLinuxDoConnectClientID,
		SettingKeyLinuxDoConnectClientSecret,
		SettingKeyLinuxDoConnectRedirectURL,
	}
	settings, err := s.settingRepo.GetMultiple(ctx, keys)
	if err != nil {
		return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
	}
	// Enabled is overridden whenever the key exists; string values are only
	// taken when non-blank so an empty DB value falls back to config/env.
	if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
		effective.Enabled = raw == "true"
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
		effective.ClientID = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
		effective.ClientSecret = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
		effective.RedirectURL = strings.TrimSpace(v)
	}
	if !effective.Enabled {
		return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
	}
	// Basic sanity validation (avoid redirecting the user into an OAuth flow
	// that is guaranteed to fail or is unsafe).
	if strings.TrimSpace(effective.ClientID) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
	}
	if strings.TrimSpace(effective.AuthorizeURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
	}
	if strings.TrimSpace(effective.TokenURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
	}
	if strings.TrimSpace(effective.UserInfoURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
	}
	if strings.TrimSpace(effective.RedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
	}
	if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
	}
	// Structural URL validation after the presence checks above.
	if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
	}
	if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
	}
	// Token endpoint auth method: secret-based methods require a client secret;
	// "none" requires PKCE to be enabled.
	method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
	switch method {
	case "", "client_secret_post", "client_secret_basic":
		if strings.TrimSpace(effective.ClientSecret) == "" {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
		}
	case "none":
		if !effective.UsePKCE {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
		}
	default:
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
	}
	return effective, nil
}
...@@ -18,7 +18,7 @@ type SystemSettings struct { ...@@ -18,7 +18,7 @@ type SystemSettings struct {
TurnstileSecretKey string TurnstileSecretKey string
TurnstileSecretKeyConfigured bool TurnstileSecretKeyConfigured bool
// LinuxDo Connect OAuth 登录(终端用户 SSO) // LinuxDo Connect OAuth 登录
LinuxDoConnectEnabled bool LinuxDoConnectEnabled bool
LinuxDoConnectClientID string LinuxDoConnectClientID string
LinuxDoConnectClientSecret string LinuxDoConnectClientSecret string
...@@ -46,6 +46,12 @@ type SystemSettings struct { ...@@ -46,6 +46,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"` EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {
......
package service package service
import ( import (
"database/sql"
"time" "time"
"github.com/Wei-Shaw/sub2api/internal/config" "github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire" "github.com/google/wire"
"github.com/redis/go-redis/v9"
) )
// BuildInfo contains build information // BuildInfo contains build information
...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi ...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc return svc
} }
// ProvideOpsMetricsCollector creates an OpsMetricsCollector and starts it
// immediately so collection begins as soon as the dependency graph is wired.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
	collector.Start()
	return collector
}
// ProvideOpsAggregationService creates the hourly/daily pre-aggregation
// service and starts it before handing it to the wire graph.
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	aggregation := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	aggregation.Start()
	return aggregation
}
// ProvideOpsAlertEvaluatorService creates the alert-rule evaluator and starts
// it before returning it to the wire graph.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	evaluator := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	evaluator.Start()
	return evaluator
}
// ProvideOpsCleanupService creates the cron-scheduled cleanup service and
// starts it before returning it to the wire graph.
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	cleanup := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	cleanup.Start()
	return cleanup
}
// ProvideOpsScheduledReportService creates the scheduled-report service and
// starts it before returning it to the wire graph.
func ProvideOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	reports := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
	reports.Start()
	return reports
}
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
return apiKeyService return apiKeyService
...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( ...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService, NewAccountUsageService,
NewAccountTestService, NewAccountTestService,
NewSettingService, NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
ProvideOpsScheduledReportService,
NewEmailService, NewEmailService,
ProvideEmailQueueService, ProvideEmailQueueService,
NewTurnstileService, NewTurnstileService,
......
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
-- Intentionally no foreign keys (high-write table); *_id columns are plain
-- BIGINTs correlated in the application layer.
CREATE TABLE IF NOT EXISTS ops_error_logs (
    id BIGSERIAL PRIMARY KEY,
    -- Correlation / identities
    request_id VARCHAR(64),
    client_request_id VARCHAR(64),
    user_id BIGINT,
    api_key_id BIGINT,
    account_id BIGINT,
    group_id BIGINT,
    client_ip inet,
    -- Dimensions for global filtering
    platform VARCHAR(32),
    -- Request metadata
    model VARCHAR(100),
    request_path VARCHAR(256),
    stream BOOLEAN NOT NULL DEFAULT false,
    user_agent TEXT,
    -- Core error classification
    error_phase VARCHAR(32) NOT NULL,
    error_type VARCHAR(64) NOT NULL,
    severity VARCHAR(8) NOT NULL DEFAULT 'P2',
    status_code INT,
    -- vNext metric semantics
    is_business_limited BOOLEAN NOT NULL DEFAULT false,
    -- Error details (sanitized/truncated at ingest time)
    error_message TEXT,
    error_body TEXT,
    -- Provider/upstream details (optional; useful for trends & account health)
    error_source VARCHAR(64),
    error_owner VARCHAR(32),
    account_status VARCHAR(50),
    upstream_status_code INT,
    upstream_error_message TEXT,
    upstream_error_detail TEXT,
    provider_error_code VARCHAR(64),
    provider_error_type VARCHAR(64),
    network_error_type VARCHAR(50),
    retry_after_seconds INT,
    -- Timings (ms) - optional
    duration_ms INT,
    time_to_first_token_ms BIGINT,
    auth_latency_ms BIGINT,
    routing_latency_ms BIGINT,
    upstream_latency_ms BIGINT,
    response_latency_ms BIGINT,
    -- Retry context (only stored for error requests)
    request_body JSONB,
    request_headers JSONB,
    request_body_truncated BOOLEAN NOT NULL DEFAULT false,
    request_body_bytes INT,
    -- Retryability flags (best-effort classification)
    is_retryable BOOLEAN NOT NULL DEFAULT false,
    retry_count INT NOT NULL DEFAULT 0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    requested_by_user_id BIGINT,
    -- ops_error_logs.id that this retry originated from (no FK by design)
    source_error_id BIGINT,
    -- client|upstream
    mode VARCHAR(16) NOT NULL,
    -- account forced for "upstream" mode retries
    pinned_account_id BIGINT,
    -- queued|running|succeeded|failed
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    started_at TIMESTAMPTZ,
    finished_at TIMESTAMPTZ,
    duration_ms BIGINT,
    -- Optional result correlation
    result_request_id VARCHAR(64),
    result_error_id BIGINT,
    result_usage_request_id VARCHAR(64),
    error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
CREATE TABLE IF NOT EXISTS ops_system_metrics (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Length of the aggregation window this row summarizes
    window_minutes INT NOT NULL DEFAULT 1,
    -- Optional dimensions (only if collector chooses to write per-dimension snapshots)
    platform VARCHAR(32),
    group_id BIGINT,
    -- Core counts
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,
    token_consumed BIGINT NOT NULL DEFAULT 0,
    -- Rates
    qps DOUBLE PRECISION,
    tps DOUBLE PRECISION,
    -- Duration percentiles (ms) - success requests
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    duration_avg_ms DOUBLE PRECISION,
    duration_max_ms INT,
    -- TTFT percentiles (ms) - success requests (streaming)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    ttft_avg_ms DOUBLE PRECISION,
    ttft_max_ms INT,
    -- System resources
    cpu_usage_percent DOUBLE PRECISION,
    memory_used_mb BIGINT,
    memory_total_mb BIGINT,
    memory_usage_percent DOUBLE PRECISION,
    -- Dependency health (best-effort)
    db_ok BOOLEAN,
    redis_ok BOOLEAN,
    -- DB pool & runtime
    db_conn_active INT,
    db_conn_idle INT,
    db_conn_waiting INT,
    goroutine_count INT,
    -- Queue / concurrency
    concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per job, keyed by job name; upserted on each run.
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
    job_name VARCHAR(64) PRIMARY KEY,
    last_run_at TIMESTAMPTZ,
    last_success_at TIMESTAMPTZ,
    last_error_at TIMESTAMPTZ,
    last_error TEXT,
    last_duration_ms BIGINT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
CREATE TABLE IF NOT EXISTS ops_alert_rules (
    id BIGSERIAL PRIMARY KEY,
    name VARCHAR(128) NOT NULL,
    description TEXT,
    enabled BOOLEAN NOT NULL DEFAULT true,
    severity VARCHAR(16) NOT NULL DEFAULT 'warning',
    -- Metric definition
    metric_type VARCHAR(64) NOT NULL,
    operator VARCHAR(8) NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,
    window_minutes INT NOT NULL DEFAULT 5,
    sustained_minutes INT NOT NULL DEFAULT 5,
    cooldown_minutes INT NOT NULL DEFAULT 10,
    -- Optional scoping: platform/group filters etc.
    filters JSONB,
    last_triggered_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Rule names are unique so seeding/upserts can key on name.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
ON ops_alert_rules (enabled);
-- Fired/resolved alert occurrences produced by evaluating ops_alert_rules.
CREATE TABLE IF NOT EXISTS ops_alert_events (
    id BIGSERIAL PRIMARY KEY,
    -- ops_alert_rules.id (no FK by design)
    rule_id BIGINT,
    severity VARCHAR(16) NOT NULL,
    status VARCHAR(16) NOT NULL DEFAULT 'firing',
    title VARCHAR(200),
    description TEXT,
    metric_value DOUBLE PRECISION,
    threshold_value DOUBLE PRECISION,
    dimensions JSONB,
    fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    resolved_at TIMESTAMPTZ,
    email_sent BOOLEAN NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
-- NOTE(review): SET LOCAL is a no-op outside a transaction block — confirm the migration runner wraps each file in one.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
id BIGSERIAL PRIMARY KEY,
bucket_start TIMESTAMPTZ NOT NULL,
-- Dimension columns (see design note): both NULL = overall; platform set = per-platform; group_id set = per-group.
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Duration percentiles (ms)
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
-- TTFT percentiles (ms)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three “dimension modes” (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
ON ops_metrics_hourly (
bucket_start,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
ON ops_metrics_hourly (bucket_start DESC);
-- Partial indexes matching the two non-overall dimension modes.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
ON ops_metrics_hourly (platform, bucket_start DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
ON ops_metrics_hourly (group_id, bucket_start DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same counters and dimension rules as ops_metrics_hourly, bucketed by calendar date.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
id BIGSERIAL PRIMARY KEY,
bucket_date DATE NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- COALESCE-based uniqueness (NULL dims normalize to ''/0), same rationale as the hourly table.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
ON ops_metrics_daily (
bucket_date,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
ON ops_metrics_daily (platform, bucket_date DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
ON ops_metrics_daily (group_id, bucket_date DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
-- NOTE(review): SET LOCAL is a no-op outside a transaction block — confirm the migration runner wraps each file in one.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- ops_error_logs: time-ordered listing plus per-dimension filtered scans.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
ON ops_error_logs (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
ON ops_error_logs (account_id, created_at DESC)
WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
ON ops_error_logs (error_type, created_at DESC);
-- Exact-match lookups by correlation ids (trigram variants for fuzzy search are created below).
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
ON ops_system_metrics (platform, created_at DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
ON ops_system_metrics (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
ON ops_retry_attempts (source_error_id, created_at DESC)
WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
-- Partial unique index: at most one 'queued'/'running' attempt per source error at a time.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
ON ops_retry_attempts (source_error_id)
WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Best-effort: extension creation failures are downgraded to a NOTICE, and the trigram
-- indexes are only created when pg_trgm is confirmed installed.
DO $$
BEGIN
BEGIN
CREATE EXTENSION IF NOT EXISTS pg_trgm;
EXCEPTION WHEN OTHERS THEN
-- Missing privileges or extension package should not block migrations.
RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
END;
IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
-- request_id / client_request_id fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
ON ops_error_logs USING gin (request_id gin_trgm_ops)';
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
-- error_message fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
ON ops_error_logs USING gin (error_message gin_trgm_ops)';
END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table: averages are fractional (DOUBLE PRECISION); maxima are whole milliseconds (INT).
ALTER TABLE ops_metrics_hourly
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table
ALTER TABLE ops_metrics_daily
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Default true: existing rules keep email notifications enabled after upgrade.
ALTER TABLE ops_alert_rules
ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
-- - upstream_error_rate excludes 429/529.
-- NOTE(review): severities seeded here are 'P0'/'P1'/'P2' while ops_alert_rules.severity
-- defaults to 'warning' — confirm the accepted severity value set is consistent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- 1) High error rate (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'错误率过高',
'当错误率超过 5% 且持续 5 分钟时触发告警',
true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 2) Low success rate (P0)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'成功率过低',
'当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 3) P99 latency too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'P99延迟过高',
'当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 4) P95 latency too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'P95延迟过高',
'当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 5) CPU usage too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'CPU使用率过高',
'当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 6) Memory usage too high (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'内存使用率过高',
'当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 7) Concurrency queue buildup (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'并发队列积压',
'当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 8) Extremely high error rate (P0)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'错误率极高',
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- Two small follow-up migrations: Redis pool stats columns, then upstream error events column.
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
-- This migration is intentionally idempotent.
ALTER TABLE ops_system_metrics
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
--
-- This is intentionally idempotent.
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,41 @@ redis: ...@@ -302,6 +302,41 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Hard switch: disable all ops background jobs and APIs when false
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
enabled: true
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
# 优先使用预聚合表(用于长时间窗口查询性能)
use_preaggregated_tables: false
# Data cleanup configuration
# 数据清理配置(vNext 默认统一保留 30 天)
cleanup:
enabled: true
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
schedule: "0 2 * * *"
error_log_retention_days: 30
minute_metrics_retention_days: 30
hourly_metrics_retention_days: 30
# Pre-aggregation configuration
# 预聚合任务配置
aggregation:
enabled: true
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
metrics_collector_cache:
enabled: true
ttl: 65s
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= ...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
GEMINI_QUOTA_POLICY= GEMINI_QUOTA_POLICY=
# -----------------------------------------------------------------------------
# Ops Monitoring Configuration (运维监控配置)
# -----------------------------------------------------------------------------
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
OPS_ENABLED=true
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Update Configuration (在线更新配置) # Update Configuration (在线更新配置)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
......
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,19 @@ redis: ...@@ -302,6 +302,19 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
enabled: true
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -17,6 +17,7 @@ import usageAPI from './usage' ...@@ -17,6 +17,7 @@ import usageAPI from './usage'
import geminiAPI from './gemini' import geminiAPI from './gemini'
import antigravityAPI from './antigravity' import antigravityAPI from './antigravity'
import userAttributesAPI from './userAttributes' import userAttributesAPI from './userAttributes'
import opsAPI from './ops'
/** /**
* Unified admin API object for convenient access * Unified admin API object for convenient access
...@@ -35,7 +36,8 @@ export const adminAPI = { ...@@ -35,7 +36,8 @@ export const adminAPI = {
usage: usageAPI, usage: usageAPI,
gemini: geminiAPI, gemini: geminiAPI,
antigravity: antigravityAPI, antigravity: antigravityAPI,
userAttributes: userAttributesAPI userAttributes: userAttributesAPI,
ops: opsAPI
} }
export { export {
...@@ -52,7 +54,8 @@ export { ...@@ -52,7 +54,8 @@ export {
usageAPI, usageAPI,
geminiAPI, geminiAPI,
antigravityAPI, antigravityAPI,
userAttributesAPI userAttributesAPI,
opsAPI
} }
export default adminAPI export default adminAPI
/**
* Admin Ops API endpoints (vNext)
* - Error logs list/detail + retry (client/upstream)
* - Dashboard overview (raw path)
*/
import { apiClient } from '../client'
import type { PaginatedResponse } from '@/types'
/** Retry mode accepted by the ops retry endpoints. */
export type OpsRetryMode = 'client' | 'upstream'
/** Dashboard query mode: automatic selection, raw-log scan, or pre-aggregated tables. */
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
/** Common per-request options (e.g. cancellation via AbortSignal). */
export interface OpsRequestOptions {
signal?: AbortSignal
}
/** Request body for a retry: which mode, optionally pinning a specific account. */
export interface OpsRetryRequest {
mode: OpsRetryMode
pinned_account_id?: number
}
/** Outcome of a single retry attempt as returned by the backend. */
export interface OpsRetryResult {
attempt_id: number
mode: OpsRetryMode
status: 'running' | 'succeeded' | 'failed' | string
pinned_account_id?: number | null
used_account_id?: number | null
http_status_code: number
upstream_request_id: string
// Truncated preview of the response body (response_truncated tells whether it was cut).
response_preview: string
response_truncated: boolean
error_message: string
started_at: string
finished_at: string
duration_ms: number
}
/** Aggregated dashboard overview for a time window, optionally scoped to platform/group. */
export interface OpsDashboardOverview {
start_time: string
end_time: string
platform: string
group_id?: number | null
health_score?: number
system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null
success_count: number
error_count_total: number
// Business-limited requests are tracked separately and excluded from SLA-scope counts.
business_limited_count: number
error_count_sla: number
request_count_total: number
request_count_sla: number
token_consumed: number
sla: number
error_rate: number
// Upstream error rate excludes 429/529, which are counted separately below.
upstream_error_rate: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
qps: {
current: number
peak: number
avg: number
}
tps: {
current: number
peak: number
avg: number
}
duration: OpsPercentiles
ttft: OpsPercentiles
}
/** Latency summary in milliseconds; fields are null/absent when not computed. */
export interface OpsPercentiles {
p50_ms?: number | null
p90_ms?: number | null
p95_ms?: number | null
p99_ms?: number | null
avg_ms?: number | null
max_ms?: number | null
}
/** One throughput trend bucket. */
export interface OpsThroughputTrendPoint {
bucket_start: string
request_count: number
token_consumed: number
qps: number
tps: number
}
/** Per-platform throughput breakdown row. */
export interface OpsThroughputPlatformBreakdownItem {
platform: string
request_count: number
token_consumed: number
}
/** Per-group throughput breakdown row. */
export interface OpsThroughputGroupBreakdownItem {
group_id: number
group_name: string
request_count: number
token_consumed: number
}
/** Throughput trend response: bucket granularity label, points, optional breakdowns. */
export interface OpsThroughputTrendResponse {
bucket: string
points: OpsThroughputTrendPoint[]
by_platform?: OpsThroughputPlatformBreakdownItem[]
top_groups?: OpsThroughputGroupBreakdownItem[]
}
/** Outcome kind of a request detail row. */
export type OpsRequestKind = 'success' | 'error'
/** Kind filter for the request-details endpoint ('all' = both kinds). */
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
/** Sort orders supported by the request-details endpoint. */
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
/** One request detail row (success or error). */
export interface OpsRequestDetail {
kind: OpsRequestKind
created_at: string
request_id: string
platform?: string
model?: string
duration_ms?: number | null
status_code?: number | null
// Present on error rows; references the ops_error_logs entry for drill-down/retry.
error_id?: number | null
phase?: string
severity?: string
message?: string
user_id?: number | null
api_key_id?: number | null
account_id?: number | null
group_id?: number | null
stream?: boolean
}
/** Query parameters for the request-details list endpoint. */
export interface OpsRequestDetailsParams {
// Preset window shortcut; explicit start_time/end_time may be supplied instead — precedence is server-side.
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
kind?: OpsRequestDetailsKind
platform?: string
group_id?: number | null
user_id?: number
api_key_id?: number
account_id?: number
model?: string
request_id?: string
// Free-text search query.
q?: string
min_duration_ms?: number
max_duration_ms?: number
sort?: OpsRequestDetailsSort
page?: number
page_size?: number
}
/** Paginated request-details response. */
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
/** One latency histogram bucket: human-readable range label plus request count. */
export interface OpsLatencyHistogramBucket {
range: string
count: number
}
/** Latency histogram for a time window. */
export interface OpsLatencyHistogramResponse {
start_time: string
end_time: string
platform: string
group_id?: number | null
total_requests: number
buckets: OpsLatencyHistogramBucket[]
}
/** Error trend bucket with the vNext error taxonomy counters. */
export interface OpsErrorTrendPoint {
bucket_start: string
error_count_total: number
business_limited_count: number
error_count_sla: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
}
/** Error trend response: bucket granularity label plus points. */
export interface OpsErrorTrendResponse {
bucket: string
points: OpsErrorTrendPoint[]
}
/** Error counts grouped by HTTP status code. */
export interface OpsErrorDistributionItem {
status_code: number
total: number
sla: number
business_limited: number
}
/** Error distribution response. */
export interface OpsErrorDistributionResponse {
total: number
items: OpsErrorDistributionItem[]
}
/** Point-in-time system metrics snapshot (mirrors ops_system_metrics rows). */
export interface OpsSystemMetricsSnapshot {
id: number
created_at: string
window_minutes: number
cpu_usage_percent?: number | null
memory_used_mb?: number | null
memory_total_mb?: number | null
memory_usage_percent?: number | null
db_ok?: boolean | null
redis_ok?: boolean | null
// Config-derived limits (best-effort) for rendering "current vs max".
db_max_open_conns?: number | null
redis_pool_size?: number | null
// Redis pool stats (go-redis PoolStats total/idle connections).
redis_conn_total?: number | null
redis_conn_idle?: number | null
db_conn_active?: number | null
db_conn_idle?: number | null
db_conn_waiting?: number | null
goroutine_count?: number | null
concurrency_queue_depth?: number | null
}
/** Background job heartbeat (mirrors ops_job_heartbeats rows). */
export interface OpsJobHeartbeat {
job_name: string
last_run_at?: string | null
last_success_at?: string | null
last_error_at?: string | null
last_error?: string | null
last_duration_ms?: number | null
updated_at: string
}
/** Per-platform concurrency usage. */
export interface PlatformConcurrencyInfo {
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Per-group concurrency usage. */
export interface GroupConcurrencyInfo {
group_id: number
group_name: string
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Per-account concurrency usage. */
export interface AccountConcurrencyInfo {
account_id: number
account_name?: string
platform: string
group_id: number
group_name: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Concurrency stats maps; keys are strings (presumably platform name / group id / account id — confirm with backend). */
export interface OpsConcurrencyStatsResponse {
enabled: boolean
platform: Record<string, PlatformConcurrencyInfo>
group: Record<string, GroupConcurrencyInfo>
account: Record<string, AccountConcurrencyInfo>
timestamp?: string
}
/**
 * Fetch realtime concurrency statistics (platform/group/account maps).
 *
 * @param platform Optional platform filter; omitted when falsy.
 * @param groupId  Optional group filter; sent only when a positive number.
 */
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
  const params: Record<string, any> = {}
  if (platform) params.platform = platform
  const hasGroupFilter = typeof groupId === 'number' && groupId > 0
  if (hasGroupFilter) params.group_id = groupId
  const response = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
  return response.data
}
/** Per-platform account availability summary. */
export interface PlatformAvailability {
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
/** Per-group account availability summary. */
export interface GroupAvailability {
group_id: number
group_name: string
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
/** Per-account availability detail: rate-limit, overload, and error state. */
export interface AccountAvailability {
account_id: number
account_name: string
platform: string
group_id: number
group_name: string
status: string
is_available: boolean
is_rate_limited: boolean
rate_limit_reset_at?: string
rate_limit_remaining_sec?: number
is_overloaded: boolean
overload_until?: string
overload_remaining_sec?: number
has_error: boolean
error_message?: string
}
/** Availability stats maps; keys are strings (presumably platform name / group id / account id — confirm with backend). */
export interface OpsAccountAvailabilityStatsResponse {
enabled: boolean
platform: Record<string, PlatformAvailability>
group: Record<string, GroupAvailability>
account: Record<string, AccountAvailability>
timestamp?: string
}
/**
 * Fetch account availability statistics (platform/group/account maps).
 *
 * @param platform Optional platform filter; omitted when falsy.
 * @param groupId  Optional group filter; sent only when a positive number.
 */
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
  const params: Record<string, any> = {}
  if (platform) params.platform = platform
  if (typeof groupId === 'number' && groupId > 0) params.group_id = groupId
  const res = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
  return res.data
}
/**
 * Subscribe to realtime QPS updates via WebSocket.
 *
 * Note: browsers cannot set Authorization headers for WebSockets.
 * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
 * ["sub2api-admin", "jwt.<token>"]
 */
export interface SubscribeQPSOptions {
// Admin JWT; subscribeQPS falls back to localStorage 'auth_token' when omitted.
token?: string | null
onOpen?: () => void
onClose?: (event: CloseEvent) => void
onError?: (event: Event) => void
/**
 * Called when the server closes with an application close code that indicates
 * reconnecting is not useful (e.g. feature flag disabled).
 */
onFatalClose?: (event: CloseEvent) => void
/**
 * More granular status updates for UI (connecting/reconnecting/offline/etc).
 */
onStatusChange?: (status: OpsWSStatus) => void
/**
 * Called when a reconnect is scheduled (helps display "retry in Xs").
 */
onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
// Host override; defaults to VITE_WS_BASE_URL or window.location.host.
wsBaseUrl?: string
/**
 * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
 * Set to 0 to disable reconnect.
 */
maxReconnectAttempts?: number
// Backoff tuning: base delay (default 1000ms) and cap (default 30000ms).
reconnectBaseDelayMs?: number
reconnectMaxDelayMs?: number
/**
 * Stale connection detection (heartbeat-by-observation).
 * If no messages are received within this window, the socket is closed to trigger a reconnect.
 * Set to 0 to disable.
 */
staleTimeoutMs?: number
/**
 * How often to check staleness. Only used when `staleTimeoutMs > 0`.
 */
staleCheckIntervalMs?: number
}
/** Connection lifecycle status reported via SubscribeQPSOptions.onStatusChange. */
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
/** Application-level WebSocket close codes recognized by the client. */
export const OPS_WS_CLOSE_CODES = {
// Server indicates realtime is disabled; the client stops reconnecting on this code.
REALTIME_DISABLED: 4001
} as const
// First subprotocol item; the optional "jwt.<token>" item is appended after it.
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
let ws: WebSocket | null = null
let reconnectAttempts = 0
const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
? (options.maxReconnectAttempts as number)
: Infinity
const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
let reconnectTimer: ReturnType<typeof setTimeout> | null = null
let shouldReconnect = true
let isConnecting = false
let hasConnectedOnce = false
let lastMessageAt = 0
const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
let staleTimer: ReturnType<typeof setInterval> | null = null
const setStatus = (status: OpsWSStatus) => {
options.onStatusChange?.(status)
}
const clearReconnectTimer = () => {
if (reconnectTimer) {
clearTimeout(reconnectTimer)
reconnectTimer = null
}
}
const clearStaleTimer = () => {
if (staleTimer) {
clearInterval(staleTimer)
staleTimer = null
}
}
const startStaleTimer = () => {
clearStaleTimer()
if (!staleTimeoutMs || staleTimeoutMs <= 0) return
staleTimer = setInterval(() => {
if (!shouldReconnect) return
if (!ws || ws.readyState !== WebSocket.OPEN) return
if (!lastMessageAt) return
const ageMs = Date.now() - lastMessageAt
if (ageMs > staleTimeoutMs) {
// Treat as a half-open connection; closing triggers the normal reconnect path.
ws.close()
}
}, staleCheckIntervalMs)
}
const scheduleReconnect = () => {
if (!shouldReconnect) return
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
// If we're offline, wait for the browser to come back online.
if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
setStatus('offline')
return
}
const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
const delay = Math.min(expDelay, maxDelayMs)
const jitter = Math.floor(Math.random() * 250)
clearReconnectTimer()
reconnectTimer = setTimeout(() => {
reconnectAttempts++
connect()
}, delay + jitter)
options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
}
const handleOnline = () => {
if (!shouldReconnect) return
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
connect()
}
const handleOffline = () => {
setStatus('offline')
}
// Open a WebSocket to the admin QPS stream and wire up all lifecycle handlers.
// No-op when reconnection is disabled, an attempt is already in flight, a
// socket is already open/connecting, or the reconnect budget is exhausted.
const connect = () => {
  if (!shouldReconnect) return
  if (isConnecting) return
  if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
  if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
  isConnecting = true
  setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
  const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
  const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
  // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
  // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
  // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
  const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
  const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
  if (rawToken) protocols.push(`jwt.${rawToken}`)
  try {
    ws = new WebSocket(wsURL.toString(), protocols)
  } catch (err) {
    // FIX: the WebSocket constructor can throw synchronously (SyntaxError for
    // an invalid URL or an illegal subprotocol token). Previously that left
    // `isConnecting = true` forever, permanently blocking every future
    // connect/reconnect attempt. Reset the flag and fall back to the normal
    // backoff path instead.
    isConnecting = false
    console.error('[OpsWS] Failed to create WebSocket:', err)
    scheduleReconnect()
    return
  }
  ws.onopen = () => {
    // Successful connection: reset backoff state and arm the staleness watchdog.
    reconnectAttempts = 0
    isConnecting = false
    hasConnectedOnce = true
    clearReconnectTimer()
    lastMessageAt = Date.now()
    startStaleTimer()
    setStatus('connected')
    options.onOpen?.()
  }
  ws.onmessage = (e) => {
    try {
      const data = JSON.parse(e.data)
      lastMessageAt = Date.now()
      onMessage(data)
    } catch (err) {
      // Malformed frame: log and drop; do not tear down the connection.
      console.warn('[OpsWS] Failed to parse message:', err)
    }
  }
  ws.onerror = (error) => {
    // onerror is always followed by onclose, which owns the reconnect logic.
    console.error('[OpsWS] Connection error:', error)
    options.onError?.(error)
  }
  ws.onclose = (event) => {
    isConnecting = false
    options.onClose?.(event)
    clearStaleTimer()
    ws = null
    // If the server explicitly tells us to stop reconnecting, honor it.
    if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
      shouldReconnect = false
      clearReconnectTimer()
      setStatus('closed')
      options.onFatalClose?.(event)
      return
    }
    scheduleReconnect()
  }
}
// Start listening for connectivity changes, then kick off the first connection.
window.addEventListener('online', handleOnline)
window.addEventListener('offline', handleOffline)
connect()
// Unsubscribe function: disables reconnection, removes listeners, cancels all
// timers, closes any live socket, and reports a final 'closed' status.
return () => {
shouldReconnect = false
window.removeEventListener('online', handleOnline)
window.removeEventListener('offline', handleOffline)
clearReconnectTimer()
clearStaleTimer()
if (ws) ws.close()
ws = null
setStatus('closed')
}
}
// Severity/phase values are server-defined open enums; kept as plain strings
// so new server values don't break the client.
export type OpsSeverity = string
export type OpsPhase = string
// Severity levels the alert/email configuration understands.
export type AlertSeverity = 'critical' | 'warning' | 'info'
// How a threshold is interpreted (absolute count, percentage, or both).
export type ThresholdMode = 'count' | 'percentage' | 'both'
// Metric identifiers an AlertRule can evaluate against.
export type MetricType =
| 'success_rate'
| 'error_rate'
| 'upstream_error_rate'
| 'p95_latency_ms'
| 'p99_latency_ms'
| 'cpu_usage_percent'
| 'memory_usage_percent'
| 'concurrency_queue_depth'
| 'group_available_accounts'
| 'group_available_ratio'
| 'group_rate_limit_ratio'
| 'account_rate_limited_count'
| 'account_error_count'
| 'account_error_ratio'
| 'overload_account_count'
// Comparison operators usable in alert-rule thresholds.
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
// An alert rule as exchanged with /admin/ops/alert-rules.
// Server-assigned fields (id, timestamps) are optional and absent on create.
export interface AlertRule {
id?: number
name: string
description?: string
enabled: boolean
// Which metric to evaluate, and how to compare it against `threshold`.
metric_type: MetricType
operator: Operator
threshold: number
// Evaluation window and sustain duration — presumably "condition must hold
// for sustained_minutes within window_minutes"; semantics are server-defined.
window_minutes: number
sustained_minutes: number
severity: OpsSeverity
// cooldown between firings — presumably minimum minutes before re-firing.
cooldown_minutes: number
notify_email: boolean
// Free-form dimension filters; schema is defined server-side.
filters?: Record<string, any>
created_at?: string
updated_at?: string
last_triggered_at?: string | null
}
// One firing/resolution record produced by an alert-rule evaluation.
export interface AlertEvent {
id: number
rule_id: number
severity: OpsSeverity | string
// 'firing' | 'resolved', widened with string for forward compatibility.
status: 'firing' | 'resolved' | string
title?: string
description?: string
// Observed metric value and the configured threshold it was compared against.
metric_value?: number
threshold_value?: number
dimensions?: Record<string, any>
fired_at: string
resolved_at?: string | null
email_sent: boolean
created_at: string
}
// Email notification settings, split into real-time alert mail and periodic
// report mail (as served by /admin/ops/email-notification/config).
export interface EmailNotificationConfig {
alert: {
enabled: boolean
recipients: string[]
// Minimum severity to mail about; '' presumably means "no filter" — confirm server-side.
min_severity: AlertSeverity | ''
rate_limit_per_hour: number
batching_window_seconds: number
include_resolved_alerts: boolean
}
report: {
enabled: boolean
recipients: string[]
// Each report type has an enabled flag plus a schedule string
// (format is server-defined; looks cron-like — confirm with backend).
daily_summary_enabled: boolean
daily_summary_schedule: string
weekly_summary_enabled: boolean
weekly_summary_schedule: string
error_digest_enabled: boolean
error_digest_schedule: string
error_digest_min_count: number
account_health_enabled: boolean
account_health_schedule: string
account_health_error_rate_threshold: number
}
}
// Distributed-lock settings for alert evaluation (key + TTL in seconds).
export interface OpsDistributedLockSettings {
enabled: boolean
key: string
ttl_seconds: number
}
// Runtime settings for the alert evaluator (DB-backed; /admin/ops/runtime/alert).
export interface OpsAlertRuntimeSettings {
evaluation_interval_seconds: number
distributed_lock: OpsDistributedLockSettings
// Alert silencing: a global mute window plus optional per-rule/per-severity entries.
silencing: {
enabled: boolean
global_until_rfc3339: string
global_reason: string
entries?: Array<{
rule_id?: number
severities?: Array<OpsSeverity | string>
until_rfc3339: string
reason: string
}>
}
}
// Advanced ops settings bundle (DB-backed; /admin/ops/advanced-settings).
export interface OpsAdvancedSettings {
data_retention: OpsDataRetentionSettings
aggregation: OpsAggregationSettings
}
// Retention windows (in days) for the various ops tables, plus the cleanup
// job toggle and its schedule string (format server-defined).
export interface OpsDataRetentionSettings {
cleanup_enabled: boolean
cleanup_schedule: string
error_log_retention_days: number
minute_metrics_retention_days: number
hourly_metrics_retention_days: number
}
// Toggle for the metrics pre-aggregation job.
export interface OpsAggregationSettings {
aggregation_enabled: boolean
}
// One row of the ops error log listing (/admin/ops/errors).
// Nullable/optional fields are absent when the dimension doesn't apply.
export interface OpsErrorLog {
id: number
created_at: string
// Processing phase and error classification (server-defined open enums).
phase: OpsPhase
type: string
severity: OpsSeverity
status_code: number
platform: string
model: string
latency_ms?: number | null
// Client-supplied request id vs. the gateway's own request id.
client_request_id: string
request_id: string
message: string
// Associated entities, when known.
user_id?: number | null
api_key_id?: number | null
account_id?: number | null
group_id?: number | null
client_ip?: string | null
request_path?: string
// True when the failed request was a streaming request.
stream?: boolean
}
// Full error detail (/admin/ops/errors/:id): the listing row plus bodies,
// upstream context and a latency breakdown.
export interface OpsErrorDetail extends OpsErrorLog {
error_body: string
user_agent: string
// Upstream context (optional; enriched by gateway services)
upstream_status_code?: number | null
upstream_error_message?: string
upstream_error_detail?: string
// JSON-encoded array of upstream error events (one entry per upstream attempt).
upstream_errors?: string
// Latency breakdown in ms; field names suggest per-phase timings
// (auth/routing/upstream/response/TTFT) — null when not measured.
auth_latency_ms?: number | null
routing_latency_ms?: number | null
upstream_latency_ms?: number | null
response_latency_ms?: number | null
time_to_first_token_ms?: number | null
request_body: string
// True when request_body was truncated before storage; original size below.
request_body_truncated: boolean
request_body_bytes?: number | null
is_business_limited: boolean
}
// Paginated wrapper over error-log rows.
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
/** GET /admin/ops/dashboard/overview — aggregated dashboard overview for the given window/filters. */
export async function getDashboardOverview(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
  const response = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/throughput-trend — throughput time series for the given window/filters. */
export async function getThroughputTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
  const response = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/latency-histogram — latency distribution for the given window/filters. */
export async function getLatencyHistogram(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
  const response = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/error-trend — error-count time series for the given window/filters. */
export async function getErrorTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
  const response = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/error-distribution — error breakdown for the given window/filters. */
export async function getErrorDistribution(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
  const response = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/errors — paginated error-log listing with optional filters. */
export async function listErrorLogs(params: {
  page?: number
  page_size?: number
  time_range?: string
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
  q?: string
  status_codes?: string
}): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return response.data
}
/** GET /admin/ops/errors/:id — full detail for one error-log entry. */
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
  const response = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
  return response.data
}
/** POST /admin/ops/errors/:id/retry — replay the failed request with the given options. */
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
  const response = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
  return response.data
}
/** GET /admin/ops/requests — paginated per-request detail listing. */
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return response.data
}
// Alert rules
/** GET /admin/ops/alert-rules — list all alert rules. */
export async function listAlertRules(): Promise<AlertRule[]> {
  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
  return response.data
}
/** POST /admin/ops/alert-rules — create a rule; returns the server-assigned copy. */
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
  return response.data
}
/** PUT /admin/ops/alert-rules/:id — partial update; returns the updated rule. */
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
  const response = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
  return response.data
}
/** DELETE /admin/ops/alert-rules/:id — remove an alert rule. Resolves with no payload. */
export async function deleteAlertRule(id: number): Promise<void> {
await apiClient.delete(`/admin/ops/alert-rules/${id}`)
}
/** GET /admin/ops/alert-events — most recent alert events (default limit: 100). */
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } })
  return response.data
}
// Email notification config
/** GET /admin/ops/email-notification/config — current email notification settings. */
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
  return response.data
}
/** PUT /admin/ops/email-notification/config — replace email notification settings. */
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
  const response = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
  return response.data
}
// Runtime settings (DB-backed)
/** GET /admin/ops/runtime/alert — alert-evaluator runtime settings. */
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
  return response.data
}
/** PUT /admin/ops/runtime/alert — replace alert-evaluator runtime settings. */
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
  return response.data
}
// Advanced settings (DB-backed)
/** GET /admin/ops/advanced-settings — data retention + aggregation settings. */
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
  const response = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
  return response.data
}
/** PUT /admin/ops/advanced-settings — replace data retention + aggregation settings. */
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
  const response = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
  return response.data
}
// Aggregate export so consumers can import the whole ops API surface as one
// object (named `opsAPI` or as the module default) instead of per-function.
export const opsAPI = {
getDashboardOverview,
getThroughputTrend,
getLatencyHistogram,
getErrorTrend,
getErrorDistribution,
getConcurrencyStats,
getAccountAvailabilityStats,
subscribeQPS,
listErrorLogs,
getErrorLogDetail,
retryErrorRequest,
listRequestDetails,
listAlertRules,
createAlertRule,
updateAlertRule,
deleteAlertRule,
listAlertEvents,
getEmailNotificationConfig,
updateEmailNotificationConfig,
getAlertRuntimeSettings,
updateAlertRuntimeSettings,
getAdvancedSettings,
updateAdvancedSettings
}
export default opsAPI
...@@ -35,14 +35,29 @@ export interface SystemSettings { ...@@ -35,14 +35,29 @@ export interface SystemSettings {
turnstile_enabled: boolean turnstile_enabled: boolean
turnstile_site_key: string turnstile_site_key: string
turnstile_secret_key_configured: boolean turnstile_secret_key_configured: boolean
// LinuxDo Connect OAuth 登录(终端用户 SSO)
// LinuxDo Connect OAuth settings
linuxdo_connect_enabled: boolean linuxdo_connect_enabled: boolean
linuxdo_connect_client_id: string linuxdo_connect_client_id: string
linuxdo_connect_client_secret_configured: boolean linuxdo_connect_client_secret_configured: boolean
linuxdo_connect_redirect_url: string linuxdo_connect_redirect_url: string
// Model fallback configuration
enable_model_fallback: boolean
fallback_model_anthropic: string
fallback_model_openai: string
fallback_model_gemini: string
fallback_model_antigravity: string
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
enable_identity_patch: boolean enable_identity_patch: boolean
identity_patch_prompt: string identity_patch_prompt: string
// Ops Monitoring (vNext)
ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
...@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest { ...@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest {
linuxdo_connect_client_id?: string linuxdo_connect_client_id?: string
linuxdo_connect_client_secret?: string linuxdo_connect_client_secret?: string
linuxdo_connect_redirect_url?: string linuxdo_connect_redirect_url?: string
enable_model_fallback?: boolean
fallback_model_anthropic?: string
fallback_model_openai?: string
fallback_model_gemini?: string
fallback_model_antigravity?: string
enable_identity_patch?: boolean enable_identity_patch?: boolean
identity_patch_prompt?: string identity_patch_prompt?: string
ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**
......
...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use( ...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
return response return response
}, },
(error: AxiosError<ApiResponse<unknown>>) => { (error: AxiosError<ApiResponse<unknown>>) => {
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
// Otherwise we'd misclassify it as a generic "network error".
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
return Promise.reject(error)
}
// Handle common errors // Handle common errors
if (error.response) { if (error.response) {
const { status, data } = error.response const { status, data } = error.response
const url = String(error.config?.url || '')
// Validate `data` shape to avoid HTML error pages breaking our error handling.
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
// from ops pages to avoid broken UI states.
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
try {
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
} catch {
// ignore localStorage failures
}
try {
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
} catch {
// ignore event failures
}
if (window.location.pathname.startsWith('/admin/ops')) {
window.location.href = '/admin/settings'
}
return Promise.reject({
status,
code: 'OPS_DISABLED',
message: apiData.message || error.message,
url
})
}
// 401: Unauthorized - clear token and redirect to login // 401: Unauthorized - clear token and redirect to login
if (status === 401) { if (status === 401) {
...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use( ...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
// Return structured error // Return structured error
return Promise.reject({ return Promise.reject({
status, status,
code: data?.code, code: apiData.code,
message: data?.message || error.message message: apiData.message || apiData.detail || error.message
}) })
} }
......
<script setup lang="ts">
// Hover "info" popover: shows `content` (or the default slot) in a tooltip
// above a trigger icon while the pointer is over the wrapper element.
import { ref } from 'vue'
defineProps<{
// Plain-text tooltip body; ignored when the default slot is provided.
content?: string
}>()
// Tooltip visibility, toggled by mouseenter/mouseleave on the wrapper below.
const show = ref(false)
</script>
<template>
<div
class="group relative ml-1 inline-flex items-center align-middle"
@mouseenter="show = true"
@mouseleave="show = false"
>
<!-- Trigger Icon (overridable via the `trigger` slot; defaults to a "?" circle) -->
<slot name="trigger">
<svg
class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
/>
</svg>
</slot>
<!-- Popover Content -->
<!-- NOTE(review): visibility combines v-show (the `show` flag) with
     opacity-0 + group-hover:opacity-100 (a CSS fade). Both mechanisms are
     present; confirm the redundancy is intentional (v-show also removes the
     element from hit-testing while hidden). -->
<div
v-show="show"
class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
>
<slot>{{ content }}</slot>
<!-- Arrow pointing down at the trigger icon -->
<div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
</div>
</div>
</template>
...@@ -67,12 +67,13 @@ ...@@ -67,12 +67,13 @@
:aria-selected="isSelected(option)" :aria-selected="isSelected(option)"
:aria-disabled="isOptionDisabled(option)" :aria-disabled="isOptionDisabled(option)"
@click.stop="!isOptionDisabled(option) && selectOption(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)"
@mouseenter="focusedIndex = index" @mouseenter="handleOptionMouseEnter(option, index)"
:class="[ :class="[
'select-option', 'select-option',
isGroupHeaderOption(option) && 'select-option-group',
isSelected(option) && 'select-option-selected', isSelected(option) && 'select-option-selected',
isOptionDisabled(option) && 'select-option-disabled', isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
focusedIndex === index && 'select-option-focused' focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
]" ]"
> >
<slot name="option" :option="option" :selected="isSelected(option)"> <slot name="option" :option="option" :selected="isSelected(option)">
...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { ...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
return false return false
} }
const isGroupHeaderOption = (option: any): boolean => {
if (typeof option === 'object' && option !== null) {
return option.kind === 'group'
}
return false
}
const selectedOption = computed(() => { const selectedOption = computed(() => {
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
}) })
...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { ...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
return getOptionValue(option) === props.modelValue return getOptionValue(option) === props.modelValue
} }
const findNextEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex + offset) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const findPrevEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex - offset + opts.length) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const handleOptionMouseEnter = (option: any, index: number) => {
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
focusedIndex.value = index
}
// Update trigger rect periodically while open to follow scroll/resize // Update trigger rect periodically while open to follow scroll/resize
const updateTriggerRect = () => { const updateTriggerRect = () => {
if (containerRef.value) { if (containerRef.value) {
...@@ -259,8 +292,15 @@ watch(isOpen, (open) => { ...@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
if (open) { if (open) {
calculateDropdownPosition() calculateDropdownPosition()
// Reset focused index to current selection or first item // Reset focused index to current selection or first item
const selectedIdx = filteredOptions.value.findIndex(isSelected) if (filteredOptions.value.length === 0) {
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 focusedIndex.value = -1
} else {
const selectedIdx = filteredOptions.value.findIndex(isSelected)
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
? findNextEnabledIndex(initialIdx + 1)
: initialIdx
}
if (props.searchable) { if (props.searchable) {
nextTick(() => searchInputRef.value?.focus()) nextTick(() => searchInputRef.value?.focus())
...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { ...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
switch (e.key) { switch (e.key) {
case 'ArrowDown': case 'ArrowDown':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'ArrowUp': case 'ArrowUp':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'Enter': case 'Enter':
e.preventDefault() e.preventDefault()
...@@ -441,6 +481,17 @@ onUnmounted(() => { ...@@ -441,6 +481,17 @@ onUnmounted(() => {
@apply cursor-not-allowed opacity-40; @apply cursor-not-allowed opacity-40;
} }
.select-dropdown-portal .select-option-group {
@apply cursor-default select-none;
@apply bg-gray-50 dark:bg-dark-900;
@apply text-[11px] font-bold uppercase tracking-wider;
@apply text-gray-500 dark:text-gray-400;
}
.select-dropdown-portal .select-option-group:hover {
@apply bg-gray-50 dark:bg-dark-900;
}
.select-dropdown-portal .select-option-label { .select-dropdown-portal .select-option-label {
@apply flex-1 min-w-0 truncate text-left; @apply flex-1 min-w-0 truncate text-left;
} }
......
...@@ -28,8 +28,8 @@ ...@@ -28,8 +28,8 @@
{{ platformDescription }} {{ platformDescription }}
</p> </p>
<!-- Client Tabs (only for Antigravity platform) --> <!-- Client Tabs -->
<div v-if="platform === 'antigravity'" class="border-b border-gray-200 dark:border-dark-700"> <div v-if="clientTabs.length" class="border-b border-gray-200 dark:border-dark-700">
<nav class="-mb-px flex space-x-6" aria-label="Client"> <nav class="-mb-px flex space-x-6" aria-label="Client">
<button <button
v-for="tab in clientTabs" v-for="tab in clientTabs"
...@@ -51,7 +51,7 @@ ...@@ -51,7 +51,7 @@
</div> </div>
<!-- OS/Shell Tabs --> <!-- OS/Shell Tabs -->
<div class="border-b border-gray-200 dark:border-dark-700"> <div v-if="showShellTabs" class="border-b border-gray-200 dark:border-dark-700">
<nav class="-mb-px flex space-x-4" aria-label="Tabs"> <nav class="-mb-px flex space-x-4" aria-label="Tabs">
<button <button
v-for="tab in currentTabs" v-for="tab in currentTabs"
...@@ -111,7 +111,7 @@ ...@@ -111,7 +111,7 @@
</div> </div>
<!-- Usage Note --> <!-- Usage Note -->
<div class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800"> <div v-if="showPlatformNote" class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" /> <Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
<p class="text-sm text-blue-700 dark:text-blue-300"> <p class="text-sm text-blue-700 dark:text-blue-300">
{{ platformNote }} {{ platformNote }}
...@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard() ...@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard()
const copiedIndex = ref<number | null>(null) const copiedIndex = ref<number | null>(null)
const activeTab = ref<string>('unix') const activeTab = ref<string>('unix')
const activeClientTab = ref<string>('claude') // Level 1 tab for antigravity platform const activeClientTab = ref<string>('claude')
// Reset tabs when platform changes // Reset tabs when platform changes
watch(() => props.platform, (newPlatform) => { const defaultClientTab = computed(() => {
activeTab.value = 'unix' switch (props.platform) {
if (newPlatform === 'antigravity') { case 'openai':
activeClientTab.value = 'claude' return 'codex'
case 'gemini':
return 'gemini'
case 'antigravity':
return 'claude'
default:
return 'claude'
} }
}) })
// Reset shell tab when client changes (for antigravity) watch(() => props.platform, () => {
activeTab.value = 'unix'
activeClientTab.value = defaultClientTab.value
}, { immediate: true })
// Reset shell tab when client changes
watch(activeClientTab, () => { watch(activeClientTab, () => {
activeTab.value = 'unix' activeTab.value = 'unix'
}) })
...@@ -251,11 +262,32 @@ const SparkleIcon = { ...@@ -251,11 +262,32 @@ const SparkleIcon = {
} }
} }
// Client tabs for Antigravity platform (Level 1) const clientTabs = computed((): TabConfig[] => {
const clientTabs = computed((): TabConfig[] => [ if (!props.platform) return []
{ id: 'claude', label: t('keys.useKeyModal.antigravity.claudeCode'), icon: TerminalIcon }, switch (props.platform) {
{ id: 'gemini', label: t('keys.useKeyModal.antigravity.geminiCli'), icon: SparkleIcon } case 'openai':
]) return [
{ id: 'codex', label: t('keys.useKeyModal.cliTabs.codexCli'), icon: TerminalIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
case 'gemini':
return [
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
case 'antigravity':
return [
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
default:
return [
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
}
})
// Shell tabs (3 types for environment variable based configs) // Shell tabs (3 types for environment variable based configs)
const shellTabs: TabConfig[] = [ const shellTabs: TabConfig[] = [
...@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [ ...@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [
{ id: 'windows', label: 'Windows', icon: WindowsIcon } { id: 'windows', label: 'Windows', icon: WindowsIcon }
] ]
const showShellTabs = computed(() => activeClientTab.value !== 'opencode')
const currentTabs = computed(() => { const currentTabs = computed(() => {
if (!showShellTabs.value) return []
if (props.platform === 'openai') { if (props.platform === 'openai') {
return openaiTabs // 2 tabs: unix, windows return openaiTabs
} }
// All other platforms (anthropic, gemini, antigravity) use shell tabs
return shellTabs return shellTabs
}) })
...@@ -308,6 +342,8 @@ const platformNote = computed(() => { ...@@ -308,6 +342,8 @@ const platformNote = computed(() => {
} }
}) })
const showPlatformNote = computed(() => activeClientTab.value !== 'opencode')
const escapeHtml = (value: string) => value const escapeHtml = (value: string) => value
.replace(/&/g, '&amp;') .replace(/&/g, '&amp;')
.replace(/</g, '&lt;') .replace(/</g, '&lt;')
...@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value) ...@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value)
const currentFiles = computed((): FileConfig[] => { const currentFiles = computed((): FileConfig[] => {
const baseUrl = props.baseUrl || window.location.origin const baseUrl = props.baseUrl || window.location.origin
const apiKey = props.apiKey const apiKey = props.apiKey
const baseRoot = baseUrl.replace(/\/v1\/?$/, '').replace(/\/+$/, '')
const ensureV1 = (value: string) => {
const trimmed = value.replace(/\/+$/, '')
return trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`
}
const apiBase = ensureV1(baseRoot)
const antigravityBase = ensureV1(`${baseRoot}/antigravity`)
const antigravityGeminiBase = (() => {
const trimmed = `${baseRoot}/antigravity`.replace(/\/+$/, '')
return trimmed.endsWith('/v1beta') ? trimmed : `${trimmed}/v1beta`
})()
if (activeClientTab.value === 'opencode') {
switch (props.platform) {
case 'anthropic':
return [generateOpenCodeConfig('anthropic', apiBase, apiKey)]
case 'openai':
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
case 'gemini':
return [generateOpenCodeConfig('gemini', apiBase, apiKey)]
case 'antigravity':
return [
generateOpenCodeConfig('antigravity-claude', antigravityBase, apiKey, 'opencode.json (Claude)'),
generateOpenCodeConfig('antigravity-gemini', antigravityGeminiBase, apiKey, 'opencode.json (Gemini)')
]
default:
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
}
}
switch (props.platform) { switch (props.platform) {
case 'openai': case 'openai':
...@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => { ...@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => {
case 'gemini': case 'gemini':
return [generateGeminiCliContent(baseUrl, apiKey)] return [generateGeminiCliContent(baseUrl, apiKey)]
case 'antigravity': case 'antigravity':
// Both Claude Code and Gemini CLI need /antigravity suffix for antigravity platform if (activeClientTab.value === 'gemini') {
if (activeClientTab.value === 'claude') { return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
} }
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)] return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
default: // anthropic default:
return generateAnthropicFiles(baseUrl, apiKey) return generateAnthropicFiles(baseUrl, apiKey)
} }
}) })
...@@ -456,6 +520,76 @@ requires_openai_auth = true` ...@@ -456,6 +520,76 @@ requires_openai_auth = true`
] ]
} }
function generateOpenCodeConfig(platform: string, baseUrl: string, apiKey: string, pathLabel?: string): FileConfig {
  // Model catalogs per upstream platform (display names shown in OpenCode's model picker).
  const openaiModels = {
    'gpt-5.2-codex': {
      name: 'GPT-5.2 Codex',
      variants: {
        low: {},
        medium: {},
        high: {},
        xhigh: {}
      }
    }
  }
  const geminiModels = {
    'gemini-3-pro-high': { name: 'Gemini 3 Pro High' },
    'gemini-3-pro-low': { name: 'Gemini 3 Pro Low' },
    'gemini-3-pro-preview': { name: 'Gemini 3 Pro Preview' },
    'gemini-3-pro-image': { name: 'Gemini 3 Pro Image' },
    'gemini-3-flash': { name: 'Gemini 3 Flash' },
    'gemini-2.5-flash-thinking': { name: 'Gemini 2.5 Flash Thinking' },
    'gemini-2.5-flash': { name: 'Gemini 2.5 Flash' },
    'gemini-2.5-flash-lite': { name: 'Gemini 2.5 Flash Lite' }
  }
  const claudeModels = {
    'claude-opus-4-5-thinking': { name: 'Claude Opus 4.5 Thinking' },
    'claude-sonnet-4-5-thinking': { name: 'Claude Sonnet 4.5 Thinking' },
    'claude-sonnet-4-5': { name: 'Claude Sonnet 4.5' }
  }
  // Base provider entry. OpenAI additionally gets `store: false` in its options.
  // Property insertion order (options → npm → name → models) is preserved so the
  // serialized JSON matches the previous output exactly.
  const entry: Record<string, any> = {
    options: {
      baseURL: baseUrl,
      apiKey,
      ...(platform === 'openai' ? { store: false } : {})
    }
  }
  // Attach the platform-specific AI SDK package, display name, and model list.
  switch (platform) {
    case 'gemini':
      entry.npm = '@ai-sdk/google'
      entry.models = geminiModels
      break
    case 'anthropic':
      entry.npm = '@ai-sdk/anthropic'
      break
    case 'antigravity-claude':
      entry.npm = '@ai-sdk/anthropic'
      entry.name = 'Antigravity (Claude)'
      entry.models = claudeModels
      break
    case 'antigravity-gemini':
      entry.npm = '@ai-sdk/google'
      entry.name = 'Antigravity (Gemini)'
      entry.models = geminiModels
      break
    case 'openai':
      entry.models = openaiModels
      break
  }
  const content = JSON.stringify(
    {
      provider: { [platform]: entry },
      $schema: 'https://opencode.ai/config.json'
    },
    null,
    2
  )
  return {
    path: pathLabel ?? 'opencode.json',
    content,
    hint: t('keys.useKeyModal.opencode.hint')
  }
}
const copyContent = async (content: string, index: number) => { const copyContent = async (content: string, index: number) => {
const success = await clipboardCopy(content, t('keys.copied')) const success = await clipboardCopy(content, t('keys.copied'))
if (success) { if (success) {
......
...@@ -144,10 +144,10 @@ ...@@ -144,10 +144,10 @@
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { computed, h, ref } from 'vue' import { computed, h, onMounted, ref, watch } from 'vue'
import { useRoute } from 'vue-router' import { useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n' import { useI18n } from 'vue-i18n'
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores' import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
import VersionBadge from '@/components/common/VersionBadge.vue' import VersionBadge from '@/components/common/VersionBadge.vue'
const { t } = useI18n() const { t } = useI18n()
...@@ -156,6 +156,7 @@ const route = useRoute() ...@@ -156,6 +156,7 @@ const route = useRoute()
const appStore = useAppStore() const appStore = useAppStore()
const authStore = useAuthStore() const authStore = useAuthStore()
const onboardingStore = useOnboardingStore() const onboardingStore = useOnboardingStore()
const adminSettingsStore = useAdminSettingsStore()
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed) const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
const mobileOpen = computed(() => appStore.mobileOpen) const mobileOpen = computed(() => appStore.mobileOpen)
...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => { ...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
const adminNavItems = computed(() => { const adminNavItems = computed(() => {
const baseItems = [ const baseItems = [
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon }, { path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
...(adminSettingsStore.opsMonitoringEnabled
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
: []),
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true }, { path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true }, { path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true }, { path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
...@@ -511,6 +515,23 @@ if ( ...@@ -511,6 +515,23 @@ if (
isDark.value = true isDark.value = true
document.documentElement.classList.add('dark') document.documentElement.classList.add('dark')
} }
// Fetch admin settings (for feature-gated nav items like Ops).
// The immediate watcher runs once during setup and again whenever isAdmin
// flips to true, so it already covers the mounted case; the previous
// additional onMounted hook issued a duplicate fetch() on every mount.
watch(
  isAdmin,
  (v) => {
    if (v) {
      adminSettingsStore.fetch()
    }
  },
  { immediate: true }
)
</script> </script>
<style scoped> <style scoped>
......
...@@ -131,6 +131,7 @@ export default { ...@@ -131,6 +131,7 @@ export default {
noData: 'No data', noData: 'No data',
success: 'Success', success: 'Success',
error: 'Error', error: 'Error',
critical: 'Critical',
warning: 'Warning', warning: 'Warning',
info: 'Info', info: 'Info',
active: 'Active', active: 'Active',
...@@ -145,9 +146,11 @@ export default { ...@@ -145,9 +146,11 @@ export default {
copiedToClipboard: 'Copied to clipboard', copiedToClipboard: 'Copied to clipboard',
copyFailed: 'Failed to copy', copyFailed: 'Failed to copy',
contactSupport: 'Contact Support', contactSupport: 'Contact Support',
add: 'Add',
invalidEmail: 'Please enter a valid email address',
optional: 'optional', optional: 'optional',
selectOption: 'Select an option', selectOption: 'Select an option',
searchPlaceholder: 'Search...', searchPlaceholder: 'Search...',
noOptionsFound: 'No options found', noOptionsFound: 'No options found',
noGroupsAvailable: 'No groups available', noGroupsAvailable: 'No groups available',
unknownError: 'Unknown error occurred', unknownError: 'Unknown error occurred',
...@@ -178,6 +181,7 @@ export default { ...@@ -178,6 +181,7 @@ export default {
accounts: 'Accounts', accounts: 'Accounts',
proxies: 'Proxies', proxies: 'Proxies',
redeemCodes: 'Redeem Codes', redeemCodes: 'Redeem Codes',
ops: 'Ops',
promoCodes: 'Promo Codes', promoCodes: 'Promo Codes',
settings: 'Settings', settings: 'Settings',
myAccount: 'My Account', myAccount: 'My Account',
...@@ -364,6 +368,12 @@ export default { ...@@ -364,6 +368,12 @@ export default {
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.', note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.', noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
}, },
cliTabs: {
claudeCode: 'Claude Code',
geminiCli: 'Gemini CLI',
codexCli: 'Codex CLI',
opencode: 'OpenCode',
},
antigravity: { antigravity: {
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.', description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
claudeCode: 'Claude Code', claudeCode: 'Claude Code',
...@@ -376,6 +386,11 @@ export default { ...@@ -376,6 +386,11 @@ export default {
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview', modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.', note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
}, },
opencode: {
title: 'OpenCode Example',
subtitle: 'opencode.json',
hint: 'This is a group configuration example. Adjust model and options as needed.',
},
}, },
customKeyLabel: 'Custom Key', customKeyLabel: 'Custom Key',
customKeyPlaceholder: 'Enter your custom key (min 16 chars)', customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
...@@ -1826,6 +1841,524 @@ export default { ...@@ -1826,6 +1841,524 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: 'Ops Monitoring',
description: 'Operational monitoring and troubleshooting',
// Dashboard
systemHealth: 'System Health',
overview: 'Overview',
noSystemMetrics: 'No system metrics collected yet.',
collectedAt: 'Collected at:',
window: 'window',
cpu: 'CPU',
memory: 'Memory',
db: 'DB',
redis: 'Redis',
goroutines: 'Goroutines',
jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active',
idle: 'idle',
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
lastError: 'last_error:',
noData: 'No data.',
loadingText: 'loading',
ready: 'ready',
requestsTotal: 'Requests (total)',
slaScope: 'SLA scope:',
tokens: 'Tokens',
tps: 'TPS:',
current: 'current',
peak: 'peak',
average: 'average',
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
errorCount: 'Error Count',
upstreamErrors: 'Upstream Errors',
errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
p95: 'p95:',
p99: 'p99:',
avg: 'avg:',
max: 'max:',
qps: 'QPS',
requests: 'Requests',
upstream: 'Upstream',
client: 'Client',
system: 'System',
other: 'Other',
errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
timeRange: {
'5m': 'Last 5 minutes',
'30m': 'Last 30 minutes',
'1h': 'Last 1 hour',
'6h': 'Last 6 hours',
'24h': 'Last 24 hours'
},
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
// Resource diagnostics
dbDown: 'Database connection failed',
dbDownImpact: 'All database operations will fail',
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
redisDown: 'Redis connection failed',
redisDownImpact: 'Cache functionality degraded, performance may decline',
redisDownAction: 'Check Redis service status and network connectivity',
cpuCritical: 'CPU usage critically high ({usage}%)',
cpuCriticalImpact: 'System response slowing, may affect all requests',
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
cpuHigh: 'CPU usage elevated ({usage}%)',
cpuHighImpact: 'System load is high, needs attention',
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
memoryCritical: 'Memory usage critically high ({usage}%)',
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
// SLA diagnostics
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
// Health score diagnostics
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthLowAction: 'Monitor metric trends, prevent issue escalation',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log
errorLog: {
timeId: 'Time / ID',
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
acc: 'ACC:',
details: 'Details',
phase: 'Phase'
},
// Error Details Modal
errorDetails: {
upstreamErrors: 'Upstream Errors',
requestErrors: 'Request Errors',
total: 'Total:',
searchPlaceholder: 'Search request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: 'Loading…',
requestId: 'Request ID',
time: 'Time',
phase: 'Phase',
status: 'Status',
message: 'Message',
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
timings: 'Timings',
auth: 'Auth',
routing: 'Routing',
upstream: 'Upstream',
response: 'Response',
retry: 'Retry',
retryClient: 'Retry (Client)',
retryUpstream: 'Retry (Upstream pinned)',
pinnedAccountId: 'Pinned account_id',
retryNotes: 'Retry Notes',
requestBody: 'Request Body',
errorBody: 'Error Body',
trimmed: 'trimmed',
confirmRetry: 'Confirm Retry',
retrySuccess: 'Retry succeeded',
retryFailed: 'Retry failed',
na: 'N/A',
retryHint: 'Retry will resend the request with the same parameters',
retryClientHint: 'Use client retry (no account pinning)',
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
pinnedAccountIdHint: '(auto from error log)',
retryNote1: 'Retry will use the same request body and parameters',
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
retryNote3: 'Client retry will reselect an account',
confirmRetryMessage: 'Confirm retry this request?',
confirmRetryHint: 'Will resend with the same request parameters'
},
requestDetails: {
title: 'Request Details',
details: 'Details',
rangeLabel: 'Window: {range}',
rangeMinutes: '{n} minutes',
rangeHours: '{n} hours',
empty: 'No requests in this window.',
emptyHint: 'Try a different time range or remove filters.',
failedToLoad: 'Failed to load request details',
requestIdCopied: 'Request ID copied',
copyFailed: 'Copy failed',
copy: 'Copy',
viewError: 'View Error',
kind: {
success: 'SUCCESS',
error: 'ERROR'
},
table: {
time: 'Time',
kind: 'Kind',
platform: 'Platform',
model: 'Model',
duration: 'Duration',
status: 'Status',
requestId: 'Request ID',
actions: 'Actions'
}
},
alertEvents: {
title: 'Alert Events',
description: 'Recent alert firing/resolution records (email-only)',
loading: 'Loading...',
empty: 'No alert events',
loadFailed: 'Failed to load alert events',
table: {
time: 'Time',
status: 'Status',
severity: 'Severity',
title: 'Title',
metric: 'Metric / Threshold',
email: 'Email Sent'
}
},
alertRules: {
title: 'Alert Rules',
description: 'Create and manage threshold-based system alerts (email-only)',
loading: 'Loading...',
empty: 'No alert rules',
loadFailed: 'Failed to load alert rules',
saveFailed: 'Failed to save alert rule',
deleteFailed: 'Failed to delete alert rule',
create: 'Create Rule',
createTitle: 'Create Alert Rule',
editTitle: 'Edit Alert Rule',
deleteConfirmTitle: 'Delete this rule?',
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
metricGroups: {
system: 'System Metrics',
group: 'Group-level Metrics (requires group_id)',
account: 'Account-level Metrics'
},
metrics: {
successRate: 'Success Rate (%)',
errorRate: 'Error Rate (%)',
upstreamErrorRate: 'Upstream Error Rate (%)',
p95: 'P95 Latency (ms)',
p99: 'P99 Latency (ms)',
cpu: 'CPU Usage (%)',
memory: 'Memory Usage (%)',
queueDepth: 'Concurrency Queue Depth',
groupAvailableAccounts: 'Group Available Accounts',
groupAvailableRatio: 'Group Available Ratio (%)',
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
accountRateLimitedCount: 'Rate-limited Accounts',
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
accountErrorRatio: 'Error Account Ratio (%)',
overloadAccountCount: 'Overloaded Accounts'
},
metricDescriptions: {
successRate: 'Percentage of successful requests in the window (0-100).',
errorRate: 'Percentage of failed requests in the window (0-100).',
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
p95: 'P95 request latency within the window (ms).',
p99: 'P99 request latency within the window (ms).',
cpu: 'Current instance CPU usage (0-100).',
memory: 'Current instance memory usage (0-100).',
queueDepth: 'Concurrency queue depth within the window (queued requests).',
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
accountErrorRatio: 'Error account ratio within the window (0-100).',
overloadAccountCount: 'Number of overloaded accounts within the window.'
},
hints: {
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
},
table: {
name: 'Name',
metric: 'Metric',
severity: 'Severity',
enabled: 'Enabled',
actions: 'Actions'
},
form: {
name: 'Name',
description: 'Description',
metric: 'Metric',
operator: 'Operator',
groupId: 'Group (group_id)',
groupPlaceholder: 'Select a group',
allGroups: 'All groups',
threshold: 'Threshold',
severity: 'Severity',
window: 'Window (minutes)',
sustained: 'Sustained (samples)',
cooldown: 'Cooldown (minutes)',
enabled: 'Enabled',
notifyEmail: 'Send email notifications'
},
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid rule',
nameRequired: 'Name is required',
metricRequired: 'Metric is required',
groupIdRequired: 'group_id is required for group-level metrics',
operatorRequired: 'Operator is required',
thresholdRequired: 'Threshold must be a number',
windowRange: 'Window must be one of: 1, 5, 60 minutes',
sustainedRange: 'Sustained must be between 1 and 1440 samples',
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
}
},
runtime: {
title: 'Ops Runtime Settings',
description: 'Stored in database; changes take effect without editing config files.',
loading: 'Loading...',
noData: 'No runtime settings available',
loadFailed: 'Failed to load runtime settings',
saveSuccess: 'Runtime settings saved',
saveFailed: 'Failed to save runtime settings',
alertTitle: 'Alert Evaluator',
groupAvailabilityTitle: 'Group Availability Monitor',
evalIntervalSeconds: 'Evaluation Interval (seconds)',
silencing: {
title: 'Alert Silencing (Maintenance Mode)',
enabled: 'Enable silencing',
globalUntil: 'Silence until (RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
reason: 'Reason',
reasonPlaceholder: 'e.g., planned maintenance',
entries: {
title: 'Advanced: targeted silencing',
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
add: 'Add Entry',
empty: 'No targeted entries',
entryTitle: 'Entry #{n}',
ruleId: 'Rule ID (optional)',
ruleIdPlaceholder: 'e.g., 1',
severities: 'Severities (optional)',
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
until: 'Until (RFC3339)',
reason: 'Reason',
validation: {
untilRequired: 'Entry until time is required',
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
ruleIdPositive: 'Entry rule_id must be a positive integer',
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
}
},
validation: {
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
}
},
lockEnabled: 'Distributed Lock Enabled',
lockKey: 'Distributed Lock Key',
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid settings',
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
}
},
email: {
title: 'Email Notification',
description: 'Configure alert/report email notifications (stored in database).',
loading: 'Loading...',
noData: 'No email notification config',
loadFailed: 'Failed to load email notification config',
saveSuccess: 'Email notification config saved',
saveFailed: 'Failed to save email notification config',
alertTitle: 'Alert Emails',
reportTitle: 'Report Emails',
recipients: 'Recipients',
      recipientsHint: 'If empty, the system may fall back to the first admin email.',
minSeverity: 'Min Severity',
minSeverityAll: 'All severities',
rateLimitPerHour: 'Rate limit per hour',
batchWindowSeconds: 'Batch window (seconds)',
includeResolved: 'Include resolved alerts',
dailySummary: 'Daily summary',
weeklySummary: 'Weekly summary',
errorDigest: 'Error digest',
errorDigestMinCount: 'Min errors for digest',
accountHealth: 'Account health',
accountHealthThreshold: 'Error rate threshold (%)',
cronPlaceholder: 'Cron expression',
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid email notification config',
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
invalidRecipients: 'One or more recipient emails are invalid',
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
cronRequired: 'A cron expression is required when schedule is enabled',
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
}
},
concurrency: {
title: 'Concurrency / Queue',
byPlatform: 'By Platform',
byGroup: 'By Group',
byAccount: 'By Account',
totalRows: '{count} rows',
disabledHint: 'Realtime monitoring is disabled in settings.',
empty: 'No data',
queued: 'Queue {count}',
rateLimited: 'Rate-limited {count}',
errorAccounts: 'Errors {count}',
loadFailed: 'Failed to load concurrency data'
},
realtime: {
title: 'Realtime',
connected: 'Realtime connected',
connecting: 'Realtime connecting',
reconnecting: 'Realtime reconnecting',
offline: 'Realtime offline',
closed: 'Realtime closed',
reconnectIn: 'retry in {seconds}s'
},
queryMode: {
auto: 'Auto',
raw: 'Raw',
preagg: 'Preagg'
},
accountAvailability: {
available: 'Available',
unavailable: 'Unavailable',
accountError: 'Error'
},
tooltips: {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},
charts: {
emptyRequest: 'No requests in this window.',
emptyError: 'No errors in this window.',
resetZoom: 'Reset',
resetZoomHint: 'Reset zoom (if enabled)',
downloadChart: 'Download',
downloadChartHint: 'Download chart as image'
}
},
// Settings // Settings
settings: { settings: {
title: 'System Settings', title: 'System Settings',
...@@ -1940,6 +2473,22 @@ export default { ...@@ -1940,6 +2473,22 @@ export default {
sending: 'Sending...', sending: 'Sending...',
enterRecipientHint: 'Please enter a recipient email address' enterRecipientHint: 'Please enter a recipient email address'
}, },
opsMonitoring: {
title: 'Ops Monitoring',
description: 'Enable ops monitoring for troubleshooting and health visibility',
disabled: 'Ops monitoring is disabled',
enabled: 'Enable Ops Monitoring',
enabledHint: 'Enable the ops monitoring module (admin only)',
realtimeEnabled: 'Enable Realtime Monitoring',
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
queryMode: 'Default Query Mode',
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
},
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',
description: 'Global API key for external system integration with full admin access', description: 'Global API key for external system integration with full admin access',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment