Commit 7844dc4f authored by shaw's avatar shaw
Browse files

Merge PR #238: feat(ops): 实现完整的运维监控系统(vNext)

parents 2b2f7a6d c48795a9
...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc ...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
} }
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
if upstreamMsg != "" {
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
}
switch statusCode { switch statusCode {
case 401: case 401:
// 认证失败:停止调度,记录错误 // 认证失败:停止调度,记录错误
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") msg := "Authentication failed (401): invalid or expired credentials"
if upstreamMsg != "" {
msg = "Authentication failed (401): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 402: case 402:
// 支付要求:余额不足或计费问题,停止调度 // 支付要求:余额不足或计费问题,停止调度
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") msg := "Payment required (402): insufficient balance or billing issue"
if upstreamMsg != "" {
msg = "Payment required (402): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 403: case 403:
// 禁止访问:停止调度,记录错误 // 禁止访问:停止调度,记录错误
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") msg := "Access forbidden (403): account may be suspended or lack permissions"
if upstreamMsg != "" {
msg = "Access forbidden (403): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 429: case 429:
s.handle429(ctx, account, headers) s.handle429(ctx, account, headers)
......
...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
// Ops monitoring (vNext)
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
err := s.settingRepo.SetMultiple(ctx, updates) err := s.settingRepo.SetMultiple(ctx, updates)
if err == nil && s.onUpdate != nil { if err == nil && s.onUpdate != nil {
s.onUpdate() // Invalidate cache after settings update s.onUpdate() // Invalidate cache after settings update
...@@ -219,8 +227,8 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -219,8 +227,8 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
func (s *SettingService) IsRegistrationEnabled(ctx context.Context) bool { func (s *SettingService) IsRegistrationEnabled(ctx context.Context) bool {
value, err := s.settingRepo.GetValue(ctx, SettingKeyRegistrationEnabled) value, err := s.settingRepo.GetValue(ctx, SettingKeyRegistrationEnabled)
if err != nil { if err != nil {
// 安全默认:如果设置不存在或查询出错,默认关闭注册 // 默认开放注册
return false return true
} }
return value == "true" return value == "true"
} }
...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { ...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults // Identity patch defaults
SettingKeyEnableIdentityPatch: "true", SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "", SettingKeyIdentityPatchPrompt: "",
// Ops monitoring defaults (vNext)
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin ...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
} }
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
return result // Ops monitoring settings (default: enabled, fail-open)
} result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
// result.OpsMetricsIntervalSeconds = 60
// 优先级: if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 if v, err := strconv.Atoi(raw); err == nil {
// - 否则回退到 config.yaml/env 的值 if v < 60 {
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { v = 60
if s == nil || s.cfg == nil { }
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") if v > 3600 {
} v = 3600
}
effective := s.cfg.LinuxDo result.OpsMetricsIntervalSeconds = v
}
keys := []string{
SettingKeyLinuxDoConnectEnabled,
SettingKeyLinuxDoConnectClientID,
SettingKeyLinuxDoConnectClientSecret,
SettingKeyLinuxDoConnectRedirectURL,
}
settings, err := s.settingRepo.GetMultiple(ctx, keys)
if err != nil {
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
}
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
effective.Enabled = raw == "true"
}
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
effective.ClientID = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
effective.ClientSecret = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
effective.RedirectURL = strings.TrimSpace(v)
}
if !effective.Enabled {
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
}
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
if strings.TrimSpace(effective.ClientID) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
}
if strings.TrimSpace(effective.AuthorizeURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
}
if strings.TrimSpace(effective.TokenURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
}
if strings.TrimSpace(effective.UserInfoURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
}
if strings.TrimSpace(effective.RedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
}
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
} }
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { return result
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") }
}
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
}
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
}
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) func isFalseSettingValue(value string) bool {
switch method { switch strings.ToLower(strings.TrimSpace(value)) {
case "", "client_secret_post", "client_secret_basic": case "false", "0", "off", "disabled":
if strings.TrimSpace(effective.ClientSecret) == "" { return true
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
}
case "none":
if !effective.UsePKCE {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
}
default: default:
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") return false
} }
return effective, nil
} }
// getStringOrDefault 获取字符串值或默认值 // getStringOrDefault 获取字符串值或默认值
...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) ...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
} }
return value return value
} }
// GetLinuxDoConnectOAuthConfig returns the "effective" LinuxDo Connect OAuth
// configuration used for login.
//
// Precedence:
//   - if the corresponding system-setting key exists, it overrides the
//     config.yaml/env value
//   - otherwise the config.yaml/env value is used
//
// It returns OAUTH_DISABLED when the feature is off, and OAUTH_CONFIG_INVALID
// when any required field is missing or malformed, so callers never start an
// OAuth flow that is bound to fail.
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
	if s == nil || s.cfg == nil {
		return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
	}

	// Start from the static (file/env) configuration, then layer DB overrides on top.
	effective := s.cfg.LinuxDo

	settings, err := s.settingRepo.GetMultiple(ctx, []string{
		SettingKeyLinuxDoConnectEnabled,
		SettingKeyLinuxDoConnectClientID,
		SettingKeyLinuxDoConnectClientSecret,
		SettingKeyLinuxDoConnectRedirectURL,
	})
	if err != nil {
		return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
	}

	if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
		effective.Enabled = raw == "true"
	}
	// Non-empty DB values override file/env values (whitespace-only is ignored).
	overrideIfSet := func(dst *string, key string) {
		if v, ok := settings[key]; ok && strings.TrimSpace(v) != "" {
			*dst = strings.TrimSpace(v)
		}
	}
	overrideIfSet(&effective.ClientID, SettingKeyLinuxDoConnectClientID)
	overrideIfSet(&effective.ClientSecret, SettingKeyLinuxDoConnectClientSecret)
	overrideIfSet(&effective.RedirectURL, SettingKeyLinuxDoConnectRedirectURL)

	if !effective.Enabled {
		return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
	}

	invalid := func(msg string) (config.LinuxDoConnectConfig, error) {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", msg)
	}

	// Basic robustness checks so users are never redirected into an OAuth flow
	// that is guaranteed to fail or is unsafe. Check order is significant: the
	// first failing check determines the reported message.
	requiredFields := []struct {
		value string
		msg   string
	}{
		{effective.ClientID, "oauth client id not configured"},
		{effective.AuthorizeURL, "oauth authorize url not configured"},
		{effective.TokenURL, "oauth token url not configured"},
		{effective.UserInfoURL, "oauth userinfo url not configured"},
		{effective.RedirectURL, "oauth redirect url not configured"},
		{effective.FrontendRedirectURL, "oauth frontend redirect url not configured"},
	}
	for _, f := range requiredFields {
		if strings.TrimSpace(f.value) == "" {
			return invalid(f.msg)
		}
	}

	urlChecks := []struct {
		validate func(string) error
		value    string
		msg      string
	}{
		{config.ValidateAbsoluteHTTPURL, effective.AuthorizeURL, "oauth authorize url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.TokenURL, "oauth token url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.UserInfoURL, "oauth userinfo url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.RedirectURL, "oauth redirect url invalid"},
		{config.ValidateFrontendRedirectURL, effective.FrontendRedirectURL, "oauth frontend redirect url invalid"},
	}
	for _, c := range urlChecks {
		if err := c.validate(c.value); err != nil {
			return invalid(c.msg)
		}
	}

	// Token-endpoint auth method: secret-based methods require a client secret;
	// "none" is only acceptable when PKCE is enabled.
	switch strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) {
	case "", "client_secret_post", "client_secret_basic":
		if strings.TrimSpace(effective.ClientSecret) == "" {
			return invalid("oauth client secret not configured")
		}
	case "none":
		if !effective.UsePKCE {
			return invalid("oauth pkce must be enabled when token_auth_method=none")
		}
	default:
		return invalid("oauth token_auth_method invalid")
	}

	return effective, nil
}
...@@ -46,6 +46,12 @@ type SystemSettings struct { ...@@ -46,6 +46,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"` EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {
......
package service package service
import ( import (
"database/sql"
"time" "time"
"github.com/Wei-Shaw/sub2api/internal/config" "github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire" "github.com/google/wire"
"github.com/redis/go-redis/v9"
) )
// BuildInfo contains build information // BuildInfo contains build information
...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi ...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc return svc
} }
// ProvideOpsMetricsCollector builds the OpsMetricsCollector and launches its
// background collection loop before handing it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	c := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
	c.Start()
	return c
}
// ProvideOpsAggregationService builds the hourly/daily pre-aggregation service
// and starts its background loop before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	aggregator := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	aggregator.Start()
	return aggregator
}
// ProvideOpsAlertEvaluatorService builds the alert-rule evaluator and starts
// its background loop before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	evaluator := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	evaluator.Start()
	return evaluator
}
// ProvideOpsCleanupService builds the cron-scheduled ops cleanup service and
// starts it before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	cleaner := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	cleaner.Start()
	return cleaner
}
// ProvideOpsScheduledReportService builds the scheduled ops report service and
// starts it before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	reporter := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
	reporter.Start()
	return reporter
}
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
return apiKeyService return apiKeyService
...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( ...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService, NewAccountUsageService,
NewAccountTestService, NewAccountTestService,
NewSettingService, NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
ProvideOpsScheduledReportService,
NewEmailService, NewEmailService,
ProvideEmailQueueService, ProvideEmailQueueService,
NewTurnstileService, NewTurnstileService,
......
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
-- NOTE(review): SET LOCAL only takes effect inside a transaction block;
-- assumes the migration runner wraps each migration in a transaction — confirm.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
CREATE TABLE IF NOT EXISTS ops_error_logs (
id BIGSERIAL PRIMARY KEY,
-- Correlation / identities (no FKs by design, see header note)
request_id VARCHAR(64),
client_request_id VARCHAR(64),
user_id BIGINT,
api_key_id BIGINT,
account_id BIGINT,
group_id BIGINT,
client_ip inet,
-- Dimensions for global filtering
platform VARCHAR(32),
-- Request metadata
model VARCHAR(100),
request_path VARCHAR(256),
stream BOOLEAN NOT NULL DEFAULT false,
user_agent TEXT,
-- Core error classification
error_phase VARCHAR(32) NOT NULL,
error_type VARCHAR(64) NOT NULL,
severity VARCHAR(8) NOT NULL DEFAULT 'P2', -- priority bucket; presumably P0..P3 — confirm with ingest code
status_code INT,
-- vNext metric semantics
is_business_limited BOOLEAN NOT NULL DEFAULT false,
-- Error details (sanitized/truncated at ingest time)
error_message TEXT,
error_body TEXT,
-- Provider/upstream details (optional; useful for trends & account health)
error_source VARCHAR(64),
error_owner VARCHAR(32),
account_status VARCHAR(50),
upstream_status_code INT,
upstream_error_message TEXT,
upstream_error_detail TEXT,
provider_error_code VARCHAR(64),
provider_error_type VARCHAR(64),
network_error_type VARCHAR(50),
retry_after_seconds INT,
-- Timings (ms) - optional
duration_ms INT,
time_to_first_token_ms BIGINT,
auth_latency_ms BIGINT,
routing_latency_ms BIGINT,
upstream_latency_ms BIGINT,
response_latency_ms BIGINT,
-- Retry context (only stored for error requests)
request_body JSONB,
request_headers JSONB,
request_body_truncated BOOLEAN NOT NULL DEFAULT false,
request_body_bytes INT,
-- Retryability flags (best-effort classification)
is_retryable BOOLEAN NOT NULL DEFAULT false,
retry_count INT NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
requested_by_user_id BIGINT,
-- presumably references ops_error_logs.id (no FK by design) — confirm
source_error_id BIGINT,
-- client|upstream
mode VARCHAR(16) NOT NULL,
-- only meaningful for mode=upstream (pin retry to a specific account) — confirm
pinned_account_id BIGINT,
-- queued|running|succeeded|failed
status VARCHAR(16) NOT NULL DEFAULT 'queued',
started_at TIMESTAMPTZ,
finished_at TIMESTAMPTZ,
duration_ms BIGINT,
-- Optional result correlation
result_request_id VARCHAR(64),
result_error_id BIGINT,
result_usage_request_id VARCHAR(64),
error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
CREATE TABLE IF NOT EXISTS ops_system_metrics (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Width of the aggregation window for this snapshot row
window_minutes INT NOT NULL DEFAULT 1,
-- Optional dimensions (only if collector chooses to write per-dimension snapshots)
platform VARCHAR(32),
group_id BIGINT,
-- Core counts
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Rates (presumably requests/sec and tokens/sec over the window — confirm with collector)
qps DOUBLE PRECISION,
tps DOUBLE PRECISION,
-- Duration percentiles (ms) - success requests
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
duration_avg_ms DOUBLE PRECISION,
duration_max_ms INT,
-- TTFT (time to first token) percentiles (ms) - success requests (streaming)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
ttft_avg_ms DOUBLE PRECISION,
ttft_max_ms INT,
-- System resources
cpu_usage_percent DOUBLE PRECISION,
memory_used_mb BIGINT,
memory_total_mb BIGINT,
memory_usage_percent DOUBLE PRECISION,
-- Dependency health (best-effort)
db_ok BOOLEAN,
redis_ok BOOLEAN,
-- DB pool & runtime
db_conn_active INT,
db_conn_idle INT,
db_conn_waiting INT,
goroutine_count INT,
-- Queue / concurrency
concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per background job, keyed by job_name (presumably upserted by the
-- job itself on each run — confirm with the writer code).
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
job_name VARCHAR(64) PRIMARY KEY,
last_run_at TIMESTAMPTZ,
last_success_at TIMESTAMPTZ,
last_error_at TIMESTAMPTZ,
last_error TEXT,
last_duration_ms BIGINT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
CREATE TABLE IF NOT EXISTS ops_alert_rules (
id BIGSERIAL PRIMARY KEY,
name VARCHAR(128) NOT NULL,
description TEXT,
enabled BOOLEAN NOT NULL DEFAULT true,
severity VARCHAR(16) NOT NULL DEFAULT 'warning',
-- Metric definition
metric_type VARCHAR(64) NOT NULL,
operator VARCHAR(8) NOT NULL,
threshold DOUBLE PRECISION NOT NULL,
window_minutes INT NOT NULL DEFAULT 5,
sustained_minutes INT NOT NULL DEFAULT 5,
cooldown_minutes INT NOT NULL DEFAULT 10,
-- Optional scoping: platform/group filters etc.
filters JSONB,
last_triggered_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Rule names are unique so rules can be addressed/seeded by name.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
ON ops_alert_rules (enabled);
CREATE TABLE IF NOT EXISTS ops_alert_events (
id BIGSERIAL PRIMARY KEY,
-- presumably references ops_alert_rules.id (no FK by design) — confirm
rule_id BIGINT,
severity VARCHAR(16) NOT NULL,
-- 'firing' until resolved_at is set — confirm value set with evaluator code
status VARCHAR(16) NOT NULL DEFAULT 'firing',
title VARCHAR(200),
description TEXT,
metric_value DOUBLE PRECISION,
threshold_value DOUBLE PRECISION,
dimensions JSONB,
fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
resolved_at TIMESTAMPTZ,
email_sent BOOLEAN NOT NULL DEFAULT false,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
id BIGSERIAL PRIMARY KEY,
bucket_start TIMESTAMPTZ NOT NULL,
-- NULL platform + NULL group_id = "overall" row; otherwise per-dimension row
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Duration percentiles (ms)
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
-- TTFT percentiles (ms)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three "dimension modes" (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
ON ops_metrics_hourly (
bucket_start,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
ON ops_metrics_hourly (bucket_start DESC);
-- Partial indexes keep the per-dimension scans small and skip overall rows.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
ON ops_metrics_hourly (platform, bucket_start DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
ON ops_metrics_hourly (group_id, bucket_start DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same layout and dimension-mode conventions as ops_metrics_hourly, bucketed by date.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
id BIGSERIAL PRIMARY KEY,
bucket_date DATE NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- COALESCE-based uniqueness: UNIQUE with NULLs would allow duplicate dimension rows.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
ON ops_metrics_daily (
bucket_date,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
ON ops_metrics_daily (platform, bucket_date DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
ON ops_metrics_daily (group_id, bucket_date DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
-- NOTE(review): SET LOCAL only has effect inside a transaction block — confirm the
-- migration runner wraps each file in BEGIN/COMMIT, otherwise these are no-ops.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- ops_error_logs: one index per common dashboard filter, always paired with
-- created_at DESC to serve "most recent first" listings.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
    ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
    ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
    ON ops_error_logs (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
    ON ops_error_logs (account_id, created_at DESC)
    WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
    ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
    ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
    ON ops_error_logs (error_type, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
    ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
    ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
    ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
    ON ops_system_metrics (platform, created_at DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
    ON ops_system_metrics (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
    ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
    ON ops_retry_attempts (source_error_id, created_at DESC)
    WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
-- The partial predicate means at most one non-terminal attempt may exist per source error.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
    ON ops_retry_attempts (source_error_id)
    WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Best-effort: swallow CREATE EXTENSION failures, then only build trigram GIN
-- indexes if the extension actually ended up installed.
DO $$
BEGIN
    BEGIN
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
    EXCEPTION WHEN OTHERS THEN
        -- Missing privileges or extension package should not block migrations.
        RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
    END;
    IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
        -- request_id / client_request_id fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
            ON ops_error_logs USING gin (request_id gin_trgm_ops)';
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
            ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
        -- error_message fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
            ON ops_error_logs USING gin (error_message gin_trgm_ops)';
    END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
--   it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table (avg stored as DOUBLE PRECISION; max as INT milliseconds).
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table (same shape as the hourly additions above).
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Default true so existing rules keep emailing after the upgrade.
ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
--   - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
--   - upstream_error_rate excludes 429/529.
-- NOTE(review): ON CONFLICT (name) requires a unique constraint/index on
-- ops_alert_rules.name — confirm it exists in the table's DDL.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- 1) High error rate (P1): error_rate > 5% over a 5-minute window, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率过高',
    '当错误率超过 5% 且持续 5 分钟时触发告警',
    true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 2) Low success rate (P0): success_rate < 95% over 5 minutes, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '成功率过低',
    '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
    true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 3) P99 latency too high (P2): p99 > 3000ms, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P99延迟过高',
    '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
    true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 4) P95 latency too high (P2): p95 > 2000ms, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P95延迟过高',
    '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
    true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 5) CPU usage too high (P2): cpu > 85%, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'CPU使用率过高',
    '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
    true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 6) Memory usage too high (P1): memory > 90%, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '内存使用率过高',
    '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
    true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 7) Concurrency queue buildup (P1): queue depth > 100, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '并发队列积压',
    '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
    true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 8) Extremely high error rate (P0): error_rate > 20% over a 1-minute window.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率极高',
    '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
-- This migration is intentionally idempotent.
ALTER TABLE ops_system_metrics
    ADD COLUMN IF NOT EXISTS redis_conn_total INT,
    ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
--
-- This is intentionally idempotent.
-- Nullable JSONB: absent for requests without upstream errors.
ALTER TABLE ops_error_logs
    ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
    'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,41 @@ redis: ...@@ -302,6 +302,41 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Hard switch: disable all ops background jobs and APIs when false
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
enabled: true
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
# 优先使用预聚合表(用于长时间窗口查询性能)
use_preaggregated_tables: false
# Data cleanup configuration
# 数据清理配置(vNext 默认统一保留 30 天)
cleanup:
enabled: true
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
schedule: "0 2 * * *"
error_log_retention_days: 30
minute_metrics_retention_days: 30
hourly_metrics_retention_days: 30
# Pre-aggregation configuration
# 预聚合任务配置
aggregation:
enabled: true
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
metrics_collector_cache:
enabled: true
ttl: 65s
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= ...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
GEMINI_QUOTA_POLICY= GEMINI_QUOTA_POLICY=
# -----------------------------------------------------------------------------
# Ops Monitoring Configuration (运维监控配置)
# -----------------------------------------------------------------------------
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
OPS_ENABLED=true
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Update Configuration (在线更新配置) # Update Configuration (在线更新配置)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
......
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,19 @@ redis: ...@@ -302,6 +302,19 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
enabled: true
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -17,6 +17,7 @@ import usageAPI from './usage' ...@@ -17,6 +17,7 @@ import usageAPI from './usage'
import geminiAPI from './gemini' import geminiAPI from './gemini'
import antigravityAPI from './antigravity' import antigravityAPI from './antigravity'
import userAttributesAPI from './userAttributes' import userAttributesAPI from './userAttributes'
import opsAPI from './ops'
/** /**
* Unified admin API object for convenient access * Unified admin API object for convenient access
...@@ -35,7 +36,8 @@ export const adminAPI = { ...@@ -35,7 +36,8 @@ export const adminAPI = {
usage: usageAPI, usage: usageAPI,
gemini: geminiAPI, gemini: geminiAPI,
antigravity: antigravityAPI, antigravity: antigravityAPI,
userAttributes: userAttributesAPI userAttributes: userAttributesAPI,
ops: opsAPI
} }
export { export {
...@@ -52,7 +54,8 @@ export { ...@@ -52,7 +54,8 @@ export {
usageAPI, usageAPI,
geminiAPI, geminiAPI,
antigravityAPI, antigravityAPI,
userAttributesAPI userAttributesAPI,
opsAPI
} }
export default adminAPI export default adminAPI
/**
* Admin Ops API endpoints (vNext)
* - Error logs list/detail + retry (client/upstream)
* - Dashboard overview (raw path)
*/
import { apiClient } from '../client'
import type { PaginatedResponse } from '@/types'
// Retry execution mode: 'client' replays via the gateway path, 'upstream' targets the upstream directly.
export type OpsRetryMode = 'client' | 'upstream'
// Dashboard query mode: 'auto' lets the server choose between raw scans and pre-aggregated tables.
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
// Common per-request options (currently only cancellation support).
export interface OpsRequestOptions {
  signal?: AbortSignal
}
// Request body for POSTing a retry; optionally pin the retry to a specific account.
export interface OpsRetryRequest {
  mode: OpsRetryMode
  pinned_account_id?: number
}
// Server response describing a single retry attempt and its outcome.
export interface OpsRetryResult {
  attempt_id: number
  mode: OpsRetryMode
  // Open string union: server may add statuses beyond the known three.
  status: 'running' | 'succeeded' | 'failed' | string
  pinned_account_id?: number | null
  used_account_id?: number | null
  http_status_code: number
  upstream_request_id: string
  // Truncated response body preview; response_truncated marks whether it was cut.
  response_preview: string
  response_truncated: boolean
  error_message: string
  started_at: string
  finished_at: string
  duration_ms: number
}
// Aggregated dashboard overview for a time window, optionally scoped by platform/group.
export interface OpsDashboardOverview {
  start_time: string
  end_time: string
  platform: string
  group_id?: number | null
  health_score?: number
  system_metrics?: OpsSystemMetricsSnapshot | null
  job_heartbeats?: OpsJobHeartbeat[] | null
  success_count: number
  error_count_total: number
  // Business-limited errors are tracked separately and excluded from SLA-scope counts.
  business_limited_count: number
  error_count_sla: number
  request_count_total: number
  request_count_sla: number
  token_consumed: number
  sla: number
  error_rate: number
  // Upstream error rate excludes 429/529, which are counted separately below.
  upstream_error_rate: number
  upstream_error_count_excl_429_529: number
  upstream_429_count: number
  upstream_529_count: number
  qps: {
    current: number
    peak: number
    avg: number
  }
  tps: {
    current: number
    peak: number
    avg: number
  }
  duration: OpsPercentiles
  ttft: OpsPercentiles
}
// Latency summary in milliseconds; fields are null/absent when no data exists for the window.
export interface OpsPercentiles {
  p50_ms?: number | null
  p90_ms?: number | null
  p95_ms?: number | null
  p99_ms?: number | null
  avg_ms?: number | null
  max_ms?: number | null
}
// One time-bucketed throughput sample (requests, tokens, derived QPS/TPS).
export interface OpsThroughputTrendPoint {
  bucket_start: string
  request_count: number
  token_consumed: number
  qps: number
  tps: number
}
// Per-platform throughput breakdown for the selected window.
export interface OpsThroughputPlatformBreakdownItem {
  platform: string
  request_count: number
  token_consumed: number
}
// Per-group throughput breakdown for the selected window.
export interface OpsThroughputGroupBreakdownItem {
  group_id: number
  group_name: string
  request_count: number
  token_consumed: number
}
// Throughput trend: `bucket` names the bucket granularity; breakdowns are optional.
export interface OpsThroughputTrendResponse {
  bucket: string
  points: OpsThroughputTrendPoint[]
  by_platform?: OpsThroughputPlatformBreakdownItem[]
  top_groups?: OpsThroughputGroupBreakdownItem[]
}
// A single request record is either a success or an error row.
export type OpsRequestKind = 'success' | 'error'
// Filter value for listings: either kind, or both.
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
// Supported sort orders for the request-details listing.
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
// One row in the request-details listing; error rows carry error_id/phase/severity/message.
export interface OpsRequestDetail {
  kind: OpsRequestKind
  created_at: string
  request_id: string
  platform?: string
  model?: string
  duration_ms?: number | null
  status_code?: number | null
  error_id?: number | null
  phase?: string
  severity?: string
  message?: string
  user_id?: number | null
  api_key_id?: number | null
  account_id?: number | null
  group_id?: number | null
  stream?: boolean
}
// Query parameters for the request-details listing.
// Either time_range or explicit start_time/end_time selects the window.
export interface OpsRequestDetailsParams {
  time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
  start_time?: string
  end_time?: string
  kind?: OpsRequestDetailsKind
  platform?: string
  group_id?: number | null
  user_id?: number
  api_key_id?: number
  account_id?: number
  model?: string
  request_id?: string
  // Free-text search term.
  q?: string
  min_duration_ms?: number
  max_duration_ms?: number
  sort?: OpsRequestDetailsSort
  page?: number
  page_size?: number
}
// Paginated request-details response.
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
// One histogram bar: human-readable latency range label plus its request count.
export interface OpsLatencyHistogramBucket {
  range: string
  count: number
}
// Latency histogram for a window, optionally scoped by platform/group.
export interface OpsLatencyHistogramResponse {
  start_time: string
  end_time: string
  platform: string
  group_id?: number | null
  total_requests: number
  buckets: OpsLatencyHistogramBucket[]
}
// One time-bucketed error sample, split by classification (SLA vs business-limited, upstream buckets).
export interface OpsErrorTrendPoint {
  bucket_start: string
  error_count_total: number
  business_limited_count: number
  error_count_sla: number
  upstream_error_count_excl_429_529: number
  upstream_429_count: number
  upstream_529_count: number
}
// Error trend: `bucket` names the bucket granularity.
export interface OpsErrorTrendResponse {
  bucket: string
  points: OpsErrorTrendPoint[]
}
// Error distribution by HTTP status code; `sla` and `business_limited` partition `total`.
export interface OpsErrorDistributionItem {
  status_code: number
  total: number
  sla: number
  business_limited: number
}
export interface OpsErrorDistributionResponse {
  total: number
  items: OpsErrorDistributionItem[]
}
// Point-in-time system health snapshot (host, DB, Redis, runtime counters).
// Most fields are nullable because collection of each metric is best-effort.
export interface OpsSystemMetricsSnapshot {
  id: number
  created_at: string
  window_minutes: number
  cpu_usage_percent?: number | null
  memory_used_mb?: number | null
  memory_total_mb?: number | null
  memory_usage_percent?: number | null
  db_ok?: boolean | null
  redis_ok?: boolean | null
  // Config-derived limits (best-effort) for rendering "current vs max".
  db_max_open_conns?: number | null
  redis_pool_size?: number | null
  redis_conn_total?: number | null
  redis_conn_idle?: number | null
  db_conn_active?: number | null
  db_conn_idle?: number | null
  db_conn_waiting?: number | null
  goroutine_count?: number | null
  concurrency_queue_depth?: number | null
}
// Last-run bookkeeping for a background ops job (cleanup, aggregation, etc.).
export interface OpsJobHeartbeat {
  job_name: string
  last_run_at?: string | null
  last_success_at?: string | null
  last_error_at?: string | null
  last_error?: string | null
  last_duration_ms?: number | null
  updated_at: string
}
// Concurrency usage at platform scope: in-use slots vs capacity plus queue depth.
export interface PlatformConcurrencyInfo {
  platform: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Concurrency usage at group scope.
export interface GroupConcurrencyInfo {
  group_id: number
  group_name: string
  platform: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Concurrency usage at account scope.
export interface AccountConcurrencyInfo {
  account_id: number
  account_name?: string
  platform: string
  group_id: number
  group_name: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Full concurrency stats payload, keyed maps per scope.
// `enabled` is false when concurrency tracking is turned off server-side.
export interface OpsConcurrencyStatsResponse {
  enabled: boolean
  platform: Record<string, PlatformConcurrencyInfo>
  group: Record<string, GroupConcurrencyInfo>
  account: Record<string, AccountConcurrencyInfo>
  timestamp?: string
}
/**
 * Fetch current concurrency stats (platform/group/account scopes).
 * Filters are only sent when meaningful: a non-empty platform string and a
 * positive numeric group id.
 */
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
  const query: Record<string, any> = {}
  if (platform) query.platform = platform
  if (typeof groupId === 'number' && groupId > 0) query.group_id = groupId
  const response = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params: query })
  return response.data
}
// Account availability rollup at platform scope.
export interface PlatformAvailability {
  platform: string
  total_accounts: number
  available_count: number
  rate_limit_count: number
  error_count: number
}
// Account availability rollup at group scope.
export interface GroupAvailability {
  group_id: number
  group_name: string
  platform: string
  total_accounts: number
  available_count: number
  rate_limit_count: number
  error_count: number
}
// Per-account availability detail, including rate-limit/overload countdowns.
export interface AccountAvailability {
  account_id: number
  account_name: string
  platform: string
  group_id: number
  group_name: string
  status: string
  is_available: boolean
  is_rate_limited: boolean
  rate_limit_reset_at?: string
  rate_limit_remaining_sec?: number
  is_overloaded: boolean
  overload_until?: string
  overload_remaining_sec?: number
  has_error: boolean
  error_message?: string
}
// Full availability payload, keyed maps per scope; `enabled` mirrors the server feature flag.
export interface OpsAccountAvailabilityStatsResponse {
  enabled: boolean
  platform: Record<string, PlatformAvailability>
  group: Record<string, GroupAvailability>
  account: Record<string, AccountAvailability>
  timestamp?: string
}
/**
 * Fetch account availability stats (platform/group/account scopes).
 * Filters are only sent when meaningful: a non-empty platform string and a
 * positive numeric group id.
 */
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
  const query: Record<string, any> = {}
  if (platform) query.platform = platform
  if (typeof groupId === 'number' && groupId > 0) query.group_id = groupId
  const response = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params: query })
  return response.data
}
/**
 * Subscribe to realtime QPS updates via WebSocket.
 *
 * Note: browsers cannot set Authorization headers for WebSockets.
 * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
 *   ["sub2api-admin", "jwt.<token>"]
 */
export interface SubscribeQPSOptions {
  // Admin JWT; falls back to localStorage 'auth_token' when omitted.
  token?: string | null
  onOpen?: () => void
  onClose?: (event: CloseEvent) => void
  onError?: (event: Event) => void
  /**
   * Called when the server closes with an application close code that indicates
   * reconnecting is not useful (e.g. feature flag disabled).
   */
  onFatalClose?: (event: CloseEvent) => void
  /**
   * More granular status updates for UI (connecting/reconnecting/offline/etc).
   */
  onStatusChange?: (status: OpsWSStatus) => void
  /**
   * Called when a reconnect is scheduled (helps display "retry in Xs").
   */
  onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
  // Host override for the WS URL; defaults to VITE_WS_BASE_URL or window.location.host.
  wsBaseUrl?: string
  /**
   * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
   * Set to 0 to disable reconnect.
   */
  maxReconnectAttempts?: number
  reconnectBaseDelayMs?: number
  reconnectMaxDelayMs?: number
  /**
   * Stale connection detection (heartbeat-by-observation).
   * If no messages are received within this window, the socket is closed to trigger a reconnect.
   * Set to 0 to disable.
   */
  staleTimeoutMs?: number
  /**
   * How often to check staleness. Only used when `staleTimeoutMs > 0`.
   */
  staleCheckIntervalMs?: number
}
// Connection lifecycle states surfaced to the UI via onStatusChange.
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
// Application-level WS close codes the server may send.
export const OPS_WS_CLOSE_CODES = {
  REALTIME_DISABLED: 4001
} as const
// Base subprotocol identifying the admin dashboard client.
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
/**
 * Open a realtime QPS WebSocket and invoke `onMessage` for each parsed payload.
 * Handles exponential-backoff reconnect with jitter, browser online/offline
 * transitions, stale-connection detection, and a server-directed fatal close.
 * Returns an unsubscribe function that tears everything down.
 */
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
  let ws: WebSocket | null = null
  let reconnectAttempts = 0
  const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
    ? (options.maxReconnectAttempts as number)
    : Infinity
  const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
  const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
  let reconnectTimer: ReturnType<typeof setTimeout> | null = null
  let shouldReconnect = true
  let isConnecting = false
  let hasConnectedOnce = false
  let lastMessageAt = 0
  const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
  const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
  let staleTimer: ReturnType<typeof setInterval> | null = null
  const setStatus = (status: OpsWSStatus) => {
    options.onStatusChange?.(status)
  }
  const clearReconnectTimer = () => {
    if (reconnectTimer) {
      clearTimeout(reconnectTimer)
      reconnectTimer = null
    }
  }
  const clearStaleTimer = () => {
    if (staleTimer) {
      clearInterval(staleTimer)
      staleTimer = null
    }
  }
  // Periodically checks message recency; closing a silent socket reuses the
  // normal onclose -> scheduleReconnect path rather than reconnecting directly.
  const startStaleTimer = () => {
    clearStaleTimer()
    if (!staleTimeoutMs || staleTimeoutMs <= 0) return
    staleTimer = setInterval(() => {
      if (!shouldReconnect) return
      if (!ws || ws.readyState !== WebSocket.OPEN) return
      if (!lastMessageAt) return
      const ageMs = Date.now() - lastMessageAt
      if (ageMs > staleTimeoutMs) {
        // Treat as a half-open connection; closing triggers the normal reconnect path.
        ws.close()
      }
    }, staleCheckIntervalMs)
  }
  // Exponential backoff (base * 2^attempts, capped) plus up to 250ms jitter.
  const scheduleReconnect = () => {
    if (!shouldReconnect) return
    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
    // If we're offline, wait for the browser to come back online.
    if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
      setStatus('offline')
      return
    }
    const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
    const delay = Math.min(expDelay, maxDelayMs)
    const jitter = Math.floor(Math.random() * 250)
    clearReconnectTimer()
    reconnectTimer = setTimeout(() => {
      reconnectAttempts++
      connect()
    }, delay + jitter)
    options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
  }
  const handleOnline = () => {
    if (!shouldReconnect) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    connect()
  }
  const handleOffline = () => {
    setStatus('offline')
  }
  const connect = () => {
    // Guards: unsubscribed, already connecting/connected, or attempts exhausted.
    if (!shouldReconnect) return
    if (isConnecting) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
    isConnecting = true
    setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
    const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
    // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
    // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
    // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
    const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
    const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
    if (rawToken) protocols.push(`jwt.${rawToken}`)
    ws = new WebSocket(wsURL.toString(), protocols)
    ws.onopen = () => {
      // Reset backoff state on a successful connection.
      reconnectAttempts = 0
      isConnecting = false
      hasConnectedOnce = true
      clearReconnectTimer()
      lastMessageAt = Date.now()
      startStaleTimer()
      setStatus('connected')
      options.onOpen?.()
    }
    ws.onmessage = (e) => {
      try {
        const data = JSON.parse(e.data)
        lastMessageAt = Date.now()
        onMessage(data)
      } catch (err) {
        console.warn('[OpsWS] Failed to parse message:', err)
      }
    }
    ws.onerror = (error) => {
      console.error('[OpsWS] Connection error:', error)
      options.onError?.(error)
    }
    ws.onclose = (event) => {
      isConnecting = false
      options.onClose?.(event)
      clearStaleTimer()
      ws = null
      // If the server explicitly tells us to stop reconnecting, honor it.
      if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
        shouldReconnect = false
        clearReconnectTimer()
        setStatus('closed')
        options.onFatalClose?.(event)
        return
      }
      scheduleReconnect()
    }
  }
  window.addEventListener('online', handleOnline)
  window.addEventListener('offline', handleOffline)
  connect()
  // Unsubscribe: stop reconnects, remove listeners, cancel timers, close the socket.
  return () => {
    shouldReconnect = false
    window.removeEventListener('online', handleOnline)
    window.removeEventListener('offline', handleOffline)
    clearReconnectTimer()
    clearStaleTimer()
    if (ws) ws.close()
    ws = null
    setStatus('closed')
  }
}
// Severity/phase are open strings; the server defines the value set (e.g. P0/P1/P2).
export type OpsSeverity = string
export type OpsPhase = string
// Email notification severity tiers (distinct from rule severities above).
export type AlertSeverity = 'critical' | 'warning' | 'info'
export type ThresholdMode = 'count' | 'percentage' | 'both'
// Metrics an alert rule can evaluate; mirrors the server-side metric registry.
export type MetricType =
  | 'success_rate'
  | 'error_rate'
  | 'upstream_error_rate'
  | 'p95_latency_ms'
  | 'p99_latency_ms'
  | 'cpu_usage_percent'
  | 'memory_usage_percent'
  | 'concurrency_queue_depth'
  | 'group_available_accounts'
  | 'group_available_ratio'
  | 'group_rate_limit_ratio'
  | 'account_rate_limited_count'
  | 'account_error_count'
  | 'account_error_ratio'
  | 'overload_account_count'
// Comparison operator applied between the observed metric and the threshold.
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
// Alert rule definition; `id` and the timestamps are server-assigned.
export interface AlertRule {
  id?: number
  name: string
  description?: string
  enabled: boolean
  metric_type: MetricType
  operator: Operator
  threshold: number
  // Evaluation window size and how long the condition must hold before firing.
  window_minutes: number
  sustained_minutes: number
  severity: OpsSeverity
  cooldown_minutes: number
  notify_email: boolean
  filters?: Record<string, any>
  created_at?: string
  updated_at?: string
  last_triggered_at?: string | null
}
// A fired (or resolved) alert instance produced by a rule.
export interface AlertEvent {
  id: number
  rule_id: number
  severity: OpsSeverity | string
  status: 'firing' | 'resolved' | string
  title?: string
  description?: string
  metric_value?: number
  threshold_value?: number
  dimensions?: Record<string, any>
  fired_at: string
  resolved_at?: string | null
  email_sent: boolean
  created_at: string
}
// Email notification settings, split into alert delivery and scheduled reports.
export interface EmailNotificationConfig {
  alert: {
    enabled: boolean
    recipients: string[]
    min_severity: AlertSeverity | ''
    rate_limit_per_hour: number
    batching_window_seconds: number
    include_resolved_alerts: boolean
  }
  report: {
    enabled: boolean
    recipients: string[]
    daily_summary_enabled: boolean
    daily_summary_schedule: string
    weekly_summary_enabled: boolean
    weekly_summary_schedule: string
    error_digest_enabled: boolean
    error_digest_schedule: string
    error_digest_min_count: number
    account_health_enabled: boolean
    account_health_schedule: string
    account_health_error_rate_threshold: number
  }
}
// Distributed lock used to ensure a single evaluator in multi-replica deployments.
export interface OpsDistributedLockSettings {
  enabled: boolean
  key: string
  ttl_seconds: number
}
// Runtime tuning for the alert evaluator, including global/per-rule silencing.
export interface OpsAlertRuntimeSettings {
  evaluation_interval_seconds: number
  distributed_lock: OpsDistributedLockSettings
  silencing: {
    enabled: boolean
    global_until_rfc3339: string
    global_reason: string
    entries?: Array<{
      rule_id?: number
      severities?: Array<OpsSeverity | string>
      until_rfc3339: string
      reason: string
    }>
  }
}
// Advanced ops settings: retention/cleanup plus pre-aggregation toggles.
export interface OpsAdvancedSettings {
  data_retention: OpsDataRetentionSettings
  aggregation: OpsAggregationSettings
}
export interface OpsDataRetentionSettings {
  cleanup_enabled: boolean
  // Cron expression for the cleanup job schedule.
  cleanup_schedule: string
  error_log_retention_days: number
  minute_metrics_retention_days: number
  hourly_metrics_retention_days: number
}
export interface OpsAggregationSettings {
  aggregation_enabled: boolean
}
/** A single error-log row as returned by GET /admin/ops/errors. */
export interface OpsErrorLog {
  id: number
  created_at: string
  phase: OpsPhase // request lifecycle phase where the error occurred
  type: string
  severity: OpsSeverity
  status_code: number
  platform: string
  model: string
  latency_ms?: number | null
  client_request_id: string
  request_id: string
  message: string
  // Optional associations; null/absent when not applicable.
  user_id?: number | null
  api_key_id?: number | null
  account_id?: number | null
  group_id?: number | null
  client_ip?: string | null
  request_path?: string
  stream?: boolean // whether the original request was a streaming request
}

/** Full error detail (GET /admin/ops/errors/:id) — extends the list row. */
export interface OpsErrorDetail extends OpsErrorLog {
  error_body: string
  user_agent: string
  // Upstream context (optional; enriched by gateway services)
  upstream_status_code?: number | null
  upstream_error_message?: string
  upstream_error_detail?: string
  upstream_errors?: string
  // Per-stage latency breakdown (milliseconds).
  auth_latency_ms?: number | null
  routing_latency_ms?: number | null
  upstream_latency_ms?: number | null
  response_latency_ms?: number | null
  time_to_first_token_ms?: number | null
  request_body: string
  request_body_truncated: boolean // true when request_body was trimmed server-side
  request_body_bytes?: number | null // original size before truncation
  is_business_limited: boolean
}

/** Paginated error-log listing payload. */
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
/**
 * Fetches the ops dashboard overview (GET /admin/ops/dashboard/overview).
 * Accepts either a preset time_range or an explicit start/end window, plus
 * optional platform/group filters. `options.signal` supports cancellation.
 */
export async function getDashboardOverview(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
  const response = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
    params,
    signal: options.signal
  })
  return response.data
}
/**
 * Fetches the throughput trend series (GET /admin/ops/dashboard/throughput-trend).
 * Same filter semantics as the other dashboard endpoints; cancellable via
 * `options.signal`.
 */
export async function getThroughputTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
  const config = { params, signal: options.signal }
  const response = await apiClient.get<OpsThroughputTrendResponse>(
    '/admin/ops/dashboard/throughput-trend',
    config
  )
  return response.data
}
/**
 * Fetches latency histogram buckets (GET /admin/ops/dashboard/latency-histogram).
 * Cancellable via `options.signal`.
 */
export async function getLatencyHistogram(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
  const response = await apiClient.get<OpsLatencyHistogramResponse>(
    '/admin/ops/dashboard/latency-histogram',
    { params, signal: options.signal }
  )
  return response.data
}
/**
 * Fetches the error trend series (GET /admin/ops/dashboard/error-trend).
 * Cancellable via `options.signal`.
 */
export async function getErrorTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
  const response = await apiClient.get<OpsErrorTrendResponse>(
    '/admin/ops/dashboard/error-trend',
    { params, signal: options.signal }
  )
  return response.data
}
/**
 * Fetches the error distribution breakdown (GET /admin/ops/dashboard/error-distribution).
 * Cancellable via `options.signal`.
 */
export async function getErrorDistribution(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
  const config = { params, signal: options.signal }
  const response = await apiClient.get<OpsErrorDistributionResponse>(
    '/admin/ops/dashboard/error-distribution',
    config
  )
  return response.data
}
/**
 * Lists error logs with pagination (GET /admin/ops/errors).
 * Supports time-range, platform/group/account, phase, free-text (`q`)
 * and comma-separated status-code filters.
 */
export async function listErrorLogs(params: {
  page?: number
  page_size?: number
  time_range?: string
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
  q?: string
  status_codes?: string
}): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return response.data
}
/** Fetches the full detail for one error log (GET /admin/ops/errors/:id). */
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
  const response = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
  return response.data
}
/** Replays the request behind an error log (POST /admin/ops/errors/:id/retry). */
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
  const url = `/admin/ops/errors/${id}/retry`
  const response = await apiClient.post<OpsRetryResult>(url, req)
  return response.data
}
/** Lists per-request detail rows (GET /admin/ops/requests). */
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return response.data
}
// Alert rules
/** Lists all alert rules (GET /admin/ops/alert-rules). */
export async function listAlertRules(): Promise<AlertRule[]> {
  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
  return response.data
}
/** Creates an alert rule (POST /admin/ops/alert-rules); returns the stored rule. */
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
  return response.data
}
/** Updates an alert rule (PUT /admin/ops/alert-rules/:id); partial payloads allowed. */
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
  const url = `/admin/ops/alert-rules/${id}`
  const response = await apiClient.put<AlertRule>(url, rule)
  return response.data
}
/** Deletes an alert rule (DELETE /admin/ops/alert-rules/:id). */
export async function deleteAlertRule(id: number): Promise<void> {
  const url = `/admin/ops/alert-rules/${id}`
  await apiClient.delete(url)
}
/** Lists recent alert events (GET /admin/ops/alert-events), newest first per server order. */
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', {
    params: { limit }
  })
  return response.data
}
// Email notification config
/** Reads the email notification settings (GET /admin/ops/email-notification/config). */
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
  return response.data
}
/** Persists the email notification settings (PUT /admin/ops/email-notification/config). */
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
  const response = await apiClient.put<EmailNotificationConfig>(
    '/admin/ops/email-notification/config',
    config
  )
  return response.data
}
// Runtime settings (DB-backed)
/** Reads the DB-backed alert runtime settings (GET /admin/ops/runtime/alert). */
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
  return response.data
}
/** Persists the DB-backed alert runtime settings (PUT /admin/ops/runtime/alert). */
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
  return response.data
}
// Advanced settings (DB-backed)
/** Reads the ops advanced settings (GET /admin/ops/advanced-settings). */
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
  const response = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
  return response.data
}
/** Persists the ops advanced settings (PUT /admin/ops/advanced-settings). */
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
  const response = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
  return response.data
}
// Aggregate object re-exporting every ops API function for convenient
// single-import usage. Some entries (getConcurrencyStats,
// getAccountAvailabilityStats, subscribeQPS) are defined earlier in this file.
export const opsAPI = {
  getDashboardOverview,
  getThroughputTrend,
  getLatencyHistogram,
  getErrorTrend,
  getErrorDistribution,
  getConcurrencyStats,
  getAccountAvailabilityStats,
  subscribeQPS,
  listErrorLogs,
  getErrorLogDetail,
  retryErrorRequest,
  listRequestDetails,
  listAlertRules,
  createAlertRule,
  updateAlertRule,
  deleteAlertRule,
  listAlertEvents,
  getEmailNotificationConfig,
  updateEmailNotificationConfig,
  getAlertRuntimeSettings,
  updateAlertRuntimeSettings,
  getAdvancedSettings,
  updateAdvancedSettings
}

export default opsAPI
...@@ -35,14 +35,23 @@ export interface SystemSettings { ...@@ -35,14 +35,23 @@ export interface SystemSettings {
turnstile_enabled: boolean turnstile_enabled: boolean
turnstile_site_key: string turnstile_site_key: string
turnstile_secret_key_configured: boolean turnstile_secret_key_configured: boolean
// LinuxDo Connect OAuth 登录(终端用户 SSO)
linuxdo_connect_enabled: boolean // Model fallback configuration
linuxdo_connect_client_id: string enable_model_fallback: boolean
linuxdo_connect_client_secret_configured: boolean fallback_model_anthropic: string
linuxdo_connect_redirect_url: string fallback_model_openai: string
fallback_model_gemini: string
fallback_model_antigravity: string
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
enable_identity_patch: boolean enable_identity_patch: boolean
identity_patch_prompt: string identity_patch_prompt: string
// Ops Monitoring (vNext)
ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
...@@ -67,12 +76,17 @@ export interface UpdateSettingsRequest { ...@@ -67,12 +76,17 @@ export interface UpdateSettingsRequest {
turnstile_enabled?: boolean turnstile_enabled?: boolean
turnstile_site_key?: string turnstile_site_key?: string
turnstile_secret_key?: string turnstile_secret_key?: string
linuxdo_connect_enabled?: boolean enable_model_fallback?: boolean
linuxdo_connect_client_id?: string fallback_model_anthropic?: string
linuxdo_connect_client_secret?: string fallback_model_openai?: string
linuxdo_connect_redirect_url?: string fallback_model_gemini?: string
fallback_model_antigravity?: string
enable_identity_patch?: boolean enable_identity_patch?: boolean
identity_patch_prompt?: string identity_patch_prompt?: string
ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**
......
...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use( ...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
return response return response
}, },
(error: AxiosError<ApiResponse<unknown>>) => { (error: AxiosError<ApiResponse<unknown>>) => {
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
// Otherwise we'd misclassify it as a generic "network error".
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
return Promise.reject(error)
}
// Handle common errors // Handle common errors
if (error.response) { if (error.response) {
const { status, data } = error.response const { status, data } = error.response
const url = String(error.config?.url || '')
// Validate `data` shape to avoid HTML error pages breaking our error handling.
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
// from ops pages to avoid broken UI states.
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
try {
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
} catch {
// ignore localStorage failures
}
try {
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
} catch {
// ignore event failures
}
if (window.location.pathname.startsWith('/admin/ops')) {
window.location.href = '/admin/settings'
}
return Promise.reject({
status,
code: 'OPS_DISABLED',
message: apiData.message || error.message,
url
})
}
// 401: Unauthorized - clear token and redirect to login // 401: Unauthorized - clear token and redirect to login
if (status === 401) { if (status === 401) {
...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use( ...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
// Return structured error // Return structured error
return Promise.reject({ return Promise.reject({
status, status,
code: data?.code, code: apiData.code,
message: data?.message || error.message message: apiData.message || apiData.detail || error.message
}) })
} }
......
<script setup lang="ts">
// Inline info popover: a small help icon that reveals a tooltip-style panel
// on hover. Content comes from the `content` prop or the default slot.
import { ref } from 'vue'

// Default popover text; overridden when a default slot is provided.
defineProps<{
  content?: string
}>()

// Drives v-show on the panel; toggled by mouseenter/mouseleave on the wrapper.
const show = ref(false)
</script>

<template>
  <!-- Wrapper toggles `show`; `group` also drives the hover opacity transition. -->
  <div
    class="group relative ml-1 inline-flex items-center align-middle"
    @mouseenter="show = true"
    @mouseleave="show = false"
  >
    <!-- Trigger Icon (replaceable via the `trigger` slot) -->
    <slot name="trigger">
      <svg
        class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
        fill="none"
        viewBox="0 0 24 24"
        stroke="currentColor"
        stroke-width="2"
      >
        <path
          stroke-linecap="round"
          stroke-linejoin="round"
          d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
        />
      </svg>
    </slot>

    <!-- Popover Content: v-show gates rendering, group-hover fades it in. -->
    <div
      v-show="show"
      class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
    >
      <slot>{{ content }}</slot>
      <!-- Arrow: rotated square matching the panel background. -->
      <div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
    </div>
  </div>
</template>
...@@ -67,12 +67,13 @@ ...@@ -67,12 +67,13 @@
:aria-selected="isSelected(option)" :aria-selected="isSelected(option)"
:aria-disabled="isOptionDisabled(option)" :aria-disabled="isOptionDisabled(option)"
@click.stop="!isOptionDisabled(option) && selectOption(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)"
@mouseenter="focusedIndex = index" @mouseenter="handleOptionMouseEnter(option, index)"
:class="[ :class="[
'select-option', 'select-option',
isGroupHeaderOption(option) && 'select-option-group',
isSelected(option) && 'select-option-selected', isSelected(option) && 'select-option-selected',
isOptionDisabled(option) && 'select-option-disabled', isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
focusedIndex === index && 'select-option-focused' focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
]" ]"
> >
<slot name="option" :option="option" :selected="isSelected(option)"> <slot name="option" :option="option" :selected="isSelected(option)">
...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { ...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
return false return false
} }
const isGroupHeaderOption = (option: any): boolean => {
if (typeof option === 'object' && option !== null) {
return option.kind === 'group'
}
return false
}
const selectedOption = computed(() => { const selectedOption = computed(() => {
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
}) })
...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { ...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
return getOptionValue(option) === props.modelValue return getOptionValue(option) === props.modelValue
} }
const findNextEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex + offset) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const findPrevEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex - offset + opts.length) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const handleOptionMouseEnter = (option: any, index: number) => {
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
focusedIndex.value = index
}
// Update trigger rect periodically while open to follow scroll/resize // Update trigger rect periodically while open to follow scroll/resize
const updateTriggerRect = () => { const updateTriggerRect = () => {
if (containerRef.value) { if (containerRef.value) {
...@@ -259,8 +292,15 @@ watch(isOpen, (open) => { ...@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
if (open) { if (open) {
calculateDropdownPosition() calculateDropdownPosition()
// Reset focused index to current selection or first item // Reset focused index to current selection or first item
const selectedIdx = filteredOptions.value.findIndex(isSelected) if (filteredOptions.value.length === 0) {
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 focusedIndex.value = -1
} else {
const selectedIdx = filteredOptions.value.findIndex(isSelected)
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
? findNextEnabledIndex(initialIdx + 1)
: initialIdx
}
if (props.searchable) { if (props.searchable) {
nextTick(() => searchInputRef.value?.focus()) nextTick(() => searchInputRef.value?.focus())
...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { ...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
switch (e.key) { switch (e.key) {
case 'ArrowDown': case 'ArrowDown':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'ArrowUp': case 'ArrowUp':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'Enter': case 'Enter':
e.preventDefault() e.preventDefault()
...@@ -441,6 +481,17 @@ onUnmounted(() => { ...@@ -441,6 +481,17 @@ onUnmounted(() => {
@apply cursor-not-allowed opacity-40; @apply cursor-not-allowed opacity-40;
} }
.select-dropdown-portal .select-option-group {
@apply cursor-default select-none;
@apply bg-gray-50 dark:bg-dark-900;
@apply text-[11px] font-bold uppercase tracking-wider;
@apply text-gray-500 dark:text-gray-400;
}
.select-dropdown-portal .select-option-group:hover {
@apply bg-gray-50 dark:bg-dark-900;
}
.select-dropdown-portal .select-option-label { .select-dropdown-portal .select-option-label {
@apply flex-1 min-w-0 truncate text-left; @apply flex-1 min-w-0 truncate text-left;
} }
......
...@@ -144,10 +144,10 @@ ...@@ -144,10 +144,10 @@
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { computed, h, ref } from 'vue' import { computed, h, onMounted, ref, watch } from 'vue'
import { useRoute } from 'vue-router' import { useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n' import { useI18n } from 'vue-i18n'
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores' import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
import VersionBadge from '@/components/common/VersionBadge.vue' import VersionBadge from '@/components/common/VersionBadge.vue'
const { t } = useI18n() const { t } = useI18n()
...@@ -156,6 +156,7 @@ const route = useRoute() ...@@ -156,6 +156,7 @@ const route = useRoute()
const appStore = useAppStore() const appStore = useAppStore()
const authStore = useAuthStore() const authStore = useAuthStore()
const onboardingStore = useOnboardingStore() const onboardingStore = useOnboardingStore()
const adminSettingsStore = useAdminSettingsStore()
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed) const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
const mobileOpen = computed(() => appStore.mobileOpen) const mobileOpen = computed(() => appStore.mobileOpen)
...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => { ...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
const adminNavItems = computed(() => { const adminNavItems = computed(() => {
const baseItems = [ const baseItems = [
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon }, { path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
...(adminSettingsStore.opsMonitoringEnabled
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
: []),
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true }, { path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true }, { path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true }, { path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
...@@ -511,6 +515,23 @@ if ( ...@@ -511,6 +515,23 @@ if (
isDark.value = true isDark.value = true
document.documentElement.classList.add('dark') document.documentElement.classList.add('dark')
} }
// Fetch admin settings (for feature-gated nav items like Ops).
watch(
isAdmin,
(v) => {
if (v) {
adminSettingsStore.fetch()
}
},
{ immediate: true }
)
onMounted(() => {
if (isAdmin.value) {
adminSettingsStore.fetch()
}
})
</script> </script>
<style scoped> <style scoped>
......
...@@ -131,6 +131,7 @@ export default { ...@@ -131,6 +131,7 @@ export default {
noData: 'No data', noData: 'No data',
success: 'Success', success: 'Success',
error: 'Error', error: 'Error',
critical: 'Critical',
warning: 'Warning', warning: 'Warning',
info: 'Info', info: 'Info',
active: 'Active', active: 'Active',
...@@ -145,9 +146,11 @@ export default { ...@@ -145,9 +146,11 @@ export default {
copiedToClipboard: 'Copied to clipboard', copiedToClipboard: 'Copied to clipboard',
copyFailed: 'Failed to copy', copyFailed: 'Failed to copy',
contactSupport: 'Contact Support', contactSupport: 'Contact Support',
add: 'Add',
invalidEmail: 'Please enter a valid email address',
optional: 'optional', optional: 'optional',
selectOption: 'Select an option', selectOption: 'Select an option',
searchPlaceholder: 'Search...', searchPlaceholder: 'Search...',
noOptionsFound: 'No options found', noOptionsFound: 'No options found',
noGroupsAvailable: 'No groups available', noGroupsAvailable: 'No groups available',
unknownError: 'Unknown error occurred', unknownError: 'Unknown error occurred',
...@@ -178,6 +181,7 @@ export default { ...@@ -178,6 +181,7 @@ export default {
accounts: 'Accounts', accounts: 'Accounts',
proxies: 'Proxies', proxies: 'Proxies',
redeemCodes: 'Redeem Codes', redeemCodes: 'Redeem Codes',
ops: 'Ops',
promoCodes: 'Promo Codes', promoCodes: 'Promo Codes',
settings: 'Settings', settings: 'Settings',
myAccount: 'My Account', myAccount: 'My Account',
...@@ -1837,6 +1841,524 @@ export default { ...@@ -1837,6 +1841,524 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: 'Ops Monitoring',
description: 'Operational monitoring and troubleshooting',
// Dashboard
systemHealth: 'System Health',
overview: 'Overview',
noSystemMetrics: 'No system metrics collected yet.',
collectedAt: 'Collected at:',
window: 'window',
cpu: 'CPU',
memory: 'Memory',
db: 'DB',
redis: 'Redis',
goroutines: 'Goroutines',
jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active',
idle: 'idle',
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
lastError: 'last_error:',
noData: 'No data.',
loadingText: 'loading',
ready: 'ready',
requestsTotal: 'Requests (total)',
slaScope: 'SLA scope:',
tokens: 'Tokens',
tps: 'TPS:',
current: 'current',
peak: 'peak',
average: 'average',
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
errorCount: 'Error Count',
upstreamErrors: 'Upstream Errors',
errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
p95: 'p95:',
p99: 'p99:',
avg: 'avg:',
max: 'max:',
qps: 'QPS',
requests: 'Requests',
upstream: 'Upstream',
client: 'Client',
system: 'System',
other: 'Other',
errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
timeRange: {
'5m': 'Last 5 minutes',
'30m': 'Last 30 minutes',
'1h': 'Last 1 hour',
'6h': 'Last 6 hours',
'24h': 'Last 24 hours'
},
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
// Resource diagnostics
dbDown: 'Database connection failed',
dbDownImpact: 'All database operations will fail',
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
redisDown: 'Redis connection failed',
redisDownImpact: 'Cache functionality degraded, performance may decline',
redisDownAction: 'Check Redis service status and network connectivity',
cpuCritical: 'CPU usage critically high ({usage}%)',
cpuCriticalImpact: 'System response slowing, may affect all requests',
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
cpuHigh: 'CPU usage elevated ({usage}%)',
cpuHighImpact: 'System load is high, needs attention',
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
memoryCritical: 'Memory usage critically high ({usage}%)',
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
// SLA diagnostics
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
// Health score diagnostics
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthLowAction: 'Monitor metric trends, prevent issue escalation',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log
errorLog: {
timeId: 'Time / ID',
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
acc: 'ACC:',
details: 'Details',
phase: 'Phase'
},
// Error Details Modal
errorDetails: {
upstreamErrors: 'Upstream Errors',
requestErrors: 'Request Errors',
total: 'Total:',
searchPlaceholder: 'Search request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: 'Loading…',
requestId: 'Request ID',
time: 'Time',
phase: 'Phase',
status: 'Status',
message: 'Message',
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
timings: 'Timings',
auth: 'Auth',
routing: 'Routing',
upstream: 'Upstream',
response: 'Response',
retry: 'Retry',
retryClient: 'Retry (Client)',
retryUpstream: 'Retry (Upstream pinned)',
pinnedAccountId: 'Pinned account_id',
retryNotes: 'Retry Notes',
requestBody: 'Request Body',
errorBody: 'Error Body',
trimmed: 'trimmed',
confirmRetry: 'Confirm Retry',
retrySuccess: 'Retry succeeded',
retryFailed: 'Retry failed',
na: 'N/A',
retryHint: 'Retry will resend the request with the same parameters',
retryClientHint: 'Use client retry (no account pinning)',
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
pinnedAccountIdHint: '(auto from error log)',
retryNote1: 'Retry will use the same request body and parameters',
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
retryNote3: 'Client retry will reselect an account',
confirmRetryMessage: 'Confirm retry this request?',
confirmRetryHint: 'Will resend with the same request parameters'
},
requestDetails: {
title: 'Request Details',
details: 'Details',
rangeLabel: 'Window: {range}',
rangeMinutes: '{n} minutes',
rangeHours: '{n} hours',
empty: 'No requests in this window.',
emptyHint: 'Try a different time range or remove filters.',
failedToLoad: 'Failed to load request details',
requestIdCopied: 'Request ID copied',
copyFailed: 'Copy failed',
copy: 'Copy',
viewError: 'View Error',
kind: {
success: 'SUCCESS',
error: 'ERROR'
},
table: {
time: 'Time',
kind: 'Kind',
platform: 'Platform',
model: 'Model',
duration: 'Duration',
status: 'Status',
requestId: 'Request ID',
actions: 'Actions'
}
},
alertEvents: {
title: 'Alert Events',
description: 'Recent alert firing/resolution records (email-only)',
loading: 'Loading...',
empty: 'No alert events',
loadFailed: 'Failed to load alert events',
table: {
time: 'Time',
status: 'Status',
severity: 'Severity',
title: 'Title',
metric: 'Metric / Threshold',
email: 'Email Sent'
}
},
alertRules: {
title: 'Alert Rules',
description: 'Create and manage threshold-based system alerts (email-only)',
loading: 'Loading...',
empty: 'No alert rules',
loadFailed: 'Failed to load alert rules',
saveFailed: 'Failed to save alert rule',
deleteFailed: 'Failed to delete alert rule',
create: 'Create Rule',
createTitle: 'Create Alert Rule',
editTitle: 'Edit Alert Rule',
deleteConfirmTitle: 'Delete this rule?',
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
metricGroups: {
system: 'System Metrics',
group: 'Group-level Metrics (requires group_id)',
account: 'Account-level Metrics'
},
metrics: {
successRate: 'Success Rate (%)',
errorRate: 'Error Rate (%)',
upstreamErrorRate: 'Upstream Error Rate (%)',
p95: 'P95 Latency (ms)',
p99: 'P99 Latency (ms)',
cpu: 'CPU Usage (%)',
memory: 'Memory Usage (%)',
queueDepth: 'Concurrency Queue Depth',
groupAvailableAccounts: 'Group Available Accounts',
groupAvailableRatio: 'Group Available Ratio (%)',
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
accountRateLimitedCount: 'Rate-limited Accounts',
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
accountErrorRatio: 'Error Account Ratio (%)',
overloadAccountCount: 'Overloaded Accounts'
},
metricDescriptions: {
successRate: 'Percentage of successful requests in the window (0-100).',
errorRate: 'Percentage of failed requests in the window (0-100).',
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
p95: 'P95 request latency within the window (ms).',
p99: 'P99 request latency within the window (ms).',
cpu: 'Current instance CPU usage (0-100).',
memory: 'Current instance memory usage (0-100).',
queueDepth: 'Concurrency queue depth within the window (queued requests).',
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
accountErrorRatio: 'Error account ratio within the window (0-100).',
overloadAccountCount: 'Number of overloaded accounts within the window.'
},
hints: {
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
},
table: {
name: 'Name',
metric: 'Metric',
severity: 'Severity',
enabled: 'Enabled',
actions: 'Actions'
},
form: {
name: 'Name',
description: 'Description',
metric: 'Metric',
operator: 'Operator',
groupId: 'Group (group_id)',
groupPlaceholder: 'Select a group',
allGroups: 'All groups',
threshold: 'Threshold',
severity: 'Severity',
window: 'Window (minutes)',
sustained: 'Sustained (samples)',
cooldown: 'Cooldown (minutes)',
enabled: 'Enabled',
notifyEmail: 'Send email notifications'
},
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid rule',
nameRequired: 'Name is required',
metricRequired: 'Metric is required',
groupIdRequired: 'group_id is required for group-level metrics',
operatorRequired: 'Operator is required',
thresholdRequired: 'Threshold must be a number',
windowRange: 'Window must be one of: 1, 5, 60 minutes',
sustainedRange: 'Sustained must be between 1 and 1440 samples',
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
}
},
runtime: {
title: 'Ops Runtime Settings',
description: 'Stored in database; changes take effect without editing config files.',
loading: 'Loading...',
noData: 'No runtime settings available',
loadFailed: 'Failed to load runtime settings',
saveSuccess: 'Runtime settings saved',
saveFailed: 'Failed to save runtime settings',
alertTitle: 'Alert Evaluator',
groupAvailabilityTitle: 'Group Availability Monitor',
evalIntervalSeconds: 'Evaluation Interval (seconds)',
silencing: {
title: 'Alert Silencing (Maintenance Mode)',
enabled: 'Enable silencing',
globalUntil: 'Silence until (RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
reason: 'Reason',
reasonPlaceholder: 'e.g., planned maintenance',
entries: {
title: 'Advanced: targeted silencing',
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
add: 'Add Entry',
empty: 'No targeted entries',
entryTitle: 'Entry #{n}',
ruleId: 'Rule ID (optional)',
ruleIdPlaceholder: 'e.g., 1',
severities: 'Severities (optional)',
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
until: 'Until (RFC3339)',
reason: 'Reason',
validation: {
untilRequired: 'Entry until time is required',
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
ruleIdPositive: 'Entry rule_id must be a positive integer',
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
}
},
validation: {
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
}
},
lockEnabled: 'Distributed Lock Enabled',
lockKey: 'Distributed Lock Key',
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid settings',
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
}
},
email: {
title: 'Email Notification',
description: 'Configure alert/report email notifications (stored in database).',
loading: 'Loading...',
noData: 'No email notification config',
loadFailed: 'Failed to load email notification config',
saveSuccess: 'Email notification config saved',
saveFailed: 'Failed to save email notification config',
alertTitle: 'Alert Emails',
reportTitle: 'Report Emails',
recipients: 'Recipients',
recipientsHint: 'If empty, the system may fallback to the first admin email.',
minSeverity: 'Min Severity',
minSeverityAll: 'All severities',
rateLimitPerHour: 'Rate limit per hour',
batchWindowSeconds: 'Batch window (seconds)',
includeResolved: 'Include resolved alerts',
dailySummary: 'Daily summary',
weeklySummary: 'Weekly summary',
errorDigest: 'Error digest',
errorDigestMinCount: 'Min errors for digest',
accountHealth: 'Account health',
accountHealthThreshold: 'Error rate threshold (%)',
cronPlaceholder: 'Cron expression',
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid email notification config',
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
invalidRecipients: 'One or more recipient emails are invalid',
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
cronRequired: 'A cron expression is required when schedule is enabled',
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
}
},
concurrency: {
title: 'Concurrency / Queue',
byPlatform: 'By Platform',
byGroup: 'By Group',
byAccount: 'By Account',
totalRows: '{count} rows',
disabledHint: 'Realtime monitoring is disabled in settings.',
empty: 'No data',
queued: 'Queue {count}',
rateLimited: 'Rate-limited {count}',
errorAccounts: 'Errors {count}',
loadFailed: 'Failed to load concurrency data'
},
realtime: {
title: 'Realtime',
connected: 'Realtime connected',
connecting: 'Realtime connecting',
reconnecting: 'Realtime reconnecting',
offline: 'Realtime offline',
closed: 'Realtime closed',
reconnectIn: 'retry in {seconds}s'
},
queryMode: {
auto: 'Auto',
raw: 'Raw',
preagg: 'Preagg'
},
accountAvailability: {
available: 'Available',
unavailable: 'Unavailable',
accountError: 'Error'
},
tooltips: {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},
charts: {
emptyRequest: 'No requests in this window.',
emptyError: 'No errors in this window.',
resetZoom: 'Reset',
resetZoomHint: 'Reset zoom (if enabled)',
downloadChart: 'Download',
downloadChartHint: 'Download chart as image'
}
},
// Settings // Settings
settings: { settings: {
title: 'System Settings', title: 'System Settings',
...@@ -1951,6 +2473,22 @@ export default { ...@@ -1951,6 +2473,22 @@ export default {
sending: 'Sending...', sending: 'Sending...',
enterRecipientHint: 'Please enter a recipient email address' enterRecipientHint: 'Please enter a recipient email address'
}, },
opsMonitoring: {
title: 'Ops Monitoring',
description: 'Enable ops monitoring for troubleshooting and health visibility',
disabled: 'Ops monitoring is disabled',
enabled: 'Enable Ops Monitoring',
enabledHint: 'Enable the ops monitoring module (admin only)',
realtimeEnabled: 'Enable Realtime Monitoring',
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
queryMode: 'Default Query Mode',
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
},
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',
description: 'Global API key for external system integration with full admin access', description: 'Global API key for external system integration with full admin access',
......
...@@ -128,6 +128,7 @@ export default { ...@@ -128,6 +128,7 @@ export default {
noData: '暂无数据', noData: '暂无数据',
success: '成功', success: '成功',
error: '错误', error: '错误',
critical: '严重',
warning: '警告', warning: '警告',
info: '提示', info: '提示',
active: '启用', active: '启用',
...@@ -142,6 +143,8 @@ export default { ...@@ -142,6 +143,8 @@ export default {
copiedToClipboard: '已复制到剪贴板', copiedToClipboard: '已复制到剪贴板',
copyFailed: '复制失败', copyFailed: '复制失败',
contactSupport: '联系客服', contactSupport: '联系客服',
add: '添加',
invalidEmail: '请输入有效的邮箱地址',
optional: '可选', optional: '可选',
selectOption: '请选择', selectOption: '请选择',
searchPlaceholder: '搜索...', searchPlaceholder: '搜索...',
...@@ -151,6 +154,7 @@ export default { ...@@ -151,6 +154,7 @@ export default {
saving: '保存中...', saving: '保存中...',
selectedCount: '(已选 {count} 个)', selectedCount: '(已选 {count} 个)',
refresh: '刷新', refresh: '刷新',
settings: '设置',
notAvailable: '不可用', notAvailable: '不可用',
now: '现在', now: '现在',
unknown: '未知', unknown: '未知',
...@@ -176,6 +180,7 @@ export default { ...@@ -176,6 +180,7 @@ export default {
accounts: '账号管理', accounts: '账号管理',
proxies: 'IP管理', proxies: 'IP管理',
redeemCodes: '兑换码', redeemCodes: '兑换码',
ops: '运维监控',
promoCodes: '优惠码', promoCodes: '优惠码',
settings: '系统设置', settings: '系统设置',
myAccount: '我的账户', myAccount: '我的账户',
...@@ -1982,6 +1987,565 @@ export default { ...@@ -1982,6 +1987,565 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: '运维监控',
description: '运维监控与排障',
// Dashboard
systemHealth: '系统健康',
overview: '概览',
noSystemMetrics: '尚未收集系统指标。',
collectedAt: '采集时间:',
window: '窗口',
cpu: 'CPU',
memory: '内存',
db: '数据库',
redis: 'Redis',
goroutines: '协程',
jobs: '后台任务',
jobsHelp: '点击“明细”查看任务心跳与报错信息',
active: '活跃',
idle: '空闲',
waiting: '等待',
conns: '连接',
queue: '队列',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
lastError: '最近错误',
noData: '暂无数据',
loadingText: '加载中...',
ready: '就绪',
requestsTotal: '请求(总计)',
slaScope: 'SLA 范围:',
tokens: 'Token',
tps: 'TPS',
current: '当前',
peak: '峰值',
average: '平均',
totalRequests: '总请求',
avgQps: '平均 QPS',
avgTps: '平均 TPS',
avgLatency: '平均延迟',
avgTtft: '平均首字延迟',
exceptions: '异常数',
requestErrors: '请求错误',
errorCount: '错误数',
upstreamErrors: '上游错误',
errorCountExcl429529: '错误数(排除429/529)',
sla: 'SLA(排除业务限制)',
businessLimited: '业务限制:',
errors: '错误',
errorRate: '错误率:',
upstreamRate: '上游错误率:',
latencyDuration: '延迟(毫秒)',
ttftLabel: '首字延迟(毫秒)',
p50: 'p50',
p90: 'p90',
p95: 'p95',
p99: 'p99',
avg: 'avg',
max: 'max',
qps: 'QPS',
requests: '请求',
upstream: '上游',
client: '客户端',
system: '系统',
other: '其他',
errorsSla: '错误(SLA范围)',
upstreamExcl429529: '上游(排除429/529)',
failedToLoadData: '加载运维数据失败',
failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败',
retryFailed: '重试失败',
tpsK: 'TPS(千)',
top: '最高:',
throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布',
errorTrend: '错误趋势',
errorDistribution: '错误分布',
// Health Score & Diagnosis
health: '健康',
healthCondition: '健康状况',
healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分',
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
timeRange: {
'5m': '近5分钟',
'30m': '近30分钟',
'1h': '近1小时',
'6h': '近6小时',
'24h': '近24小时'
},
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',
idle: '系统当前处于待机状态',
idleImpact: '无活跃流量',
// Resource diagnostics
dbDown: '数据库连接失败',
dbDownImpact: '所有数据库操作将失败',
dbDownAction: '检查数据库服务状态、网络连接和连接配置',
redisDown: 'Redis连接失败',
redisDownImpact: '缓存功能降级,性能可能下降',
redisDownAction: '检查Redis服务状态和网络连接',
cpuCritical: 'CPU使用率严重过高 ({usage}%)',
cpuCriticalImpact: '系统响应变慢,可能影响所有请求',
cpuCriticalAction: '检查CPU密集型任务,考虑扩容或优化代码',
cpuHigh: 'CPU使用率偏高 ({usage}%)',
cpuHighImpact: '系统负载较高,需要关注',
cpuHighAction: '监控CPU趋势,准备扩容方案',
memoryCritical: '内存使用率严重过高 ({usage}%)',
memoryCriticalImpact: '可能触发OOM,系统稳定性受威胁',
memoryCriticalAction: '检查内存泄漏,考虑增加内存或优化内存使用',
memoryHigh: '内存使用率偏高 ({usage}%)',
memoryHighImpact: '内存压力较大,需要关注',
memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
// Latency diagnostics
latencyCritical: '响应延迟严重过高 ({latency}ms)',
latencyCriticalImpact: '用户体验极差,大量请求超时',
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
latencyHigh: '响应延迟偏高 ({latency}ms)',
latencyHighImpact: '用户体验下降,需要优化',
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
ttftHigh: '首字节时间偏高 ({ttft}ms)',
ttftHighImpact: '用户感知延迟增加',
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
// Error rate diagnostics
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
upstreamCriticalImpact: '可能影响大量用户请求',
upstreamCriticalAction: '检查上游服务健康状态,启用降级策略',
upstreamHigh: '上游错误率偏高 ({rate}%)',
upstreamHighImpact: '建议检查上游服务状态',
upstreamHighAction: '联系上游服务团队,准备降级方案',
errorHigh: '错误率过高 ({rate}%)',
errorHighImpact: '大量请求失败',
errorHighAction: '查看错误日志,定位错误根因,紧急修复',
errorElevated: '错误率偏高 ({rate}%)',
errorElevatedImpact: '建议检查错误日志',
errorElevatedAction: '分析错误类型和分布,制定修复计划',
// SLA diagnostics
slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损',
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量',
slaLowAction: '分析SLA下降原因,优化系统性能',
// Health score diagnostics
healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
healthLowAction: '监控指标趋势,预防问题恶化',
healthy: '所有系统指标正常',
healthyImpact: '服务运行稳定'
},
// Error Log
errorLog: {
timeId: '时间 / ID',
context: '上下文',
status: '状态码',
message: '消息',
latency: '延迟',
action: '操作',
noErrors: '该窗口内暂无错误。',
grp: 'GRP:',
acc: 'ACC:',
details: '详情',
phase: '阶段'
},
// Error Details Modal
errorDetails: {
upstreamErrors: '上游错误',
requestErrors: '请求错误',
total: '总计:',
searchPlaceholder: '搜索 request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: '加载中…',
requestId: '请求 ID',
time: '时间',
phase: '阶段',
status: '状态码',
message: '消息',
basicInfo: '基本信息',
platform: '平台',
model: '模型',
latency: '延迟',
ttft: 'TTFT',
businessLimited: '业务限制',
requestPath: '请求路径',
timings: '时序信息',
auth: '认证',
routing: '路由',
upstream: '上游',
response: '响应',
retry: '重试',
retryClient: '重试(客户端)',
retryUpstream: '重试(上游固定)',
pinnedAccountId: '固定 account_id',
retryNotes: '重试说明',
requestBody: '请求体',
errorBody: '错误体',
trimmed: '已截断',
confirmRetry: '确认重试',
retrySuccess: '重试成功',
retryFailed: '重试失败',
na: 'N/A',
retryHint: '重试将使用相同的请求参数重新发送请求',
retryClientHint: '使用客户端重试(不固定账号)',
retryUpstreamHint: '使用上游固定重试(固定到错误的账号)',
pinnedAccountIdHint: '(自动从错误日志获取)',
retryNote1: '重试会使用相同的请求体和参数',
retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败',
retryNote3: '客户端重试会重新选择账号',
confirmRetryMessage: '确认要重试该请求吗?',
confirmRetryHint: '将使用相同的请求参数重新发送'
},
requestDetails: {
title: '请求明细',
details: '明细',
rangeLabel: '窗口:{range}',
rangeMinutes: '{n} 分钟',
rangeHours: '{n} 小时',
empty: '该窗口内暂无请求。',
emptyHint: '可尝试调整时间范围或取消部分筛选。',
failedToLoad: '加载请求明细失败',
requestIdCopied: '请求ID已复制',
copyFailed: '复制失败',
copy: '复制',
viewError: '查看错误',
kind: {
success: '成功',
error: '失败'
},
table: {
time: '时间',
kind: '类型',
platform: '平台',
model: '模型',
duration: '耗时',
status: '状态码',
requestId: '请求ID',
actions: '操作'
}
},
alertEvents: {
title: '告警事件',
description: '最近的告警触发/恢复记录(仅邮件通知)',
loading: '加载中...',
empty: '暂无告警事件',
loadFailed: '加载告警事件失败',
table: {
time: '时间',
status: '状态',
severity: '级别',
title: '标题',
metric: '指标 / 阈值',
email: '邮件已发送'
}
},
alertRules: {
title: '告警规则',
description: '创建与管理系统阈值告警(仅邮件通知)',
loading: '加载中...',
empty: '暂无告警规则',
loadFailed: '加载告警规则失败',
saveSuccess: '告警规则保存成功',
saveFailed: '保存告警规则失败',
deleteSuccess: '告警规则删除成功',
deleteFailed: '删除告警规则失败',
create: '新建规则',
createTitle: '新建告警规则',
editTitle: '编辑告警规则',
deleteConfirmTitle: '确认删除该规则?',
deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?',
manage: '预警规则',
metricGroups: {
system: '系统指标',
group: '分组级别指标(需 group_id)',
account: '账号级别指标'
},
metrics: {
successRate: '成功率 (%)',
errorRate: '错误率 (%)',
upstreamErrorRate: '上游错误率 (%)',
p95: 'P95 延迟 (ms)',
p99: 'P99 延迟 (ms)',
cpu: 'CPU 使用率 (%)',
memory: '内存使用率 (%)',
queueDepth: '并发排队深度',
groupAvailableAccounts: '分组可用账号数',
groupAvailableRatio: '分组可用比例 (%)',
groupRateLimitRatio: '分组限流比例 (%)',
accountRateLimitedCount: '限流账号数',
accountErrorCount: '错误账号数(不含临时不可调度)',
accountErrorRatio: '错误账号比例 (%)',
overloadAccountCount: '过载账号数'
},
metricDescriptions: {
successRate: '统计窗口内成功请求占比(0~100)。',
errorRate: '统计窗口内失败请求占比(0~100)。',
upstreamErrorRate: '统计窗口内上游错误占比(0~100)。',
p95: '统计窗口内 P95 请求耗时(毫秒)。',
p99: '统计窗口内 P99 请求耗时(毫秒)。',
cpu: '当前实例 CPU 使用率(0~100)。',
memory: '当前实例内存使用率(0~100)。',
queueDepth: '统计窗口内并发队列排队深度(等待中的请求数)。',
groupAvailableAccounts: '指定分组中当前可用账号数量(需要 group_id 过滤)。',
groupAvailableRatio: '指定分组中可用账号占比(0~100,需要 group_id 过滤)。',
groupRateLimitRatio: '指定分组中账号被限流的比例(0~100,需要 group_id 过滤)。',
accountRateLimitedCount: '统计窗口内被限流的账号数量。',
accountErrorCount: '统计窗口内产生错误的账号数量(不含临时不可调度)。',
accountErrorRatio: '统计窗口内错误账号占比(0~100)。',
overloadAccountCount: '统计窗口内过载账号数量。'
},
hints: {
recommended: '推荐:运算符 {operator},阈值 {threshold}{unit}',
groupRequired: '该指标为分组级别指标,必须选择分组(group_id)。',
groupOptional: '可选:通过 group_id 将规则限定到某个分组。'
},
table: {
name: '名称',
metric: '指标',
severity: '级别',
enabled: '启用',
actions: '操作'
},
form: {
name: '名称',
description: '描述',
metric: '指标',
operator: '运算符',
groupId: '分组(group_id)',
groupPlaceholder: '请选择分组',
allGroups: '全部分组',
threshold: '阈值',
severity: '级别',
window: '统计窗口(分钟)',
sustained: '连续样本数(每分钟)',
cooldown: '冷却期(分钟)',
enabled: '启用',
notifyEmail: '发送邮件通知'
},
validation: {
title: '请先修正以下问题',
invalid: '规则不合法',
nameRequired: '名称不能为空',
metricRequired: '指标不能为空',
groupIdRequired: '分组级别指标必须指定 group_id',
operatorRequired: '运算符不能为空',
thresholdRequired: '阈值必须为数字',
windowRange: '统计窗口必须为 1 / 5 / 60 分钟之一',
sustainedRange: '连续样本数必须在 1 到 1440 之间',
cooldownRange: '冷却期必须在 0 到 1440 分钟之间'
}
},
runtime: {
title: '运维监控运行设置',
description: '配置存储在数据库中,无需修改 config 文件即可生效。',
loading: '加载中...',
noData: '暂无运行设置',
loadFailed: '加载运行设置失败',
saveSuccess: '运行设置已保存',
saveFailed: '保存运行设置失败',
alertTitle: '告警评估器',
groupAvailabilityTitle: '分组可用性监控',
evalIntervalSeconds: '评估间隔(秒)',
silencing: {
title: '告警静默(维护模式)',
enabled: '启用静默',
globalUntil: '静默截止时间(RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: '建议填写截止时间,避免忘记关闭静默。',
reason: '原因',
reasonPlaceholder: '例如:计划维护',
entries: {
title: '高级:定向静默',
hint: '可选:仅静默特定规则或特定级别。字段留空表示匹配全部。',
add: '新增条目',
empty: '暂无定向静默条目',
entryTitle: '条目 #{n}',
ruleId: '规则ID(可选)',
ruleIdPlaceholder: '例如:1',
severities: '级别(可选)',
severitiesPlaceholder: '例如:P0,P1(留空=全部)',
until: '截止时间(RFC3339)',
reason: '原因',
validation: {
untilRequired: '条目截止时间不能为空',
untilFormat: '条目截止时间必须为合法的 RFC3339 时间戳',
ruleIdPositive: '条目 rule_id 必须为正整数',
severitiesFormat: '条目级别必须为 P0..P3 的逗号分隔列表'
}
},
validation: {
timeFormat: '静默时间必须为合法的 RFC3339 时间戳'
}
},
lockEnabled: '启用分布式锁',
lockKey: '分布式锁 Key',
lockTTLSeconds: '分布式锁 TTL(秒)',
showAdvancedDeveloperSettings: '显示高级开发者设置(分布式锁)',
advancedSettingsSummary: '高级设置 (分布式锁)',
evalIntervalHint: '检测任务的执行频率,建议保持默认。',
validation: {
title: '请先修正以下问题',
invalid: '设置不合法',
evalIntervalRange: '评估间隔必须在 1 到 86400 秒之间',
lockKeyRequired: '启用分布式锁时必须填写 Lock Key',
lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头',
lockKeyHint: '建议以「{prefix}」开头以避免冲突',
lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间'
}
},
email: {
title: '邮件通知配置',
description: '配置告警/报告邮件通知(存储在数据库中)。',
loading: '加载中...',
noData: '暂无邮件通知配置',
loadFailed: '加载邮件通知配置失败',
saveSuccess: '邮件通知配置已保存',
saveFailed: '保存邮件通知配置失败',
alertTitle: '告警邮件',
reportTitle: '报告邮件',
recipients: '收件人',
recipientsHint: '若为空,系统可能会回退使用第一个管理员邮箱。',
minSeverity: '最低级别',
minSeverityAll: '全部级别',
rateLimitPerHour: '每小时限额',
batchWindowSeconds: '合并窗口(秒)',
includeResolved: '包含恢复通知',
dailySummary: '每日摘要',
weeklySummary: '每周摘要',
errorDigest: '错误摘要',
errorDigestMinCount: '错误摘要最小数量',
accountHealth: '账号健康报告',
accountHealthThreshold: '错误率阈值(%)',
cronPlaceholder: 'Cron 表达式',
reportHint: '发送时间使用 Cron 语法;留空将使用默认值。',
validation: {
title: '请先修正以下问题',
invalid: '邮件通知配置不合法',
alertRecipientsRequired: '已启用告警邮件,但未配置任何收件人',
reportRecipientsRequired: '已启用报告邮件,但未配置任何收件人',
invalidRecipients: '存在不合法的收件人邮箱',
rateLimitRange: '每小时限额必须为 ≥ 0 的数字',
batchWindowRange: '合并窗口必须在 0 到 86400 秒之间',
cronRequired: '启用定时任务时必须填写 Cron 表达式',
cronFormat: 'Cron 表达式格式可能不正确(至少应包含 5 段)',
digestMinCountRange: '错误摘要最小数量必须为 ≥ 0 的数字',
accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间'
}
},
settings: {
title: '运维监控设置',
loadFailed: '加载设置失败',
saveSuccess: '运维监控设置保存成功',
saveFailed: '保存设置失败',
dataCollection: '数据采集',
evaluationInterval: '评估间隔(秒)',
evaluationIntervalHint: '检测任务的执行频率,建议保持默认',
alertConfig: '预警配置',
enableAlert: '开启预警',
alertRecipients: '预警接收邮箱',
emailPlaceholder: '输入邮箱地址',
recipientsHint: '若为空,系统将使用第一个管理员邮箱作为默认收件人',
minSeverity: '最低级别',
reportConfig: '评估报告配置',
enableReport: '开启评估报告',
reportRecipients: '评估报告接收邮箱',
dailySummary: '每日摘要',
weeklySummary: '每周摘要',
advancedSettings: '高级设置',
dataRetention: '数据保留策略',
enableCleanup: '启用数据清理',
cleanupSchedule: '清理计划(Cron)',
cleanupScheduleHint: '例如:0 2 * * * 表示每天凌晨2点',
errorLogRetentionDays: '错误日志保留天数',
minuteMetricsRetentionDays: '分钟指标保留天数',
hourlyMetricsRetentionDays: '小时指标保留天数',
retentionDaysHint: '建议保留7-90天,过长会占用存储空间',
aggregation: '预聚合任务',
enableAggregation: '启用预聚合任务',
aggregationHint: '预聚合可提升长时间窗口查询性能',
validation: {
title: '请先修正以下问题',
retentionDaysRange: '保留天数必须在1-365天之间'
}
},
concurrency: {
title: '并发 / 排队',
byPlatform: '按平台',
byGroup: '按分组',
byAccount: '按账号',
totalRows: '共 {count} 项',
disabledHint: '已在设置中关闭实时监控。',
empty: '暂无数据',
queued: '队列 {count}',
rateLimited: '限流 {count}',
errorAccounts: '异常 {count}',
loadFailed: '加载并发数据失败'
},
realtime: {
title: '实时信息',
connected: '实时已连接',
connecting: '实时连接中',
reconnecting: '实时重连中',
offline: '实时离线',
closed: '实时已关闭',
reconnectIn: '重连 {seconds}s'
},
queryMode: {
auto: 'Auto(自动)',
raw: 'Raw(不聚合)',
preagg: 'Preagg(聚合)'
},
accountAvailability: {
available: '可用',
unavailable: '不可用',
accountError: '异常'
},
tooltips: {
totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
errorDistribution: '按状态码统计的错误分布。',
upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
redis: 'Redis 连接池状态,显示活跃和空闲的连接数。',
jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。',
qps: '每秒查询数(QPS)和每秒Token数(TPS),实时显示系统吞吐量。',
tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。',
health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。'
},
charts: {
emptyRequest: '该时间窗口内暂无请求。',
emptyError: '该时间窗口内暂无错误。',
resetZoom: '重置',
resetZoomHint: '重置缩放(若启用)',
downloadChart: '下载',
downloadChartHint: '下载图表图片'
}
},
// Settings // Settings
settings: { settings: {
title: '系统设置', title: '系统设置',
...@@ -2094,6 +2658,22 @@ export default { ...@@ -2094,6 +2658,22 @@ export default {
sending: '发送中...', sending: '发送中...',
enterRecipientHint: '请输入收件人邮箱地址' enterRecipientHint: '请输入收件人邮箱地址'
}, },
opsMonitoring: {
title: '运维监控',
description: '启用运维监控模块,用于排障与健康可视化',
disabled: '运维监控已关闭',
enabled: '启用运维监控',
enabledHint: '启用运维监控模块(仅管理员可见)',
realtimeEnabled: '启用实时监控',
realtimeEnabledHint: '启用实时请求速率和指标推送(WebSocket)',
queryMode: '默认查询模式',
queryModeHint: '运维监控默认查询模式(自动/原始/预聚合)',
queryModeAuto: '自动(推荐)',
queryModeRaw: '原始(最准确,但较慢)',
queryModePreagg: '预聚合(最快,需预聚合)',
metricsInterval: '采集频率(秒)',
metricsIntervalHint: '系统/请求指标采集频率(60-3600 秒)'
},
adminApiKey: { adminApiKey: {
title: '管理员 API Key', title: '管理员 API Key',
description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限', description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限',
......
...@@ -173,6 +173,18 @@ const routes: RouteRecordRaw[] = [ ...@@ -173,6 +173,18 @@ const routes: RouteRecordRaw[] = [
descriptionKey: 'admin.dashboard.description' descriptionKey: 'admin.dashboard.description'
} }
}, },
{
path: '/admin/ops',
name: 'AdminOps',
component: () => import('@/views/admin/ops/OpsDashboard.vue'),
meta: {
requiresAuth: true,
requiresAdmin: true,
title: 'Ops Monitoring',
titleKey: 'admin.ops.title',
descriptionKey: 'admin.ops.description'
}
},
{ {
path: '/admin/users', path: '/admin/users',
name: 'AdminUsers', name: 'AdminUsers',
......
import { defineStore } from 'pinia'
import { ref } from 'vue'
import { adminAPI } from '@/api'
export const useAdminSettingsStore = defineStore('adminSettings', () => {
const loaded = ref(false)
const loading = ref(false)
// Read a cached boolean flag from localStorage.
// Values are stored as the literal strings 'true' / 'false'; anything else
// (missing key, corrupt value, storage unavailable) yields the caller default.
const readCachedBool = (key: string, defaultValue: boolean): boolean => {
  try {
    switch (localStorage.getItem(key)) {
      case 'true':
        return true
      case 'false':
        return false
    }
  } catch {
    // localStorage can throw (SSR, private browsing, storage disabled).
  }
  return defaultValue
}
// Persist a boolean flag as 'true'/'false' so readCachedBool can round-trip it.
// Best-effort only: a failed write just means no cache on the next load.
const writeCachedBool = (key: string, value: boolean) => {
  const serialized = value ? 'true' : 'false'
  try {
    localStorage.setItem(key, serialized)
  } catch {
    // Ignore storage failures (quota exceeded, SSR, private browsing).
  }
}
// Read a cached string from localStorage.
// Only a non-empty string counts as a hit; a missing key, empty value, or
// unavailable storage all fall back to the caller-supplied default.
const readCachedString = (key: string, defaultValue: string): string => {
  let cached: string | null = null
  try {
    cached = localStorage.getItem(key)
  } catch {
    // localStorage can throw (SSR, private browsing, storage disabled).
  }
  return cached ? cached : defaultValue
}
// Persist a string value to localStorage, best-effort.
// A failed write only means the cache is cold on the next page load.
const writeCachedString = (key: string, value: string) => {
  try {
    localStorage.setItem(key, value)
  } catch {
    // Ignore storage failures (quota exceeded, SSR, private browsing).
  }
}
// Ops feature flags, seeded from the localStorage cache so the first paint
// matches the last known server state (reduces UI flicker before the async
// fetch() resolves). When no cache exists the defaults are "enabled" and
// query mode 'auto'.
const opsMonitoringEnabled = ref(readCachedBool('ops_monitoring_enabled_cached', true))
const opsRealtimeMonitoringEnabled = ref(readCachedBool('ops_realtime_monitoring_enabled_cached', true))
const opsQueryModeDefault = ref(readCachedString('ops_query_mode_default_cached', 'auto'))
/**
 * Load ops-related settings from the admin API and mirror each value into the
 * localStorage cache for the next page load.
 *
 * @param force - when true, refetch even after a prior successful load.
 *
 * Concurrent calls are coalesced via the `loading` flag (a second caller
 * returns immediately while a fetch is in flight).
 *
 * NOTE(review): this store action shadows the global `fetch` inside the
 * setup-store scope — confirm nothing in this closure needs window.fetch.
 * NOTE(review): on failure `loaded` is still set to true, so subsequent
 * fetch() calls without `force` will NOT retry; the UI keeps the
 * cached/default values. Confirm this "no auto-retry" behavior is intended.
 */
async function fetch(force = false): Promise<void> {
  if (loaded.value && !force) return
  if (loading.value) return
  loading.value = true
  try {
    const settings = await adminAPI.settings.getSettings()
    // Missing booleans default to enabled; missing/empty query mode → 'auto'.
    opsMonitoringEnabled.value = settings.ops_monitoring_enabled ?? true
    writeCachedBool('ops_monitoring_enabled_cached', opsMonitoringEnabled.value)
    opsRealtimeMonitoringEnabled.value = settings.ops_realtime_monitoring_enabled ?? true
    writeCachedBool('ops_realtime_monitoring_enabled_cached', opsRealtimeMonitoringEnabled.value)
    opsQueryModeDefault.value = settings.ops_query_mode_default || 'auto'
    writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
    loaded.value = true
  } catch (err) {
    // Keep cached/default value: do not "flip" the UI based on a transient fetch failure.
    loaded.value = true
    console.error('[adminSettings] Failed to fetch settings:', err)
  } finally {
    loading.value = false
  }
}
function setOpsMonitoringEnabledLocal(value: boolean) {
opsMonitoringEnabled.value = value
writeCachedBool('ops_monitoring_enabled_cached', value)
loaded.value = true
}
function setOpsRealtimeMonitoringEnabledLocal(value: boolean) {
opsRealtimeMonitoringEnabled.value = value
writeCachedBool('ops_realtime_monitoring_enabled_cached', value)
loaded.value = true
}
function setOpsQueryModeDefaultLocal(value: string) {
opsQueryModeDefault.value = value || 'auto'
writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
loaded.value = true
}
// Keep UI consistent if we learn that ops is disabled via feature-gated 404s.
// (event is dispatched from the axios interceptor)
let eventHandlerCleanup: (() => void) | null = null
function initializeEventListeners() {
if (eventHandlerCleanup) return
try {
const handler = () => {
setOpsMonitoringEnabledLocal(false)
}
window.addEventListener('ops-monitoring-disabled', handler)
eventHandlerCleanup = () => {
window.removeEventListener('ops-monitoring-disabled', handler)
}
} catch {
// ignore window access failures (SSR)
}
}
if (typeof window !== 'undefined') {
initializeEventListeners()
}
return {
loaded,
loading,
opsMonitoringEnabled,
opsRealtimeMonitoringEnabled,
opsQueryModeDefault,
fetch,
setOpsMonitoringEnabledLocal,
setOpsRealtimeMonitoringEnabledLocal,
setOpsQueryModeDefaultLocal
}
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment