Commit 7844dc4f authored by shaw's avatar shaw
Browse files

Merge PR #238: feat(ops): 实现完整的运维监控系统(vNext)

parents 2b2f7a6d c48795a9
...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc ...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
} }
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
if upstreamMsg != "" {
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
}
switch statusCode { switch statusCode {
case 401: case 401:
// 认证失败:停止调度,记录错误 // 认证失败:停止调度,记录错误
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") msg := "Authentication failed (401): invalid or expired credentials"
if upstreamMsg != "" {
msg = "Authentication failed (401): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 402: case 402:
// 支付要求:余额不足或计费问题,停止调度 // 支付要求:余额不足或计费问题,停止调度
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") msg := "Payment required (402): insufficient balance or billing issue"
if upstreamMsg != "" {
msg = "Payment required (402): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 403: case 403:
// 禁止访问:停止调度,记录错误 // 禁止访问:停止调度,记录错误
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") msg := "Access forbidden (403): account may be suspended or lack permissions"
if upstreamMsg != "" {
msg = "Access forbidden (403): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 429: case 429:
s.handle429(ctx, account, headers) s.handle429(ctx, account, headers)
......
...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
// Ops monitoring (vNext)
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
err := s.settingRepo.SetMultiple(ctx, updates) err := s.settingRepo.SetMultiple(ctx, updates)
if err == nil && s.onUpdate != nil { if err == nil && s.onUpdate != nil {
s.onUpdate() // Invalidate cache after settings update s.onUpdate() // Invalidate cache after settings update
...@@ -219,8 +227,8 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -219,8 +227,8 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
func (s *SettingService) IsRegistrationEnabled(ctx context.Context) bool { func (s *SettingService) IsRegistrationEnabled(ctx context.Context) bool {
value, err := s.settingRepo.GetValue(ctx, SettingKeyRegistrationEnabled) value, err := s.settingRepo.GetValue(ctx, SettingKeyRegistrationEnabled)
if err != nil { if err != nil {
// 安全默认:如果设置不存在或查询出错,默认关闭注册 // 默认开放注册
return false return true
} }
return value == "true" return value == "true"
} }
...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { ...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults // Identity patch defaults
SettingKeyEnableIdentityPatch: "true", SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "", SettingKeyIdentityPatchPrompt: "",
// Ops monitoring defaults (vNext)
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin ...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
} }
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
return result // Ops monitoring settings (default: enabled, fail-open)
} result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
// result.OpsMetricsIntervalSeconds = 60
// 优先级: if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 if v, err := strconv.Atoi(raw); err == nil {
// - 否则回退到 config.yaml/env 的值 if v < 60 {
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { v = 60
if s == nil || s.cfg == nil { }
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") if v > 3600 {
} v = 3600
}
effective := s.cfg.LinuxDo result.OpsMetricsIntervalSeconds = v
}
keys := []string{
SettingKeyLinuxDoConnectEnabled,
SettingKeyLinuxDoConnectClientID,
SettingKeyLinuxDoConnectClientSecret,
SettingKeyLinuxDoConnectRedirectURL,
}
settings, err := s.settingRepo.GetMultiple(ctx, keys)
if err != nil {
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
}
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
effective.Enabled = raw == "true"
}
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
effective.ClientID = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
effective.ClientSecret = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
effective.RedirectURL = strings.TrimSpace(v)
}
if !effective.Enabled {
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
}
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
if strings.TrimSpace(effective.ClientID) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
}
if strings.TrimSpace(effective.AuthorizeURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
}
if strings.TrimSpace(effective.TokenURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
}
if strings.TrimSpace(effective.UserInfoURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
}
if strings.TrimSpace(effective.RedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
}
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
} }
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { return result
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") }
}
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
}
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
}
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) func isFalseSettingValue(value string) bool {
switch method { switch strings.ToLower(strings.TrimSpace(value)) {
case "", "client_secret_post", "client_secret_basic": case "false", "0", "off", "disabled":
if strings.TrimSpace(effective.ClientSecret) == "" { return true
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
}
case "none":
if !effective.UsePKCE {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
}
default: default:
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") return false
} }
return effective, nil
} }
// getStringOrDefault 获取字符串值或默认值 // getStringOrDefault 获取字符串值或默认值
...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) ...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
} }
return value return value
} }
// GetLinuxDoConnectOAuthConfig returns the "effective" LinuxDo Connect OAuth
// configuration used for login.
//
// Precedence:
//   - if the corresponding system-setting key exists, it overrides the
//     config.yaml/env value
//   - otherwise the config.yaml/env value is used
//
// It returns OAUTH_DISABLED when the feature is off, and OAUTH_CONFIG_INVALID
// when any required field is missing or malformed, so callers never start an
// OAuth flow that is bound to fail.
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
	if s == nil || s.cfg == nil {
		return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
	}

	// Start from the static (file/env) configuration, then layer DB overrides on top.
	effective := s.cfg.LinuxDo

	settings, err := s.settingRepo.GetMultiple(ctx, []string{
		SettingKeyLinuxDoConnectEnabled,
		SettingKeyLinuxDoConnectClientID,
		SettingKeyLinuxDoConnectClientSecret,
		SettingKeyLinuxDoConnectRedirectURL,
	})
	if err != nil {
		return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
	}

	if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
		effective.Enabled = raw == "true"
	}
	// Non-empty DB values override file/env values (whitespace-only is ignored).
	overrideIfSet := func(dst *string, key string) {
		if v, ok := settings[key]; ok && strings.TrimSpace(v) != "" {
			*dst = strings.TrimSpace(v)
		}
	}
	overrideIfSet(&effective.ClientID, SettingKeyLinuxDoConnectClientID)
	overrideIfSet(&effective.ClientSecret, SettingKeyLinuxDoConnectClientSecret)
	overrideIfSet(&effective.RedirectURL, SettingKeyLinuxDoConnectRedirectURL)

	if !effective.Enabled {
		return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
	}

	invalid := func(msg string) (config.LinuxDoConnectConfig, error) {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", msg)
	}

	// Basic robustness checks so users are never redirected into an OAuth flow
	// that is guaranteed to fail or is unsafe. Check order is significant: the
	// first failing check determines the reported message.
	requiredFields := []struct {
		value string
		msg   string
	}{
		{effective.ClientID, "oauth client id not configured"},
		{effective.AuthorizeURL, "oauth authorize url not configured"},
		{effective.TokenURL, "oauth token url not configured"},
		{effective.UserInfoURL, "oauth userinfo url not configured"},
		{effective.RedirectURL, "oauth redirect url not configured"},
		{effective.FrontendRedirectURL, "oauth frontend redirect url not configured"},
	}
	for _, f := range requiredFields {
		if strings.TrimSpace(f.value) == "" {
			return invalid(f.msg)
		}
	}

	urlChecks := []struct {
		validate func(string) error
		value    string
		msg      string
	}{
		{config.ValidateAbsoluteHTTPURL, effective.AuthorizeURL, "oauth authorize url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.TokenURL, "oauth token url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.UserInfoURL, "oauth userinfo url invalid"},
		{config.ValidateAbsoluteHTTPURL, effective.RedirectURL, "oauth redirect url invalid"},
		{config.ValidateFrontendRedirectURL, effective.FrontendRedirectURL, "oauth frontend redirect url invalid"},
	}
	for _, c := range urlChecks {
		if err := c.validate(c.value); err != nil {
			return invalid(c.msg)
		}
	}

	// Token-endpoint auth method: secret-based methods require a client secret;
	// "none" is only acceptable when PKCE is enabled.
	switch strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) {
	case "", "client_secret_post", "client_secret_basic":
		if strings.TrimSpace(effective.ClientSecret) == "" {
			return invalid("oauth client secret not configured")
		}
	case "none":
		if !effective.UsePKCE {
			return invalid("oauth pkce must be enabled when token_auth_method=none")
		}
	default:
		return invalid("oauth token_auth_method invalid")
	}

	return effective, nil
}
...@@ -46,6 +46,12 @@ type SystemSettings struct { ...@@ -46,6 +46,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"` EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {
......
package service package service
import ( import (
"database/sql"
"time" "time"
"github.com/Wei-Shaw/sub2api/internal/config" "github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire" "github.com/google/wire"
"github.com/redis/go-redis/v9"
) )
// BuildInfo contains build information // BuildInfo contains build information
...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi ...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc return svc
} }
// ProvideOpsMetricsCollector builds the OpsMetricsCollector and launches its
// background collection loop before handing it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	c := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
	c.Start()
	return c
}
// ProvideOpsAggregationService builds the hourly/daily pre-aggregation service
// and starts its background loop before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	aggregator := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	aggregator.Start()
	return aggregator
}
// ProvideOpsAlertEvaluatorService builds the alert-rule evaluator and starts
// its background loop before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	evaluator := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	evaluator.Start()
	return evaluator
}
// ProvideOpsCleanupService builds the cron-scheduled ops cleanup service and
// starts it before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	cleaner := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	cleaner.Start()
	return cleaner
}
// ProvideOpsScheduledReportService builds the scheduled ops report service and
// starts it before returning it to the injector.
// NOTE(review): Start() runs at construction time; no Stop wiring is visible
// here — confirm graceful shutdown is handled elsewhere.
func ProvideOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	reporter := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
	reporter.Start()
	return reporter
}
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
return apiKeyService return apiKeyService
...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( ...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService, NewAccountUsageService,
NewAccountTestService, NewAccountTestService,
NewSettingService, NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
ProvideOpsScheduledReportService,
NewEmailService, NewEmailService,
ProvideEmailQueueService, ProvideEmailQueueService,
NewTurnstileService, NewTurnstileService,
......
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
-- NOTE(review): SET LOCAL only takes effect inside a transaction block;
-- assumes the migration runner wraps each migration in a transaction — confirm.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
CREATE TABLE IF NOT EXISTS ops_error_logs (
id BIGSERIAL PRIMARY KEY,
-- Correlation / identities (no FKs by design, see header note)
request_id VARCHAR(64),
client_request_id VARCHAR(64),
user_id BIGINT,
api_key_id BIGINT,
account_id BIGINT,
group_id BIGINT,
client_ip inet,
-- Dimensions for global filtering
platform VARCHAR(32),
-- Request metadata
model VARCHAR(100),
request_path VARCHAR(256),
stream BOOLEAN NOT NULL DEFAULT false,
user_agent TEXT,
-- Core error classification
error_phase VARCHAR(32) NOT NULL,
error_type VARCHAR(64) NOT NULL,
severity VARCHAR(8) NOT NULL DEFAULT 'P2', -- priority bucket; presumably P0..P3 — confirm with ingest code
status_code INT,
-- vNext metric semantics
is_business_limited BOOLEAN NOT NULL DEFAULT false,
-- Error details (sanitized/truncated at ingest time)
error_message TEXT,
error_body TEXT,
-- Provider/upstream details (optional; useful for trends & account health)
error_source VARCHAR(64),
error_owner VARCHAR(32),
account_status VARCHAR(50),
upstream_status_code INT,
upstream_error_message TEXT,
upstream_error_detail TEXT,
provider_error_code VARCHAR(64),
provider_error_type VARCHAR(64),
network_error_type VARCHAR(50),
retry_after_seconds INT,
-- Timings (ms) - optional
duration_ms INT,
time_to_first_token_ms BIGINT,
auth_latency_ms BIGINT,
routing_latency_ms BIGINT,
upstream_latency_ms BIGINT,
response_latency_ms BIGINT,
-- Retry context (only stored for error requests)
request_body JSONB,
request_headers JSONB,
request_body_truncated BOOLEAN NOT NULL DEFAULT false,
request_body_bytes INT,
-- Retryability flags (best-effort classification)
is_retryable BOOLEAN NOT NULL DEFAULT false,
retry_count INT NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
requested_by_user_id BIGINT,
-- presumably references ops_error_logs.id (no FK by design) — confirm
source_error_id BIGINT,
-- client|upstream
mode VARCHAR(16) NOT NULL,
-- only meaningful for mode=upstream (pin retry to a specific account) — confirm
pinned_account_id BIGINT,
-- queued|running|succeeded|failed
status VARCHAR(16) NOT NULL DEFAULT 'queued',
started_at TIMESTAMPTZ,
finished_at TIMESTAMPTZ,
duration_ms BIGINT,
-- Optional result correlation
result_request_id VARCHAR(64),
result_error_id BIGINT,
result_usage_request_id VARCHAR(64),
error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
CREATE TABLE IF NOT EXISTS ops_system_metrics (
id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Width of the aggregation window for this snapshot row
window_minutes INT NOT NULL DEFAULT 1,
-- Optional dimensions (only if collector chooses to write per-dimension snapshots)
platform VARCHAR(32),
group_id BIGINT,
-- Core counts
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Rates (presumably requests/sec and tokens/sec over the window — confirm with collector)
qps DOUBLE PRECISION,
tps DOUBLE PRECISION,
-- Duration percentiles (ms) - success requests
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
duration_avg_ms DOUBLE PRECISION,
duration_max_ms INT,
-- TTFT (time to first token) percentiles (ms) - success requests (streaming)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
ttft_avg_ms DOUBLE PRECISION,
ttft_max_ms INT,
-- System resources
cpu_usage_percent DOUBLE PRECISION,
memory_used_mb BIGINT,
memory_total_mb BIGINT,
memory_usage_percent DOUBLE PRECISION,
-- Dependency health (best-effort)
db_ok BOOLEAN,
redis_ok BOOLEAN,
-- DB pool & runtime
db_conn_active INT,
db_conn_idle INT,
db_conn_waiting INT,
goroutine_count INT,
-- Queue / concurrency
concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per background job, keyed by job_name (presumably upserted by the
-- job itself on each run — confirm with the writer code).
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
job_name VARCHAR(64) PRIMARY KEY,
last_run_at TIMESTAMPTZ,
last_success_at TIMESTAMPTZ,
last_error_at TIMESTAMPTZ,
last_error TEXT,
last_duration_ms BIGINT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
CREATE TABLE IF NOT EXISTS ops_alert_rules (
id BIGSERIAL PRIMARY KEY,
name VARCHAR(128) NOT NULL,
description TEXT,
enabled BOOLEAN NOT NULL DEFAULT true,
severity VARCHAR(16) NOT NULL DEFAULT 'warning',
-- Metric definition
metric_type VARCHAR(64) NOT NULL,
operator VARCHAR(8) NOT NULL,
threshold DOUBLE PRECISION NOT NULL,
window_minutes INT NOT NULL DEFAULT 5,
sustained_minutes INT NOT NULL DEFAULT 5,
cooldown_minutes INT NOT NULL DEFAULT 10,
-- Optional scoping: platform/group filters etc.
filters JSONB,
last_triggered_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Rule names are unique so rules can be addressed/seeded by name.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
ON ops_alert_rules (enabled);
CREATE TABLE IF NOT EXISTS ops_alert_events (
id BIGSERIAL PRIMARY KEY,
-- presumably references ops_alert_rules.id (no FK by design) — confirm
rule_id BIGINT,
severity VARCHAR(16) NOT NULL,
-- 'firing' until resolved_at is set — confirm value set with evaluator code
status VARCHAR(16) NOT NULL DEFAULT 'firing',
title VARCHAR(200),
description TEXT,
metric_value DOUBLE PRECISION,
threshold_value DOUBLE PRECISION,
dimensions JSONB,
fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
resolved_at TIMESTAMPTZ,
email_sent BOOLEAN NOT NULL DEFAULT false,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
id BIGSERIAL PRIMARY KEY,
bucket_start TIMESTAMPTZ NOT NULL,
-- NULL platform + NULL group_id = "overall" row; otherwise per-dimension row
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Duration percentiles (ms)
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
-- TTFT percentiles (ms)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three "dimension modes" (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
ON ops_metrics_hourly (
bucket_start,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
ON ops_metrics_hourly (bucket_start DESC);
-- Partial indexes keep the per-dimension scans small and skip overall rows.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
ON ops_metrics_hourly (platform, bucket_start DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
ON ops_metrics_hourly (group_id, bucket_start DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same layout and dimension-mode conventions as ops_metrics_hourly, bucketed by date.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
id BIGSERIAL PRIMARY KEY,
bucket_date DATE NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- COALESCE-based uniqueness: UNIQUE with NULLs would allow duplicate dimension rows.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
ON ops_metrics_daily (
bucket_date,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
ON ops_metrics_daily (platform, bucket_date DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
ON ops_metrics_daily (group_id, bucket_date DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
-- NOTE(review): SET LOCAL only has effect inside a transaction block — confirm the
-- migration runner wraps each file in BEGIN/COMMIT, otherwise these are no-ops.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- ops_error_logs: one index per common dashboard filter, always paired with
-- created_at DESC to serve "most recent first" listings.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
    ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
    ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
    ON ops_error_logs (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
    ON ops_error_logs (account_id, created_at DESC)
    WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
    ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
    ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
    ON ops_error_logs (error_type, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
    ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
    ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
    ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
    ON ops_system_metrics (platform, created_at DESC)
    WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
    ON ops_system_metrics (group_id, created_at DESC)
    WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
    ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
    ON ops_retry_attempts (source_error_id, created_at DESC)
    WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
-- The partial predicate means at most one non-terminal attempt may exist per source error.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
    ON ops_retry_attempts (source_error_id)
    WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Best-effort: swallow CREATE EXTENSION failures, then only build trigram GIN
-- indexes if the extension actually ended up installed.
DO $$
BEGIN
    BEGIN
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
    EXCEPTION WHEN OTHERS THEN
        -- Missing privileges or extension package should not block migrations.
        RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
    END;
    IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
        -- request_id / client_request_id fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
            ON ops_error_logs USING gin (request_id gin_trgm_ops)';
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
            ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
        -- error_message fuzzy search
        EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
            ON ops_error_logs USING gin (error_message gin_trgm_ops)';
    END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
--   it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table (avg stored as DOUBLE PRECISION; max as INT milliseconds).
ALTER TABLE ops_metrics_hourly
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table (same shape as the hourly additions above).
ALTER TABLE ops_metrics_daily
    ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS duration_max_ms INT,
    ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Default true so existing rules keep emailing after the upgrade.
ALTER TABLE ops_alert_rules
    ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
--   - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
--   - upstream_error_rate excludes 429/529.
-- NOTE(review): ON CONFLICT (name) requires a unique constraint/index on
-- ops_alert_rules.name — confirm it exists in the table's DDL.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- 1) High error rate (P1): error_rate > 5% over a 5-minute window, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率过高',
    '当错误率超过 5% 且持续 5 分钟时触发告警',
    true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 2) Low success rate (P0): success_rate < 95% over 5 minutes, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '成功率过低',
    '当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
    true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 3) P99 latency too high (P2): p99 > 3000ms, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P99延迟过高',
    '当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
    true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 4) P95 latency too high (P2): p95 > 2000ms, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'P95延迟过高',
    '当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
    true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 5) CPU usage too high (P2): cpu > 85%, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    'CPU使用率过高',
    '当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
    true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 6) Memory usage too high (P1): memory > 90%, sustained 10 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '内存使用率过高',
    '当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
    true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 7) Concurrency queue buildup (P1): queue depth > 100, sustained 5 minutes.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '并发队列积压',
    '当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
    true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 8) Extremely high error rate (P0): error_rate > 20% over a 1-minute window.
INSERT INTO ops_alert_rules (
    name, description, enabled, metric_type, operator, threshold,
    window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
    created_at, updated_at
) VALUES (
    '错误率极高',
    '当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
    true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
-- This migration is intentionally idempotent.
ALTER TABLE ops_system_metrics
    ADD COLUMN IF NOT EXISTS redis_conn_total INT,
    ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
--
-- This is intentionally idempotent.
-- Nullable JSONB: absent for requests without upstream errors.
ALTER TABLE ops_error_logs
    ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
    'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,41 @@ redis: ...@@ -302,6 +302,41 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Hard switch: disable all ops background jobs and APIs when false
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
enabled: true
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
# 优先使用预聚合表(用于长时间窗口查询性能)
use_preaggregated_tables: false
# Data cleanup configuration
# 数据清理配置(vNext 默认统一保留 30 天)
cleanup:
enabled: true
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
schedule: "0 2 * * *"
error_log_retention_days: 30
minute_metrics_retention_days: 30
hourly_metrics_retention_days: 30
# Pre-aggregation configuration
# 预聚合任务配置
aggregation:
enabled: true
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
metrics_collector_cache:
enabled: true
ttl: 65s
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= ...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
GEMINI_QUOTA_POLICY= GEMINI_QUOTA_POLICY=
# -----------------------------------------------------------------------------
# Ops Monitoring Configuration (运维监控配置)
# -----------------------------------------------------------------------------
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
OPS_ENABLED=true
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Update Configuration (在线更新配置) # Update Configuration (在线更新配置)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
......
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,19 @@ redis: ...@@ -302,6 +302,19 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
enabled: true
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -17,6 +17,7 @@ import usageAPI from './usage' ...@@ -17,6 +17,7 @@ import usageAPI from './usage'
import geminiAPI from './gemini' import geminiAPI from './gemini'
import antigravityAPI from './antigravity' import antigravityAPI from './antigravity'
import userAttributesAPI from './userAttributes' import userAttributesAPI from './userAttributes'
import opsAPI from './ops'
/** /**
* Unified admin API object for convenient access * Unified admin API object for convenient access
...@@ -35,7 +36,8 @@ export const adminAPI = { ...@@ -35,7 +36,8 @@ export const adminAPI = {
usage: usageAPI, usage: usageAPI,
gemini: geminiAPI, gemini: geminiAPI,
antigravity: antigravityAPI, antigravity: antigravityAPI,
userAttributes: userAttributesAPI userAttributes: userAttributesAPI,
ops: opsAPI
} }
export { export {
...@@ -52,7 +54,8 @@ export { ...@@ -52,7 +54,8 @@ export {
usageAPI, usageAPI,
geminiAPI, geminiAPI,
antigravityAPI, antigravityAPI,
userAttributesAPI userAttributesAPI,
opsAPI
} }
export default adminAPI export default adminAPI
/**
* Admin Ops API endpoints (vNext)
* - Error logs list/detail + retry (client/upstream)
* - Dashboard overview (raw path)
*/
import { apiClient } from '../client'
import type { PaginatedResponse } from '@/types'
// Retry execution mode: 'client' replays via the gateway path, 'upstream' targets the upstream directly.
export type OpsRetryMode = 'client' | 'upstream'
// Dashboard query mode: 'auto' lets the server choose between raw scans and pre-aggregated tables.
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
// Common per-request options (currently only cancellation support).
export interface OpsRequestOptions {
  signal?: AbortSignal
}
// Request body for POSTing a retry; optionally pin the retry to a specific account.
export interface OpsRetryRequest {
  mode: OpsRetryMode
  pinned_account_id?: number
}
// Server response describing a single retry attempt and its outcome.
export interface OpsRetryResult {
  attempt_id: number
  mode: OpsRetryMode
  // Open string union: server may add statuses beyond the known three.
  status: 'running' | 'succeeded' | 'failed' | string
  pinned_account_id?: number | null
  used_account_id?: number | null
  http_status_code: number
  upstream_request_id: string
  // Truncated response body preview; response_truncated marks whether it was cut.
  response_preview: string
  response_truncated: boolean
  error_message: string
  started_at: string
  finished_at: string
  duration_ms: number
}
// Aggregated dashboard overview for a time window, optionally scoped by platform/group.
export interface OpsDashboardOverview {
  start_time: string
  end_time: string
  platform: string
  group_id?: number | null
  health_score?: number
  system_metrics?: OpsSystemMetricsSnapshot | null
  job_heartbeats?: OpsJobHeartbeat[] | null
  success_count: number
  error_count_total: number
  // Business-limited errors are tracked separately and excluded from SLA-scope counts.
  business_limited_count: number
  error_count_sla: number
  request_count_total: number
  request_count_sla: number
  token_consumed: number
  sla: number
  error_rate: number
  // Upstream error rate excludes 429/529, which are counted separately below.
  upstream_error_rate: number
  upstream_error_count_excl_429_529: number
  upstream_429_count: number
  upstream_529_count: number
  qps: {
    current: number
    peak: number
    avg: number
  }
  tps: {
    current: number
    peak: number
    avg: number
  }
  duration: OpsPercentiles
  ttft: OpsPercentiles
}
// Latency summary in milliseconds; fields are null/absent when no data exists for the window.
export interface OpsPercentiles {
  p50_ms?: number | null
  p90_ms?: number | null
  p95_ms?: number | null
  p99_ms?: number | null
  avg_ms?: number | null
  max_ms?: number | null
}
// One time-bucketed throughput sample (requests, tokens, derived QPS/TPS).
export interface OpsThroughputTrendPoint {
  bucket_start: string
  request_count: number
  token_consumed: number
  qps: number
  tps: number
}
// Per-platform throughput breakdown for the selected window.
export interface OpsThroughputPlatformBreakdownItem {
  platform: string
  request_count: number
  token_consumed: number
}
// Per-group throughput breakdown for the selected window.
export interface OpsThroughputGroupBreakdownItem {
  group_id: number
  group_name: string
  request_count: number
  token_consumed: number
}
// Throughput trend: `bucket` names the bucket granularity; breakdowns are optional.
export interface OpsThroughputTrendResponse {
  bucket: string
  points: OpsThroughputTrendPoint[]
  by_platform?: OpsThroughputPlatformBreakdownItem[]
  top_groups?: OpsThroughputGroupBreakdownItem[]
}
// A single request record is either a success or an error row.
export type OpsRequestKind = 'success' | 'error'
// Filter value for listings: either kind, or both.
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
// Supported sort orders for the request-details listing.
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
// One row in the request-details listing; error rows carry error_id/phase/severity/message.
export interface OpsRequestDetail {
  kind: OpsRequestKind
  created_at: string
  request_id: string
  platform?: string
  model?: string
  duration_ms?: number | null
  status_code?: number | null
  error_id?: number | null
  phase?: string
  severity?: string
  message?: string
  user_id?: number | null
  api_key_id?: number | null
  account_id?: number | null
  group_id?: number | null
  stream?: boolean
}
// Query parameters for the request-details listing.
// Either time_range or explicit start_time/end_time selects the window.
export interface OpsRequestDetailsParams {
  time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
  start_time?: string
  end_time?: string
  kind?: OpsRequestDetailsKind
  platform?: string
  group_id?: number | null
  user_id?: number
  api_key_id?: number
  account_id?: number
  model?: string
  request_id?: string
  // Free-text search term.
  q?: string
  min_duration_ms?: number
  max_duration_ms?: number
  sort?: OpsRequestDetailsSort
  page?: number
  page_size?: number
}
// Paginated request-details response.
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
// One histogram bar: human-readable latency range label plus its request count.
export interface OpsLatencyHistogramBucket {
  range: string
  count: number
}
// Latency histogram for a window, optionally scoped by platform/group.
export interface OpsLatencyHistogramResponse {
  start_time: string
  end_time: string
  platform: string
  group_id?: number | null
  total_requests: number
  buckets: OpsLatencyHistogramBucket[]
}
// One time-bucketed error sample, split by classification (SLA vs business-limited, upstream buckets).
export interface OpsErrorTrendPoint {
  bucket_start: string
  error_count_total: number
  business_limited_count: number
  error_count_sla: number
  upstream_error_count_excl_429_529: number
  upstream_429_count: number
  upstream_529_count: number
}
// Error trend: `bucket` names the bucket granularity.
export interface OpsErrorTrendResponse {
  bucket: string
  points: OpsErrorTrendPoint[]
}
// Error distribution by HTTP status code; `sla` and `business_limited` partition `total`.
export interface OpsErrorDistributionItem {
  status_code: number
  total: number
  sla: number
  business_limited: number
}
export interface OpsErrorDistributionResponse {
  total: number
  items: OpsErrorDistributionItem[]
}
// Point-in-time system health snapshot (host, DB, Redis, runtime counters).
// Most fields are nullable because collection of each metric is best-effort.
export interface OpsSystemMetricsSnapshot {
  id: number
  created_at: string
  window_minutes: number
  cpu_usage_percent?: number | null
  memory_used_mb?: number | null
  memory_total_mb?: number | null
  memory_usage_percent?: number | null
  db_ok?: boolean | null
  redis_ok?: boolean | null
  // Config-derived limits (best-effort) for rendering "current vs max".
  db_max_open_conns?: number | null
  redis_pool_size?: number | null
  redis_conn_total?: number | null
  redis_conn_idle?: number | null
  db_conn_active?: number | null
  db_conn_idle?: number | null
  db_conn_waiting?: number | null
  goroutine_count?: number | null
  concurrency_queue_depth?: number | null
}
// Last-run bookkeeping for a background ops job (cleanup, aggregation, etc.).
export interface OpsJobHeartbeat {
  job_name: string
  last_run_at?: string | null
  last_success_at?: string | null
  last_error_at?: string | null
  last_error?: string | null
  last_duration_ms?: number | null
  updated_at: string
}
// Concurrency usage at platform scope: in-use slots vs capacity plus queue depth.
export interface PlatformConcurrencyInfo {
  platform: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Concurrency usage at group scope.
export interface GroupConcurrencyInfo {
  group_id: number
  group_name: string
  platform: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Concurrency usage at account scope.
export interface AccountConcurrencyInfo {
  account_id: number
  account_name?: string
  platform: string
  group_id: number
  group_name: string
  current_in_use: number
  max_capacity: number
  load_percentage: number
  waiting_in_queue: number
}
// Full concurrency stats payload, keyed maps per scope.
// `enabled` is false when concurrency tracking is turned off server-side.
export interface OpsConcurrencyStatsResponse {
  enabled: boolean
  platform: Record<string, PlatformConcurrencyInfo>
  group: Record<string, GroupConcurrencyInfo>
  account: Record<string, AccountConcurrencyInfo>
  timestamp?: string
}
/**
 * Fetch current concurrency stats (platform/group/account scopes).
 * Filters are only sent when meaningful: a non-empty platform string and a
 * positive numeric group id.
 */
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
  const query: Record<string, any> = {}
  if (platform) query.platform = platform
  if (typeof groupId === 'number' && groupId > 0) query.group_id = groupId
  const response = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params: query })
  return response.data
}
// Account availability rollup at platform scope.
export interface PlatformAvailability {
  platform: string
  total_accounts: number
  available_count: number
  rate_limit_count: number
  error_count: number
}
// Account availability rollup at group scope.
export interface GroupAvailability {
  group_id: number
  group_name: string
  platform: string
  total_accounts: number
  available_count: number
  rate_limit_count: number
  error_count: number
}
// Per-account availability detail, including rate-limit/overload countdowns.
export interface AccountAvailability {
  account_id: number
  account_name: string
  platform: string
  group_id: number
  group_name: string
  status: string
  is_available: boolean
  is_rate_limited: boolean
  rate_limit_reset_at?: string
  rate_limit_remaining_sec?: number
  is_overloaded: boolean
  overload_until?: string
  overload_remaining_sec?: number
  has_error: boolean
  error_message?: string
}
// Full availability payload, keyed maps per scope; `enabled` mirrors the server feature flag.
export interface OpsAccountAvailabilityStatsResponse {
  enabled: boolean
  platform: Record<string, PlatformAvailability>
  group: Record<string, GroupAvailability>
  account: Record<string, AccountAvailability>
  timestamp?: string
}
/**
 * Fetch account availability stats (platform/group/account scopes).
 * Filters are only sent when meaningful: a non-empty platform string and a
 * positive numeric group id.
 */
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
  const query: Record<string, any> = {}
  if (platform) query.platform = platform
  if (typeof groupId === 'number' && groupId > 0) query.group_id = groupId
  const response = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params: query })
  return response.data
}
/**
 * Subscribe to realtime QPS updates via WebSocket.
 *
 * Note: browsers cannot set Authorization headers for WebSockets.
 * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
 *   ["sub2api-admin", "jwt.<token>"]
 */
export interface SubscribeQPSOptions {
  // Admin JWT; falls back to localStorage 'auth_token' when omitted.
  token?: string | null
  onOpen?: () => void
  onClose?: (event: CloseEvent) => void
  onError?: (event: Event) => void
  /**
   * Called when the server closes with an application close code that indicates
   * reconnecting is not useful (e.g. feature flag disabled).
   */
  onFatalClose?: (event: CloseEvent) => void
  /**
   * More granular status updates for UI (connecting/reconnecting/offline/etc).
   */
  onStatusChange?: (status: OpsWSStatus) => void
  /**
   * Called when a reconnect is scheduled (helps display "retry in Xs").
   */
  onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
  // Host override for the WS URL; defaults to VITE_WS_BASE_URL or window.location.host.
  wsBaseUrl?: string
  /**
   * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
   * Set to 0 to disable reconnect.
   */
  maxReconnectAttempts?: number
  reconnectBaseDelayMs?: number
  reconnectMaxDelayMs?: number
  /**
   * Stale connection detection (heartbeat-by-observation).
   * If no messages are received within this window, the socket is closed to trigger a reconnect.
   * Set to 0 to disable.
   */
  staleTimeoutMs?: number
  /**
   * How often to check staleness. Only used when `staleTimeoutMs > 0`.
   */
  staleCheckIntervalMs?: number
}
// Connection lifecycle states surfaced to the UI via onStatusChange.
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
// Application-level WS close codes the server may send.
export const OPS_WS_CLOSE_CODES = {
  REALTIME_DISABLED: 4001
} as const
// Base subprotocol identifying the admin dashboard client.
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
/**
 * Open a realtime QPS WebSocket and invoke `onMessage` for each parsed payload.
 * Handles exponential-backoff reconnect with jitter, browser online/offline
 * transitions, stale-connection detection, and a server-directed fatal close.
 * Returns an unsubscribe function that tears everything down.
 */
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
  let ws: WebSocket | null = null
  let reconnectAttempts = 0
  const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
    ? (options.maxReconnectAttempts as number)
    : Infinity
  const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
  const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
  let reconnectTimer: ReturnType<typeof setTimeout> | null = null
  let shouldReconnect = true
  let isConnecting = false
  let hasConnectedOnce = false
  let lastMessageAt = 0
  const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
  const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
  let staleTimer: ReturnType<typeof setInterval> | null = null
  const setStatus = (status: OpsWSStatus) => {
    options.onStatusChange?.(status)
  }
  const clearReconnectTimer = () => {
    if (reconnectTimer) {
      clearTimeout(reconnectTimer)
      reconnectTimer = null
    }
  }
  const clearStaleTimer = () => {
    if (staleTimer) {
      clearInterval(staleTimer)
      staleTimer = null
    }
  }
  // Periodically checks message recency; closing a silent socket reuses the
  // normal onclose -> scheduleReconnect path rather than reconnecting directly.
  const startStaleTimer = () => {
    clearStaleTimer()
    if (!staleTimeoutMs || staleTimeoutMs <= 0) return
    staleTimer = setInterval(() => {
      if (!shouldReconnect) return
      if (!ws || ws.readyState !== WebSocket.OPEN) return
      if (!lastMessageAt) return
      const ageMs = Date.now() - lastMessageAt
      if (ageMs > staleTimeoutMs) {
        // Treat as a half-open connection; closing triggers the normal reconnect path.
        ws.close()
      }
    }, staleCheckIntervalMs)
  }
  // Exponential backoff (base * 2^attempts, capped) plus up to 250ms jitter.
  const scheduleReconnect = () => {
    if (!shouldReconnect) return
    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
    // If we're offline, wait for the browser to come back online.
    if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
      setStatus('offline')
      return
    }
    const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
    const delay = Math.min(expDelay, maxDelayMs)
    const jitter = Math.floor(Math.random() * 250)
    clearReconnectTimer()
    reconnectTimer = setTimeout(() => {
      reconnectAttempts++
      connect()
    }, delay + jitter)
    options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
  }
  const handleOnline = () => {
    if (!shouldReconnect) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    connect()
  }
  const handleOffline = () => {
    setStatus('offline')
  }
  const connect = () => {
    // Guards: unsubscribed, already connecting/connected, or attempts exhausted.
    if (!shouldReconnect) return
    if (isConnecting) return
    if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
    if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
    isConnecting = true
    setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
    const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
    // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
    // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
    // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
    const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
    const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
    if (rawToken) protocols.push(`jwt.${rawToken}`)
    ws = new WebSocket(wsURL.toString(), protocols)
    ws.onopen = () => {
      // Reset backoff state on a successful connection.
      reconnectAttempts = 0
      isConnecting = false
      hasConnectedOnce = true
      clearReconnectTimer()
      lastMessageAt = Date.now()
      startStaleTimer()
      setStatus('connected')
      options.onOpen?.()
    }
    ws.onmessage = (e) => {
      try {
        const data = JSON.parse(e.data)
        lastMessageAt = Date.now()
        onMessage(data)
      } catch (err) {
        console.warn('[OpsWS] Failed to parse message:', err)
      }
    }
    ws.onerror = (error) => {
      console.error('[OpsWS] Connection error:', error)
      options.onError?.(error)
    }
    ws.onclose = (event) => {
      isConnecting = false
      options.onClose?.(event)
      clearStaleTimer()
      ws = null
      // If the server explicitly tells us to stop reconnecting, honor it.
      if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
        shouldReconnect = false
        clearReconnectTimer()
        setStatus('closed')
        options.onFatalClose?.(event)
        return
      }
      scheduleReconnect()
    }
  }
  window.addEventListener('online', handleOnline)
  window.addEventListener('offline', handleOffline)
  connect()
  // Unsubscribe: stop reconnects, remove listeners, cancel timers, close the socket.
  return () => {
    shouldReconnect = false
    window.removeEventListener('online', handleOnline)
    window.removeEventListener('offline', handleOffline)
    clearReconnectTimer()
    clearStaleTimer()
    if (ws) ws.close()
    ws = null
    setStatus('closed')
  }
}
// Severity/phase are open strings; the server defines the value set (e.g. P0/P1/P2).
export type OpsSeverity = string
export type OpsPhase = string
// Email notification severity tiers (distinct from rule severities above).
export type AlertSeverity = 'critical' | 'warning' | 'info'
export type ThresholdMode = 'count' | 'percentage' | 'both'
// Metrics an alert rule can evaluate; mirrors the server-side metric registry.
export type MetricType =
  | 'success_rate'
  | 'error_rate'
  | 'upstream_error_rate'
  | 'p95_latency_ms'
  | 'p99_latency_ms'
  | 'cpu_usage_percent'
  | 'memory_usage_percent'
  | 'concurrency_queue_depth'
  | 'group_available_accounts'
  | 'group_available_ratio'
  | 'group_rate_limit_ratio'
  | 'account_rate_limited_count'
  | 'account_error_count'
  | 'account_error_ratio'
  | 'overload_account_count'
// Comparison operator applied between the observed metric and the threshold.
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
// Alert rule definition; `id` and the timestamps are server-assigned.
export interface AlertRule {
  id?: number
  name: string
  description?: string
  enabled: boolean
  metric_type: MetricType
  operator: Operator
  threshold: number
  // Evaluation window size and how long the condition must hold before firing.
  window_minutes: number
  sustained_minutes: number
  severity: OpsSeverity
  cooldown_minutes: number
  notify_email: boolean
  filters?: Record<string, any>
  created_at?: string
  updated_at?: string
  last_triggered_at?: string | null
}
// A fired (or resolved) alert instance produced by a rule.
export interface AlertEvent {
  id: number
  rule_id: number
  severity: OpsSeverity | string
  status: 'firing' | 'resolved' | string
  title?: string
  description?: string
  metric_value?: number
  threshold_value?: number
  dimensions?: Record<string, any>
  fired_at: string
  resolved_at?: string | null
  email_sent: boolean
  created_at: string
}
// Email notification settings, split into alert delivery and scheduled reports.
export interface EmailNotificationConfig {
  alert: {
    enabled: boolean
    recipients: string[]
    min_severity: AlertSeverity | ''
    rate_limit_per_hour: number
    batching_window_seconds: number
    include_resolved_alerts: boolean
  }
  report: {
    enabled: boolean
    recipients: string[]
    daily_summary_enabled: boolean
    daily_summary_schedule: string
    weekly_summary_enabled: boolean
    weekly_summary_schedule: string
    error_digest_enabled: boolean
    error_digest_schedule: string
    error_digest_min_count: number
    account_health_enabled: boolean
    account_health_schedule: string
    account_health_error_rate_threshold: number
  }
}
// Distributed lock used to ensure a single evaluator in multi-replica deployments.
export interface OpsDistributedLockSettings {
  enabled: boolean
  key: string
  ttl_seconds: number
}
// Runtime tuning for the alert evaluator, including global/per-rule silencing.
export interface OpsAlertRuntimeSettings {
  evaluation_interval_seconds: number
  distributed_lock: OpsDistributedLockSettings
  silencing: {
    enabled: boolean
    global_until_rfc3339: string
    global_reason: string
    entries?: Array<{
      rule_id?: number
      severities?: Array<OpsSeverity | string>
      until_rfc3339: string
      reason: string
    }>
  }
}
// Advanced ops settings: retention/cleanup plus pre-aggregation toggles.
export interface OpsAdvancedSettings {
  data_retention: OpsDataRetentionSettings
  aggregation: OpsAggregationSettings
}
export interface OpsDataRetentionSettings {
  cleanup_enabled: boolean
  // Cron expression for the cleanup job schedule.
  cleanup_schedule: string
  error_log_retention_days: number
  minute_metrics_retention_days: number
  hourly_metrics_retention_days: number
}
export interface OpsAggregationSettings {
  aggregation_enabled: boolean
}
/** A single error-log row as returned by GET /admin/ops/errors. */
export interface OpsErrorLog {
  id: number
  created_at: string
  phase: OpsPhase // request lifecycle phase where the error occurred
  type: string
  severity: OpsSeverity
  status_code: number
  platform: string
  model: string
  latency_ms?: number | null
  client_request_id: string
  request_id: string
  message: string
  // Optional associations; null/absent when not applicable.
  user_id?: number | null
  api_key_id?: number | null
  account_id?: number | null
  group_id?: number | null
  client_ip?: string | null
  request_path?: string
  stream?: boolean // whether the original request was a streaming request
}

/** Full error detail (GET /admin/ops/errors/:id) — extends the list row. */
export interface OpsErrorDetail extends OpsErrorLog {
  error_body: string
  user_agent: string
  // Upstream context (optional; enriched by gateway services)
  upstream_status_code?: number | null
  upstream_error_message?: string
  upstream_error_detail?: string
  upstream_errors?: string
  // Per-stage latency breakdown (milliseconds).
  auth_latency_ms?: number | null
  routing_latency_ms?: number | null
  upstream_latency_ms?: number | null
  response_latency_ms?: number | null
  time_to_first_token_ms?: number | null
  request_body: string
  request_body_truncated: boolean // true when request_body was trimmed server-side
  request_body_bytes?: number | null // original size before truncation
  is_business_limited: boolean
}

/** Paginated error-log listing payload. */
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
/**
 * Fetches the ops dashboard overview (GET /admin/ops/dashboard/overview).
 * Accepts either a preset time_range or an explicit start/end window, plus
 * optional platform/group filters. `options.signal` supports cancellation.
 */
export async function getDashboardOverview(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
  const response = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
    params,
    signal: options.signal
  })
  return response.data
}
/**
 * Fetches the throughput trend series (GET /admin/ops/dashboard/throughput-trend).
 * Same filter semantics as the other dashboard endpoints; cancellable via
 * `options.signal`.
 */
export async function getThroughputTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
  const config = { params, signal: options.signal }
  const response = await apiClient.get<OpsThroughputTrendResponse>(
    '/admin/ops/dashboard/throughput-trend',
    config
  )
  return response.data
}
/**
 * Fetches latency histogram buckets (GET /admin/ops/dashboard/latency-histogram).
 * Cancellable via `options.signal`.
 */
export async function getLatencyHistogram(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
  const response = await apiClient.get<OpsLatencyHistogramResponse>(
    '/admin/ops/dashboard/latency-histogram',
    { params, signal: options.signal }
  )
  return response.data
}
/**
 * Fetches the error trend series (GET /admin/ops/dashboard/error-trend).
 * Cancellable via `options.signal`.
 */
export async function getErrorTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
  const response = await apiClient.get<OpsErrorTrendResponse>(
    '/admin/ops/dashboard/error-trend',
    { params, signal: options.signal }
  )
  return response.data
}
/**
 * Fetches the error distribution breakdown (GET /admin/ops/dashboard/error-distribution).
 * Cancellable via `options.signal`.
 */
export async function getErrorDistribution(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
  const config = { params, signal: options.signal }
  const response = await apiClient.get<OpsErrorDistributionResponse>(
    '/admin/ops/dashboard/error-distribution',
    config
  )
  return response.data
}
/**
 * Lists error logs with pagination (GET /admin/ops/errors).
 * Supports time-range, platform/group/account, phase, free-text (`q`)
 * and comma-separated status-code filters.
 */
export async function listErrorLogs(params: {
  page?: number
  page_size?: number
  time_range?: string
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
  q?: string
  status_codes?: string
}): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return response.data
}
/** Fetches the full detail for one error log (GET /admin/ops/errors/:id). */
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
  const response = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
  return response.data
}
/** Replays the request behind an error log (POST /admin/ops/errors/:id/retry). */
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
  const url = `/admin/ops/errors/${id}/retry`
  const response = await apiClient.post<OpsRetryResult>(url, req)
  return response.data
}
/** Lists per-request detail rows (GET /admin/ops/requests). */
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return response.data
}
// Alert rules
/** Lists all alert rules (GET /admin/ops/alert-rules). */
export async function listAlertRules(): Promise<AlertRule[]> {
  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
  return response.data
}
/** Creates an alert rule (POST /admin/ops/alert-rules); returns the stored rule. */
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
  return response.data
}
/** Updates an alert rule (PUT /admin/ops/alert-rules/:id); partial payloads allowed. */
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
  const url = `/admin/ops/alert-rules/${id}`
  const response = await apiClient.put<AlertRule>(url, rule)
  return response.data
}
/** Deletes an alert rule (DELETE /admin/ops/alert-rules/:id). */
export async function deleteAlertRule(id: number): Promise<void> {
  const url = `/admin/ops/alert-rules/${id}`
  await apiClient.delete(url)
}
/** Lists recent alert events (GET /admin/ops/alert-events), newest first per server order. */
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', {
    params: { limit }
  })
  return response.data
}
// Email notification config
/** Reads the email notification settings (GET /admin/ops/email-notification/config). */
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
  return response.data
}
/** Persists the email notification settings (PUT /admin/ops/email-notification/config). */
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
  const response = await apiClient.put<EmailNotificationConfig>(
    '/admin/ops/email-notification/config',
    config
  )
  return response.data
}
// Runtime settings (DB-backed)
/** Reads the DB-backed alert runtime settings (GET /admin/ops/runtime/alert). */
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
  return response.data
}
/** Persists the DB-backed alert runtime settings (PUT /admin/ops/runtime/alert). */
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
  return response.data
}
// Advanced settings (DB-backed)
/** Reads the ops advanced settings (GET /admin/ops/advanced-settings). */
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
  const response = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
  return response.data
}
/** Persists the ops advanced settings (PUT /admin/ops/advanced-settings). */
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
  const response = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
  return response.data
}
// Aggregate object re-exporting every ops API function for convenient
// single-import usage. Some entries (getConcurrencyStats,
// getAccountAvailabilityStats, subscribeQPS) are defined earlier in this file.
export const opsAPI = {
  getDashboardOverview,
  getThroughputTrend,
  getLatencyHistogram,
  getErrorTrend,
  getErrorDistribution,
  getConcurrencyStats,
  getAccountAvailabilityStats,
  subscribeQPS,
  listErrorLogs,
  getErrorLogDetail,
  retryErrorRequest,
  listRequestDetails,
  listAlertRules,
  createAlertRule,
  updateAlertRule,
  deleteAlertRule,
  listAlertEvents,
  getEmailNotificationConfig,
  updateEmailNotificationConfig,
  getAlertRuntimeSettings,
  updateAlertRuntimeSettings,
  getAdvancedSettings,
  updateAdvancedSettings
}

export default opsAPI
...@@ -35,14 +35,23 @@ export interface SystemSettings { ...@@ -35,14 +35,23 @@ export interface SystemSettings {
turnstile_enabled: boolean turnstile_enabled: boolean
turnstile_site_key: string turnstile_site_key: string
turnstile_secret_key_configured: boolean turnstile_secret_key_configured: boolean
// LinuxDo Connect OAuth 登录(终端用户 SSO)
linuxdo_connect_enabled: boolean // Model fallback configuration
linuxdo_connect_client_id: string enable_model_fallback: boolean
linuxdo_connect_client_secret_configured: boolean fallback_model_anthropic: string
linuxdo_connect_redirect_url: string fallback_model_openai: string
fallback_model_gemini: string
fallback_model_antigravity: string
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
enable_identity_patch: boolean enable_identity_patch: boolean
identity_patch_prompt: string identity_patch_prompt: string
// Ops Monitoring (vNext)
ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
...@@ -67,12 +76,17 @@ export interface UpdateSettingsRequest { ...@@ -67,12 +76,17 @@ export interface UpdateSettingsRequest {
turnstile_enabled?: boolean turnstile_enabled?: boolean
turnstile_site_key?: string turnstile_site_key?: string
turnstile_secret_key?: string turnstile_secret_key?: string
linuxdo_connect_enabled?: boolean enable_model_fallback?: boolean
linuxdo_connect_client_id?: string fallback_model_anthropic?: string
linuxdo_connect_client_secret?: string fallback_model_openai?: string
linuxdo_connect_redirect_url?: string fallback_model_gemini?: string
fallback_model_antigravity?: string
enable_identity_patch?: boolean enable_identity_patch?: boolean
identity_patch_prompt?: string identity_patch_prompt?: string
ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**
......
...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use( ...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
return response return response
}, },
(error: AxiosError<ApiResponse<unknown>>) => { (error: AxiosError<ApiResponse<unknown>>) => {
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
// Otherwise we'd misclassify it as a generic "network error".
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
return Promise.reject(error)
}
// Handle common errors // Handle common errors
if (error.response) { if (error.response) {
const { status, data } = error.response const { status, data } = error.response
const url = String(error.config?.url || '')
// Validate `data` shape to avoid HTML error pages breaking our error handling.
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
// from ops pages to avoid broken UI states.
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
try {
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
} catch {
// ignore localStorage failures
}
try {
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
} catch {
// ignore event failures
}
if (window.location.pathname.startsWith('/admin/ops')) {
window.location.href = '/admin/settings'
}
return Promise.reject({
status,
code: 'OPS_DISABLED',
message: apiData.message || error.message,
url
})
}
// 401: Unauthorized - clear token and redirect to login // 401: Unauthorized - clear token and redirect to login
if (status === 401) { if (status === 401) {
...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use( ...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
// Return structured error // Return structured error
return Promise.reject({ return Promise.reject({
status, status,
code: data?.code, code: apiData.code,
message: data?.message || error.message message: apiData.message || apiData.detail || error.message
}) })
} }
......
<script setup lang="ts">
// Inline info popover: a small help icon that reveals a tooltip-style panel
// on hover. Content comes from the `content` prop or the default slot.
import { ref } from 'vue'

// Default popover text; overridden when a default slot is provided.
defineProps<{
  content?: string
}>()

// Drives v-show on the panel; toggled by mouseenter/mouseleave on the wrapper.
const show = ref(false)
</script>

<template>
  <!-- Wrapper toggles `show`; `group` also drives the hover opacity transition. -->
  <div
    class="group relative ml-1 inline-flex items-center align-middle"
    @mouseenter="show = true"
    @mouseleave="show = false"
  >
    <!-- Trigger Icon (replaceable via the `trigger` slot) -->
    <slot name="trigger">
      <svg
        class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
        fill="none"
        viewBox="0 0 24 24"
        stroke="currentColor"
        stroke-width="2"
      >
        <path
          stroke-linecap="round"
          stroke-linejoin="round"
          d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
        />
      </svg>
    </slot>

    <!-- Popover Content: v-show gates rendering, group-hover fades it in. -->
    <div
      v-show="show"
      class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
    >
      <slot>{{ content }}</slot>
      <!-- Arrow: rotated square matching the panel background. -->
      <div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
    </div>
  </div>
</template>
...@@ -67,12 +67,13 @@ ...@@ -67,12 +67,13 @@
:aria-selected="isSelected(option)" :aria-selected="isSelected(option)"
:aria-disabled="isOptionDisabled(option)" :aria-disabled="isOptionDisabled(option)"
@click.stop="!isOptionDisabled(option) && selectOption(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)"
@mouseenter="focusedIndex = index" @mouseenter="handleOptionMouseEnter(option, index)"
:class="[ :class="[
'select-option', 'select-option',
isGroupHeaderOption(option) && 'select-option-group',
isSelected(option) && 'select-option-selected', isSelected(option) && 'select-option-selected',
isOptionDisabled(option) && 'select-option-disabled', isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
focusedIndex === index && 'select-option-focused' focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
]" ]"
> >
<slot name="option" :option="option" :selected="isSelected(option)"> <slot name="option" :option="option" :selected="isSelected(option)">
...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { ...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
return false return false
} }
const isGroupHeaderOption = (option: any): boolean => {
if (typeof option === 'object' && option !== null) {
return option.kind === 'group'
}
return false
}
const selectedOption = computed(() => { const selectedOption = computed(() => {
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
}) })
...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { ...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
return getOptionValue(option) === props.modelValue return getOptionValue(option) === props.modelValue
} }
const findNextEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex + offset) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const findPrevEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex - offset + opts.length) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const handleOptionMouseEnter = (option: any, index: number) => {
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
focusedIndex.value = index
}
// Update trigger rect periodically while open to follow scroll/resize // Update trigger rect periodically while open to follow scroll/resize
const updateTriggerRect = () => { const updateTriggerRect = () => {
if (containerRef.value) { if (containerRef.value) {
...@@ -259,8 +292,15 @@ watch(isOpen, (open) => { ...@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
if (open) { if (open) {
calculateDropdownPosition() calculateDropdownPosition()
// Reset focused index to current selection or first item // Reset focused index to current selection or first item
const selectedIdx = filteredOptions.value.findIndex(isSelected) if (filteredOptions.value.length === 0) {
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 focusedIndex.value = -1
} else {
const selectedIdx = filteredOptions.value.findIndex(isSelected)
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
? findNextEnabledIndex(initialIdx + 1)
: initialIdx
}
if (props.searchable) { if (props.searchable) {
nextTick(() => searchInputRef.value?.focus()) nextTick(() => searchInputRef.value?.focus())
...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { ...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
switch (e.key) { switch (e.key) {
case 'ArrowDown': case 'ArrowDown':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'ArrowUp': case 'ArrowUp':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'Enter': case 'Enter':
e.preventDefault() e.preventDefault()
...@@ -441,6 +481,17 @@ onUnmounted(() => { ...@@ -441,6 +481,17 @@ onUnmounted(() => {
@apply cursor-not-allowed opacity-40; @apply cursor-not-allowed opacity-40;
} }
.select-dropdown-portal .select-option-group {
@apply cursor-default select-none;
@apply bg-gray-50 dark:bg-dark-900;
@apply text-[11px] font-bold uppercase tracking-wider;
@apply text-gray-500 dark:text-gray-400;
}
.select-dropdown-portal .select-option-group:hover {
@apply bg-gray-50 dark:bg-dark-900;
}
.select-dropdown-portal .select-option-label { .select-dropdown-portal .select-option-label {
@apply flex-1 min-w-0 truncate text-left; @apply flex-1 min-w-0 truncate text-left;
} }
......
...@@ -144,10 +144,10 @@ ...@@ -144,10 +144,10 @@
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { computed, h, ref } from 'vue' import { computed, h, onMounted, ref, watch } from 'vue'
import { useRoute } from 'vue-router' import { useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n' import { useI18n } from 'vue-i18n'
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores' import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
import VersionBadge from '@/components/common/VersionBadge.vue' import VersionBadge from '@/components/common/VersionBadge.vue'
const { t } = useI18n() const { t } = useI18n()
...@@ -156,6 +156,7 @@ const route = useRoute() ...@@ -156,6 +156,7 @@ const route = useRoute()
const appStore = useAppStore() const appStore = useAppStore()
const authStore = useAuthStore() const authStore = useAuthStore()
const onboardingStore = useOnboardingStore() const onboardingStore = useOnboardingStore()
const adminSettingsStore = useAdminSettingsStore()
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed) const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
const mobileOpen = computed(() => appStore.mobileOpen) const mobileOpen = computed(() => appStore.mobileOpen)
...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => { ...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
const adminNavItems = computed(() => { const adminNavItems = computed(() => {
const baseItems = [ const baseItems = [
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon }, { path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
...(adminSettingsStore.opsMonitoringEnabled
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
: []),
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true }, { path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true }, { path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true }, { path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
...@@ -511,6 +515,23 @@ if ( ...@@ -511,6 +515,23 @@ if (
isDark.value = true isDark.value = true
document.documentElement.classList.add('dark') document.documentElement.classList.add('dark')
} }
// Fetch admin settings (for feature-gated nav items like Ops).
watch(
isAdmin,
(v) => {
if (v) {
adminSettingsStore.fetch()
}
},
{ immediate: true }
)
onMounted(() => {
if (isAdmin.value) {
adminSettingsStore.fetch()
}
})
</script> </script>
<style scoped> <style scoped>
......
...@@ -131,6 +131,7 @@ export default { ...@@ -131,6 +131,7 @@ export default {
noData: 'No data', noData: 'No data',
success: 'Success', success: 'Success',
error: 'Error', error: 'Error',
critical: 'Critical',
warning: 'Warning', warning: 'Warning',
info: 'Info', info: 'Info',
active: 'Active', active: 'Active',
...@@ -145,9 +146,11 @@ export default { ...@@ -145,9 +146,11 @@ export default {
copiedToClipboard: 'Copied to clipboard', copiedToClipboard: 'Copied to clipboard',
copyFailed: 'Failed to copy', copyFailed: 'Failed to copy',
contactSupport: 'Contact Support', contactSupport: 'Contact Support',
add: 'Add',
invalidEmail: 'Please enter a valid email address',
optional: 'optional', optional: 'optional',
selectOption: 'Select an option', selectOption: 'Select an option',
searchPlaceholder: 'Search...', searchPlaceholder: 'Search...',
noOptionsFound: 'No options found', noOptionsFound: 'No options found',
noGroupsAvailable: 'No groups available', noGroupsAvailable: 'No groups available',
unknownError: 'Unknown error occurred', unknownError: 'Unknown error occurred',
...@@ -178,6 +181,7 @@ export default { ...@@ -178,6 +181,7 @@ export default {
accounts: 'Accounts', accounts: 'Accounts',
proxies: 'Proxies', proxies: 'Proxies',
redeemCodes: 'Redeem Codes', redeemCodes: 'Redeem Codes',
ops: 'Ops',
promoCodes: 'Promo Codes', promoCodes: 'Promo Codes',
settings: 'Settings', settings: 'Settings',
myAccount: 'My Account', myAccount: 'My Account',
...@@ -1837,6 +1841,524 @@ export default { ...@@ -1837,6 +1841,524 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: 'Ops Monitoring',
description: 'Operational monitoring and troubleshooting',
// Dashboard
systemHealth: 'System Health',
overview: 'Overview',
noSystemMetrics: 'No system metrics collected yet.',
collectedAt: 'Collected at:',
window: 'window',
cpu: 'CPU',
memory: 'Memory',
db: 'DB',
redis: 'Redis',
goroutines: 'Goroutines',
jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active',
idle: 'idle',
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
lastError: 'last_error:',
noData: 'No data.',
loadingText: 'loading',
ready: 'ready',
requestsTotal: 'Requests (total)',
slaScope: 'SLA scope:',
tokens: 'Tokens',
tps: 'TPS:',
current: 'current',
peak: 'peak',
average: 'average',
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
errorCount: 'Error Count',
upstreamErrors: 'Upstream Errors',
errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
p95: 'p95:',
p99: 'p99:',
avg: 'avg:',
max: 'max:',
qps: 'QPS',
requests: 'Requests',
upstream: 'Upstream',
client: 'Client',
system: 'System',
other: 'Other',
errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
timeRange: {
'5m': 'Last 5 minutes',
'30m': 'Last 30 minutes',
'1h': 'Last 1 hour',
'6h': 'Last 6 hours',
'24h': 'Last 24 hours'
},
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
// Resource diagnostics
dbDown: 'Database connection failed',
dbDownImpact: 'All database operations will fail',
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
redisDown: 'Redis connection failed',
redisDownImpact: 'Cache functionality degraded, performance may decline',
redisDownAction: 'Check Redis service status and network connectivity',
cpuCritical: 'CPU usage critically high ({usage}%)',
cpuCriticalImpact: 'System response slowing, may affect all requests',
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
cpuHigh: 'CPU usage elevated ({usage}%)',
cpuHighImpact: 'System load is high, needs attention',
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
memoryCritical: 'Memory usage critically high ({usage}%)',
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
// SLA diagnostics
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
// Health score diagnostics
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthLowAction: 'Monitor metric trends, prevent issue escalation',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log
errorLog: {
timeId: 'Time / ID',
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
acc: 'ACC:',
details: 'Details',
phase: 'Phase'
},
// Error Details Modal
errorDetails: {
upstreamErrors: 'Upstream Errors',
requestErrors: 'Request Errors',
total: 'Total:',
searchPlaceholder: 'Search request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: 'Loading…',
requestId: 'Request ID',
time: 'Time',
phase: 'Phase',
status: 'Status',
message: 'Message',
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
timings: 'Timings',
auth: 'Auth',
routing: 'Routing',
upstream: 'Upstream',
response: 'Response',
retry: 'Retry',
retryClient: 'Retry (Client)',
retryUpstream: 'Retry (Upstream pinned)',
pinnedAccountId: 'Pinned account_id',
retryNotes: 'Retry Notes',
requestBody: 'Request Body',
errorBody: 'Error Body',
trimmed: 'trimmed',
confirmRetry: 'Confirm Retry',
retrySuccess: 'Retry succeeded',
retryFailed: 'Retry failed',
na: 'N/A',
retryHint: 'Retry will resend the request with the same parameters',
retryClientHint: 'Use client retry (no account pinning)',
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
pinnedAccountIdHint: '(auto from error log)',
retryNote1: 'Retry will use the same request body and parameters',
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
retryNote3: 'Client retry will reselect an account',
confirmRetryMessage: 'Confirm retry this request?',
confirmRetryHint: 'Will resend with the same request parameters'
},
requestDetails: {
title: 'Request Details',
details: 'Details',
rangeLabel: 'Window: {range}',
rangeMinutes: '{n} minutes',
rangeHours: '{n} hours',
empty: 'No requests in this window.',
emptyHint: 'Try a different time range or remove filters.',
failedToLoad: 'Failed to load request details',
requestIdCopied: 'Request ID copied',
copyFailed: 'Copy failed',
copy: 'Copy',
viewError: 'View Error',
kind: {
success: 'SUCCESS',
error: 'ERROR'
},
table: {
time: 'Time',
kind: 'Kind',
platform: 'Platform',
model: 'Model',
duration: 'Duration',
status: 'Status',
requestId: 'Request ID',
actions: 'Actions'
}
},
alertEvents: {
title: 'Alert Events',
description: 'Recent alert firing/resolution records (email-only)',
loading: 'Loading...',
empty: 'No alert events',
loadFailed: 'Failed to load alert events',
table: {
time: 'Time',
status: 'Status',
severity: 'Severity',
title: 'Title',
metric: 'Metric / Threshold',
email: 'Email Sent'
}
},
alertRules: {
title: 'Alert Rules',
description: 'Create and manage threshold-based system alerts (email-only)',
loading: 'Loading...',
empty: 'No alert rules',
loadFailed: 'Failed to load alert rules',
saveFailed: 'Failed to save alert rule',
deleteFailed: 'Failed to delete alert rule',
create: 'Create Rule',
createTitle: 'Create Alert Rule',
editTitle: 'Edit Alert Rule',
deleteConfirmTitle: 'Delete this rule?',
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
metricGroups: {
system: 'System Metrics',
group: 'Group-level Metrics (requires group_id)',
account: 'Account-level Metrics'
},
metrics: {
successRate: 'Success Rate (%)',
errorRate: 'Error Rate (%)',
upstreamErrorRate: 'Upstream Error Rate (%)',
p95: 'P95 Latency (ms)',
p99: 'P99 Latency (ms)',
cpu: 'CPU Usage (%)',
memory: 'Memory Usage (%)',
queueDepth: 'Concurrency Queue Depth',
groupAvailableAccounts: 'Group Available Accounts',
groupAvailableRatio: 'Group Available Ratio (%)',
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
accountRateLimitedCount: 'Rate-limited Accounts',
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
accountErrorRatio: 'Error Account Ratio (%)',
overloadAccountCount: 'Overloaded Accounts'
},
metricDescriptions: {
successRate: 'Percentage of successful requests in the window (0-100).',
errorRate: 'Percentage of failed requests in the window (0-100).',
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
p95: 'P95 request latency within the window (ms).',
p99: 'P99 request latency within the window (ms).',
cpu: 'Current instance CPU usage (0-100).',
memory: 'Current instance memory usage (0-100).',
queueDepth: 'Concurrency queue depth within the window (queued requests).',
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
accountErrorRatio: 'Error account ratio within the window (0-100).',
overloadAccountCount: 'Number of overloaded accounts within the window.'
},
hints: {
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
},
table: {
name: 'Name',
metric: 'Metric',
severity: 'Severity',
enabled: 'Enabled',
actions: 'Actions'
},
form: {
name: 'Name',
description: 'Description',
metric: 'Metric',
operator: 'Operator',
groupId: 'Group (group_id)',
groupPlaceholder: 'Select a group',
allGroups: 'All groups',
threshold: 'Threshold',
severity: 'Severity',
window: 'Window (minutes)',
sustained: 'Sustained (samples)',
cooldown: 'Cooldown (minutes)',
enabled: 'Enabled',
notifyEmail: 'Send email notifications'
},
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid rule',
nameRequired: 'Name is required',
metricRequired: 'Metric is required',
groupIdRequired: 'group_id is required for group-level metrics',
operatorRequired: 'Operator is required',
thresholdRequired: 'Threshold must be a number',
windowRange: 'Window must be one of: 1, 5, 60 minutes',
sustainedRange: 'Sustained must be between 1 and 1440 samples',
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
}
},
runtime: {
title: 'Ops Runtime Settings',
description: 'Stored in database; changes take effect without editing config files.',
loading: 'Loading...',
noData: 'No runtime settings available',
loadFailed: 'Failed to load runtime settings',
saveSuccess: 'Runtime settings saved',
saveFailed: 'Failed to save runtime settings',
alertTitle: 'Alert Evaluator',
groupAvailabilityTitle: 'Group Availability Monitor',
evalIntervalSeconds: 'Evaluation Interval (seconds)',
silencing: {
title: 'Alert Silencing (Maintenance Mode)',
enabled: 'Enable silencing',
globalUntil: 'Silence until (RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
reason: 'Reason',
reasonPlaceholder: 'e.g., planned maintenance',
entries: {
title: 'Advanced: targeted silencing',
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
add: 'Add Entry',
empty: 'No targeted entries',
entryTitle: 'Entry #{n}',
ruleId: 'Rule ID (optional)',
ruleIdPlaceholder: 'e.g., 1',
severities: 'Severities (optional)',
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
until: 'Until (RFC3339)',
reason: 'Reason',
validation: {
untilRequired: 'Entry until time is required',
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
ruleIdPositive: 'Entry rule_id must be a positive integer',
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
}
},
validation: {
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
}
},
lockEnabled: 'Distributed Lock Enabled',
lockKey: 'Distributed Lock Key',
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid settings',
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
}
},
email: {
title: 'Email Notification',
description: 'Configure alert/report email notifications (stored in database).',
loading: 'Loading...',
noData: 'No email notification config',
loadFailed: 'Failed to load email notification config',
saveSuccess: 'Email notification config saved',
saveFailed: 'Failed to save email notification config',
alertTitle: 'Alert Emails',
reportTitle: 'Report Emails',
recipients: 'Recipients',
recipientsHint: 'If empty, the system may fallback to the first admin email.',
minSeverity: 'Min Severity',
minSeverityAll: 'All severities',
rateLimitPerHour: 'Rate limit per hour',
batchWindowSeconds: 'Batch window (seconds)',
includeResolved: 'Include resolved alerts',
dailySummary: 'Daily summary',
weeklySummary: 'Weekly summary',
errorDigest: 'Error digest',
errorDigestMinCount: 'Min errors for digest',
accountHealth: 'Account health',
accountHealthThreshold: 'Error rate threshold (%)',
cronPlaceholder: 'Cron expression',
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid email notification config',
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
invalidRecipients: 'One or more recipient emails are invalid',
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
cronRequired: 'A cron expression is required when schedule is enabled',
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
}
},
concurrency: {
title: 'Concurrency / Queue',
byPlatform: 'By Platform',
byGroup: 'By Group',
byAccount: 'By Account',
totalRows: '{count} rows',
disabledHint: 'Realtime monitoring is disabled in settings.',
empty: 'No data',
queued: 'Queue {count}',
rateLimited: 'Rate-limited {count}',
errorAccounts: 'Errors {count}',
loadFailed: 'Failed to load concurrency data'
},
realtime: {
title: 'Realtime',
connected: 'Realtime connected',
connecting: 'Realtime connecting',
reconnecting: 'Realtime reconnecting',
offline: 'Realtime offline',
closed: 'Realtime closed',
reconnectIn: 'retry in {seconds}s'
},
queryMode: {
auto: 'Auto',
raw: 'Raw',
preagg: 'Preagg'
},
accountAvailability: {
available: 'Available',
unavailable: 'Unavailable',
accountError: 'Error'
},
tooltips: {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},
charts: {
emptyRequest: 'No requests in this window.',
emptyError: 'No errors in this window.',
resetZoom: 'Reset',
resetZoomHint: 'Reset zoom (if enabled)',
downloadChart: 'Download',
downloadChartHint: 'Download chart as image'
}
},
// Settings // Settings
settings: { settings: {
title: 'System Settings', title: 'System Settings',
...@@ -1951,6 +2473,22 @@ export default { ...@@ -1951,6 +2473,22 @@ export default {
sending: 'Sending...', sending: 'Sending...',
enterRecipientHint: 'Please enter a recipient email address' enterRecipientHint: 'Please enter a recipient email address'
}, },
opsMonitoring: {
title: 'Ops Monitoring',
description: 'Enable ops monitoring for troubleshooting and health visibility',
disabled: 'Ops monitoring is disabled',
enabled: 'Enable Ops Monitoring',
enabledHint: 'Enable the ops monitoring module (admin only)',
realtimeEnabled: 'Enable Realtime Monitoring',
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
queryMode: 'Default Query Mode',
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
},
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',
description: 'Global API key for external system integration with full admin access', description: 'Global API key for external system integration with full admin access',
......
...@@ -128,6 +128,7 @@ export default { ...@@ -128,6 +128,7 @@ export default {
noData: '暂无数据', noData: '暂无数据',
success: '成功', success: '成功',
error: '错误', error: '错误',
critical: '严重',
warning: '警告', warning: '警告',
info: '提示', info: '提示',
active: '启用', active: '启用',
...@@ -142,6 +143,8 @@ export default { ...@@ -142,6 +143,8 @@ export default {
copiedToClipboard: '已复制到剪贴板', copiedToClipboard: '已复制到剪贴板',
copyFailed: '复制失败', copyFailed: '复制失败',
contactSupport: '联系客服', contactSupport: '联系客服',
add: '添加',
invalidEmail: '请输入有效的邮箱地址',
optional: '可选', optional: '可选',
selectOption: '请选择', selectOption: '请选择',
searchPlaceholder: '搜索...', searchPlaceholder: '搜索...',
...@@ -151,6 +154,7 @@ export default { ...@@ -151,6 +154,7 @@ export default {
saving: '保存中...', saving: '保存中...',
selectedCount: '(已选 {count} 个)', selectedCount: '(已选 {count} 个)',
refresh: '刷新', refresh: '刷新',
settings: '设置',
notAvailable: '不可用', notAvailable: '不可用',
now: '现在', now: '现在',
unknown: '未知', unknown: '未知',
...@@ -176,6 +180,7 @@ export default { ...@@ -176,6 +180,7 @@ export default {
accounts: '账号管理', accounts: '账号管理',
proxies: 'IP管理', proxies: 'IP管理',
redeemCodes: '兑换码', redeemCodes: '兑换码',
ops: '运维监控',
promoCodes: '优惠码', promoCodes: '优惠码',
settings: '系统设置', settings: '系统设置',
myAccount: '我的账户', myAccount: '我的账户',
...@@ -1982,6 +1987,565 @@ export default { ...@@ -1982,6 +1987,565 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: '运维监控',
description: '运维监控与排障',
// Dashboard
systemHealth: '系统健康',
overview: '概览',
noSystemMetrics: '尚未收集系统指标。',
collectedAt: '采集时间:',
window: '窗口',
cpu: 'CPU',
memory: '内存',
db: '数据库',
redis: 'Redis',
goroutines: '协程',
jobs: '后台任务',
jobsHelp: '点击“明细”查看任务心跳与报错信息',
active: '活跃',
idle: '空闲',
waiting: '等待',
conns: '连接',
queue: '队列',
ok: '正常',
lastRun: '最近运行',
lastSuccess: '最近成功',
lastError: '最近错误',
noData: '暂无数据',
loadingText: '加载中...',
ready: '就绪',
requestsTotal: '请求(总计)',
slaScope: 'SLA 范围:',
tokens: 'Token',
tps: 'TPS',
current: '当前',
peak: '峰值',
average: '平均',
totalRequests: '总请求',
avgQps: '平均 QPS',
avgTps: '平均 TPS',
avgLatency: '平均延迟',
avgTtft: '平均首字延迟',
exceptions: '异常数',
requestErrors: '请求错误',
errorCount: '错误数',
upstreamErrors: '上游错误',
errorCountExcl429529: '错误数(排除429/529)',
sla: 'SLA(排除业务限制)',
businessLimited: '业务限制:',
errors: '错误',
errorRate: '错误率:',
upstreamRate: '上游错误率:',
latencyDuration: '延迟(毫秒)',
ttftLabel: '首字延迟(毫秒)',
p50: 'p50',
p90: 'p90',
p95: 'p95',
p99: 'p99',
avg: 'avg',
max: 'max',
qps: 'QPS',
requests: '请求',
upstream: '上游',
client: '客户端',
system: '系统',
other: '其他',
errorsSla: '错误(SLA范围)',
upstreamExcl429529: '上游(排除429/529)',
failedToLoadData: '加载运维数据失败',
failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败',
failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败',
retryFailed: '重试失败',
tpsK: 'TPS(千)',
top: '最高:',
throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布',
errorTrend: '错误趋势',
errorDistribution: '错误分布',
// Health Score & Diagnosis
health: '健康',
healthCondition: '健康状况',
healthHelp: '基于 SLA、错误率和资源使用情况的系统整体健康评分',
healthyStatus: '健康',
riskyStatus: '风险',
idleStatus: '待机',
timeRange: {
'5m': '近5分钟',
'30m': '近30分钟',
'1h': '近1小时',
'6h': '近6小时',
'24h': '近24小时'
},
diagnosis: {
title: '智能诊断',
footer: '基于当前指标的自动诊断建议',
idle: '系统当前处于待机状态',
idleImpact: '无活跃流量',
// Resource diagnostics
dbDown: '数据库连接失败',
dbDownImpact: '所有数据库操作将失败',
dbDownAction: '检查数据库服务状态、网络连接和连接配置',
redisDown: 'Redis连接失败',
redisDownImpact: '缓存功能降级,性能可能下降',
redisDownAction: '检查Redis服务状态和网络连接',
cpuCritical: 'CPU使用率严重过高 ({usage}%)',
cpuCriticalImpact: '系统响应变慢,可能影响所有请求',
cpuCriticalAction: '检查CPU密集型任务,考虑扩容或优化代码',
cpuHigh: 'CPU使用率偏高 ({usage}%)',
cpuHighImpact: '系统负载较高,需要关注',
cpuHighAction: '监控CPU趋势,准备扩容方案',
memoryCritical: '内存使用率严重过高 ({usage}%)',
memoryCriticalImpact: '可能触发OOM,系统稳定性受威胁',
memoryCriticalAction: '检查内存泄漏,考虑增加内存或优化内存使用',
memoryHigh: '内存使用率偏高 ({usage}%)',
memoryHighImpact: '内存压力较大,需要关注',
memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
// Latency diagnostics
latencyCritical: '响应延迟严重过高 ({latency}ms)',
latencyCriticalImpact: '用户体验极差,大量请求超时',
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
latencyHigh: '响应延迟偏高 ({latency}ms)',
latencyHighImpact: '用户体验下降,需要优化',
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
ttftHigh: '首字节时间偏高 ({ttft}ms)',
ttftHighImpact: '用户感知延迟增加',
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
// Error rate diagnostics
upstreamCritical: '上游错误率严重偏高 ({rate}%)',
upstreamCriticalImpact: '可能影响大量用户请求',
upstreamCriticalAction: '检查上游服务健康状态,启用降级策略',
upstreamHigh: '上游错误率偏高 ({rate}%)',
upstreamHighImpact: '建议检查上游服务状态',
upstreamHighAction: '联系上游服务团队,准备降级方案',
errorHigh: '错误率过高 ({rate}%)',
errorHighImpact: '大量请求失败',
errorHighAction: '查看错误日志,定位错误根因,紧急修复',
errorElevated: '错误率偏高 ({rate}%)',
errorElevatedImpact: '建议检查错误日志',
errorElevatedAction: '分析错误类型和分布,制定修复计划',
// SLA diagnostics
slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损',
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护',
slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量',
slaLowAction: '分析SLA下降原因,优化系统性能',
// Health score diagnostics
healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟',
healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
healthLowAction: '监控指标趋势,预防问题恶化',
healthy: '所有系统指标正常',
healthyImpact: '服务运行稳定'
},
// Error Log
errorLog: {
timeId: '时间 / ID',
context: '上下文',
status: '状态码',
message: '消息',
latency: '延迟',
action: '操作',
noErrors: '该窗口内暂无错误。',
grp: 'GRP:',
acc: 'ACC:',
details: '详情',
phase: '阶段'
},
// Error Details Modal
errorDetails: {
upstreamErrors: '上游错误',
requestErrors: '请求错误',
total: '总计:',
searchPlaceholder: '搜索 request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: '加载中…',
requestId: '请求 ID',
time: '时间',
phase: '阶段',
status: '状态码',
message: '消息',
basicInfo: '基本信息',
platform: '平台',
model: '模型',
latency: '延迟',
ttft: 'TTFT',
businessLimited: '业务限制',
requestPath: '请求路径',
timings: '时序信息',
auth: '认证',
routing: '路由',
upstream: '上游',
response: '响应',
retry: '重试',
retryClient: '重试(客户端)',
retryUpstream: '重试(上游固定)',
pinnedAccountId: '固定 account_id',
retryNotes: '重试说明',
requestBody: '请求体',
errorBody: '错误体',
trimmed: '已截断',
confirmRetry: '确认重试',
retrySuccess: '重试成功',
retryFailed: '重试失败',
na: 'N/A',
retryHint: '重试将使用相同的请求参数重新发送请求',
retryClientHint: '使用客户端重试(不固定账号)',
retryUpstreamHint: '使用上游固定重试(固定到错误的账号)',
pinnedAccountIdHint: '(自动从错误日志获取)',
retryNote1: '重试会使用相同的请求体和参数',
retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败',
retryNote3: '客户端重试会重新选择账号',
confirmRetryMessage: '确认要重试该请求吗?',
confirmRetryHint: '将使用相同的请求参数重新发送'
},
requestDetails: {
title: '请求明细',
details: '明细',
rangeLabel: '窗口:{range}',
rangeMinutes: '{n} 分钟',
rangeHours: '{n} 小时',
empty: '该窗口内暂无请求。',
emptyHint: '可尝试调整时间范围或取消部分筛选。',
failedToLoad: '加载请求明细失败',
requestIdCopied: '请求ID已复制',
copyFailed: '复制失败',
copy: '复制',
viewError: '查看错误',
kind: {
success: '成功',
error: '失败'
},
table: {
time: '时间',
kind: '类型',
platform: '平台',
model: '模型',
duration: '耗时',
status: '状态码',
requestId: '请求ID',
actions: '操作'
}
},
alertEvents: {
title: '告警事件',
description: '最近的告警触发/恢复记录(仅邮件通知)',
loading: '加载中...',
empty: '暂无告警事件',
loadFailed: '加载告警事件失败',
table: {
time: '时间',
status: '状态',
severity: '级别',
title: '标题',
metric: '指标 / 阈值',
email: '邮件已发送'
}
},
alertRules: {
title: '告警规则',
description: '创建与管理系统阈值告警(仅邮件通知)',
loading: '加载中...',
empty: '暂无告警规则',
loadFailed: '加载告警规则失败',
saveSuccess: '告警规则保存成功',
saveFailed: '保存告警规则失败',
deleteSuccess: '告警规则删除成功',
deleteFailed: '删除告警规则失败',
create: '新建规则',
createTitle: '新建告警规则',
editTitle: '编辑告警规则',
deleteConfirmTitle: '确认删除该规则?',
deleteConfirmMessage: '将删除该规则及其关联的告警事件,是否继续?',
manage: '预警规则',
metricGroups: {
system: '系统指标',
group: '分组级别指标(需 group_id)',
account: '账号级别指标'
},
metrics: {
successRate: '成功率 (%)',
errorRate: '错误率 (%)',
upstreamErrorRate: '上游错误率 (%)',
p95: 'P95 延迟 (ms)',
p99: 'P99 延迟 (ms)',
cpu: 'CPU 使用率 (%)',
memory: '内存使用率 (%)',
queueDepth: '并发排队深度',
groupAvailableAccounts: '分组可用账号数',
groupAvailableRatio: '分组可用比例 (%)',
groupRateLimitRatio: '分组限流比例 (%)',
accountRateLimitedCount: '限流账号数',
accountErrorCount: '错误账号数(不含临时不可调度)',
accountErrorRatio: '错误账号比例 (%)',
overloadAccountCount: '过载账号数'
},
metricDescriptions: {
successRate: '统计窗口内成功请求占比(0~100)。',
errorRate: '统计窗口内失败请求占比(0~100)。',
upstreamErrorRate: '统计窗口内上游错误占比(0~100)。',
p95: '统计窗口内 P95 请求耗时(毫秒)。',
p99: '统计窗口内 P99 请求耗时(毫秒)。',
cpu: '当前实例 CPU 使用率(0~100)。',
memory: '当前实例内存使用率(0~100)。',
queueDepth: '统计窗口内并发队列排队深度(等待中的请求数)。',
groupAvailableAccounts: '指定分组中当前可用账号数量(需要 group_id 过滤)。',
groupAvailableRatio: '指定分组中可用账号占比(0~100,需要 group_id 过滤)。',
groupRateLimitRatio: '指定分组中账号被限流的比例(0~100,需要 group_id 过滤)。',
accountRateLimitedCount: '统计窗口内被限流的账号数量。',
accountErrorCount: '统计窗口内产生错误的账号数量(不含临时不可调度)。',
accountErrorRatio: '统计窗口内错误账号占比(0~100)。',
overloadAccountCount: '统计窗口内过载账号数量。'
},
hints: {
recommended: '推荐:运算符 {operator},阈值 {threshold}{unit}',
groupRequired: '该指标为分组级别指标,必须选择分组(group_id)。',
groupOptional: '可选:通过 group_id 将规则限定到某个分组。'
},
table: {
name: '名称',
metric: '指标',
severity: '级别',
enabled: '启用',
actions: '操作'
},
form: {
name: '名称',
description: '描述',
metric: '指标',
operator: '运算符',
groupId: '分组(group_id)',
groupPlaceholder: '请选择分组',
allGroups: '全部分组',
threshold: '阈值',
severity: '级别',
window: '统计窗口(分钟)',
sustained: '连续样本数(每分钟)',
cooldown: '冷却期(分钟)',
enabled: '启用',
notifyEmail: '发送邮件通知'
},
validation: {
title: '请先修正以下问题',
invalid: '规则不合法',
nameRequired: '名称不能为空',
metricRequired: '指标不能为空',
groupIdRequired: '分组级别指标必须指定 group_id',
operatorRequired: '运算符不能为空',
thresholdRequired: '阈值必须为数字',
windowRange: '统计窗口必须为 1 / 5 / 60 分钟之一',
sustainedRange: '连续样本数必须在 1 到 1440 之间',
cooldownRange: '冷却期必须在 0 到 1440 分钟之间'
}
},
runtime: {
title: '运维监控运行设置',
description: '配置存储在数据库中,无需修改 config 文件即可生效。',
loading: '加载中...',
noData: '暂无运行设置',
loadFailed: '加载运行设置失败',
saveSuccess: '运行设置已保存',
saveFailed: '保存运行设置失败',
alertTitle: '告警评估器',
groupAvailabilityTitle: '分组可用性监控',
evalIntervalSeconds: '评估间隔(秒)',
silencing: {
title: '告警静默(维护模式)',
enabled: '启用静默',
globalUntil: '静默截止时间(RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: '建议填写截止时间,避免忘记关闭静默。',
reason: '原因',
reasonPlaceholder: '例如:计划维护',
entries: {
title: '高级:定向静默',
hint: '可选:仅静默特定规则或特定级别。字段留空表示匹配全部。',
add: '新增条目',
empty: '暂无定向静默条目',
entryTitle: '条目 #{n}',
ruleId: '规则ID(可选)',
ruleIdPlaceholder: '例如:1',
severities: '级别(可选)',
severitiesPlaceholder: '例如:P0,P1(留空=全部)',
until: '截止时间(RFC3339)',
reason: '原因',
validation: {
untilRequired: '条目截止时间不能为空',
untilFormat: '条目截止时间必须为合法的 RFC3339 时间戳',
ruleIdPositive: '条目 rule_id 必须为正整数',
severitiesFormat: '条目级别必须为 P0..P3 的逗号分隔列表'
}
},
validation: {
timeFormat: '静默时间必须为合法的 RFC3339 时间戳'
}
},
lockEnabled: '启用分布式锁',
lockKey: '分布式锁 Key',
lockTTLSeconds: '分布式锁 TTL(秒)',
showAdvancedDeveloperSettings: '显示高级开发者设置(分布式锁)',
advancedSettingsSummary: '高级设置 (分布式锁)',
evalIntervalHint: '检测任务的执行频率,建议保持默认。',
validation: {
title: '请先修正以下问题',
invalid: '设置不合法',
evalIntervalRange: '评估间隔必须在 1 到 86400 秒之间',
lockKeyRequired: '启用分布式锁时必须填写 Lock Key',
lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头',
lockKeyHint: '建议以「{prefix}」开头以避免冲突',
lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间'
}
},
email: {
title: '邮件通知配置',
description: '配置告警/报告邮件通知(存储在数据库中)。',
loading: '加载中...',
noData: '暂无邮件通知配置',
loadFailed: '加载邮件通知配置失败',
saveSuccess: '邮件通知配置已保存',
saveFailed: '保存邮件通知配置失败',
alertTitle: '告警邮件',
reportTitle: '报告邮件',
recipients: '收件人',
recipientsHint: '若为空,系统可能会回退使用第一个管理员邮箱。',
minSeverity: '最低级别',
minSeverityAll: '全部级别',
rateLimitPerHour: '每小时限额',
batchWindowSeconds: '合并窗口(秒)',
includeResolved: '包含恢复通知',
dailySummary: '每日摘要',
weeklySummary: '每周摘要',
errorDigest: '错误摘要',
errorDigestMinCount: '错误摘要最小数量',
accountHealth: '账号健康报告',
accountHealthThreshold: '错误率阈值(%)',
cronPlaceholder: 'Cron 表达式',
reportHint: '发送时间使用 Cron 语法;留空将使用默认值。',
validation: {
title: '请先修正以下问题',
invalid: '邮件通知配置不合法',
alertRecipientsRequired: '已启用告警邮件,但未配置任何收件人',
reportRecipientsRequired: '已启用报告邮件,但未配置任何收件人',
invalidRecipients: '存在不合法的收件人邮箱',
rateLimitRange: '每小时限额必须为 ≥ 0 的数字',
batchWindowRange: '合并窗口必须在 0 到 86400 秒之间',
cronRequired: '启用定时任务时必须填写 Cron 表达式',
cronFormat: 'Cron 表达式格式可能不正确(至少应包含 5 段)',
digestMinCountRange: '错误摘要最小数量必须为 ≥ 0 的数字',
accountHealthThresholdRange: '账号健康错误率阈值必须在 0 到 100 之间'
}
},
settings: {
title: '运维监控设置',
loadFailed: '加载设置失败',
saveSuccess: '运维监控设置保存成功',
saveFailed: '保存设置失败',
dataCollection: '数据采集',
evaluationInterval: '评估间隔(秒)',
evaluationIntervalHint: '检测任务的执行频率,建议保持默认',
alertConfig: '预警配置',
enableAlert: '开启预警',
alertRecipients: '预警接收邮箱',
emailPlaceholder: '输入邮箱地址',
recipientsHint: '若为空,系统将使用第一个管理员邮箱作为默认收件人',
minSeverity: '最低级别',
reportConfig: '评估报告配置',
enableReport: '开启评估报告',
reportRecipients: '评估报告接收邮箱',
dailySummary: '每日摘要',
weeklySummary: '每周摘要',
advancedSettings: '高级设置',
dataRetention: '数据保留策略',
enableCleanup: '启用数据清理',
cleanupSchedule: '清理计划(Cron)',
cleanupScheduleHint: '例如:0 2 * * * 表示每天凌晨2点',
errorLogRetentionDays: '错误日志保留天数',
minuteMetricsRetentionDays: '分钟指标保留天数',
hourlyMetricsRetentionDays: '小时指标保留天数',
retentionDaysHint: '建议保留7-90天,过长会占用存储空间',
aggregation: '预聚合任务',
enableAggregation: '启用预聚合任务',
aggregationHint: '预聚合可提升长时间窗口查询性能',
validation: {
title: '请先修正以下问题',
retentionDaysRange: '保留天数必须在1-365天之间'
}
},
concurrency: {
title: '并发 / 排队',
byPlatform: '按平台',
byGroup: '按分组',
byAccount: '按账号',
totalRows: '共 {count} 项',
disabledHint: '已在设置中关闭实时监控。',
empty: '暂无数据',
queued: '队列 {count}',
rateLimited: '限流 {count}',
errorAccounts: '异常 {count}',
loadFailed: '加载并发数据失败'
},
realtime: {
title: '实时信息',
connected: '实时已连接',
connecting: '实时连接中',
reconnecting: '实时重连中',
offline: '实时离线',
closed: '实时已关闭',
reconnectIn: '重连 {seconds}s'
},
queryMode: {
auto: 'Auto(自动)',
raw: 'Raw(不聚合)',
preagg: 'Preagg(聚合)'
},
accountAvailability: {
available: '可用',
unavailable: '不可用',
accountError: '异常'
},
tooltips: {
totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(毫秒)。',
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
errorDistribution: '按状态码统计的错误分布。',
upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。',
goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对“安全值”,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
redis: 'Redis 连接池状态,显示活跃和空闲的连接数。',
jobs: '后台任务执行状态,包括最近运行时间、成功时间和错误信息。',
qps: '每秒查询数(QPS)和每秒Token数(TPS),实时显示系统吞吐量。',
tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。',
health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。'
},
charts: {
emptyRequest: '该时间窗口内暂无请求。',
emptyError: '该时间窗口内暂无错误。',
resetZoom: '重置',
resetZoomHint: '重置缩放(若启用)',
downloadChart: '下载',
downloadChartHint: '下载图表图片'
}
},
// Settings // Settings
settings: { settings: {
title: '系统设置', title: '系统设置',
...@@ -2094,6 +2658,22 @@ export default { ...@@ -2094,6 +2658,22 @@ export default {
sending: '发送中...', sending: '发送中...',
enterRecipientHint: '请输入收件人邮箱地址' enterRecipientHint: '请输入收件人邮箱地址'
}, },
opsMonitoring: {
title: '运维监控',
description: '启用运维监控模块,用于排障与健康可视化',
disabled: '运维监控已关闭',
enabled: '启用运维监控',
enabledHint: '启用运维监控模块(仅管理员可见)',
realtimeEnabled: '启用实时监控',
realtimeEnabledHint: '启用实时请求速率和指标推送(WebSocket)',
queryMode: '默认查询模式',
queryModeHint: '运维监控默认查询模式(自动/原始/预聚合)',
queryModeAuto: '自动(推荐)',
queryModeRaw: '原始(最准确,但较慢)',
queryModePreagg: '预聚合(最快,需预聚合)',
metricsInterval: '采集频率(秒)',
metricsIntervalHint: '系统/请求指标采集频率(60-3600 秒)'
},
adminApiKey: { adminApiKey: {
title: '管理员 API Key', title: '管理员 API Key',
description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限', description: '用于外部系统集成的全局 API Key,拥有完整的管理员权限',
......
...@@ -173,6 +173,18 @@ const routes: RouteRecordRaw[] = [ ...@@ -173,6 +173,18 @@ const routes: RouteRecordRaw[] = [
descriptionKey: 'admin.dashboard.description' descriptionKey: 'admin.dashboard.description'
} }
}, },
{
path: '/admin/ops',
name: 'AdminOps',
component: () => import('@/views/admin/ops/OpsDashboard.vue'),
meta: {
requiresAuth: true,
requiresAdmin: true,
title: 'Ops Monitoring',
titleKey: 'admin.ops.title',
descriptionKey: 'admin.ops.description'
}
},
{ {
path: '/admin/users', path: '/admin/users',
name: 'AdminUsers', name: 'AdminUsers',
......
import { defineStore } from 'pinia'
import { ref } from 'vue'
import { adminAPI } from '@/api'
export const useAdminSettingsStore = defineStore('adminSettings', () => {
const loaded = ref(false)
const loading = ref(false)
// Read a cached boolean flag from localStorage.
// Values are stored as the literal strings 'true' / 'false'; anything else
// (missing key, corrupt value, storage unavailable) yields the caller default.
const readCachedBool = (key: string, defaultValue: boolean): boolean => {
  try {
    switch (localStorage.getItem(key)) {
      case 'true':
        return true
      case 'false':
        return false
    }
  } catch {
    // localStorage can throw (SSR, private browsing, storage disabled).
  }
  return defaultValue
}
// Persist a boolean flag as 'true'/'false' so readCachedBool can round-trip it.
// Best-effort only: a failed write just means no cache on the next load.
const writeCachedBool = (key: string, value: boolean) => {
  const serialized = value ? 'true' : 'false'
  try {
    localStorage.setItem(key, serialized)
  } catch {
    // Ignore storage failures (quota exceeded, SSR, private browsing).
  }
}
// Read a cached string from localStorage.
// Only a non-empty string counts as a hit; a missing key, empty value, or
// unavailable storage all fall back to the caller-supplied default.
const readCachedString = (key: string, defaultValue: string): string => {
  let cached: string | null = null
  try {
    cached = localStorage.getItem(key)
  } catch {
    // localStorage can throw (SSR, private browsing, storage disabled).
  }
  return cached ? cached : defaultValue
}
// Persist a string value to localStorage, best-effort.
// A failed write only means the cache is cold on the next page load.
const writeCachedString = (key: string, value: string) => {
  try {
    localStorage.setItem(key, value)
  } catch {
    // Ignore storage failures (quota exceeded, SSR, private browsing).
  }
}
// Ops feature flags, seeded from the localStorage cache so the first paint
// matches the last known server state (reduces UI flicker before the async
// fetch() resolves). When no cache exists the defaults are "enabled" and
// query mode 'auto'.
const opsMonitoringEnabled = ref(readCachedBool('ops_monitoring_enabled_cached', true))
const opsRealtimeMonitoringEnabled = ref(readCachedBool('ops_realtime_monitoring_enabled_cached', true))
const opsQueryModeDefault = ref(readCachedString('ops_query_mode_default_cached', 'auto'))
/**
 * Load ops-related settings from the admin API and mirror each value into the
 * localStorage cache for the next page load.
 *
 * @param force - when true, refetch even after a prior successful load.
 *
 * Concurrent calls are coalesced via the `loading` flag (a second caller
 * returns immediately while a fetch is in flight).
 *
 * NOTE(review): this store action shadows the global `fetch` inside the
 * setup-store scope — confirm nothing in this closure needs window.fetch.
 * NOTE(review): on failure `loaded` is still set to true, so subsequent
 * fetch() calls without `force` will NOT retry; the UI keeps the
 * cached/default values. Confirm this "no auto-retry" behavior is intended.
 */
async function fetch(force = false): Promise<void> {
  if (loaded.value && !force) return
  if (loading.value) return
  loading.value = true
  try {
    const settings = await adminAPI.settings.getSettings()
    // Missing booleans default to enabled; missing/empty query mode → 'auto'.
    opsMonitoringEnabled.value = settings.ops_monitoring_enabled ?? true
    writeCachedBool('ops_monitoring_enabled_cached', opsMonitoringEnabled.value)
    opsRealtimeMonitoringEnabled.value = settings.ops_realtime_monitoring_enabled ?? true
    writeCachedBool('ops_realtime_monitoring_enabled_cached', opsRealtimeMonitoringEnabled.value)
    opsQueryModeDefault.value = settings.ops_query_mode_default || 'auto'
    writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
    loaded.value = true
  } catch (err) {
    // Keep cached/default value: do not "flip" the UI based on a transient fetch failure.
    loaded.value = true
    console.error('[adminSettings] Failed to fetch settings:', err)
  } finally {
    loading.value = false
  }
}
function setOpsMonitoringEnabledLocal(value: boolean) {
opsMonitoringEnabled.value = value
writeCachedBool('ops_monitoring_enabled_cached', value)
loaded.value = true
}
function setOpsRealtimeMonitoringEnabledLocal(value: boolean) {
opsRealtimeMonitoringEnabled.value = value
writeCachedBool('ops_realtime_monitoring_enabled_cached', value)
loaded.value = true
}
function setOpsQueryModeDefaultLocal(value: string) {
opsQueryModeDefault.value = value || 'auto'
writeCachedString('ops_query_mode_default_cached', opsQueryModeDefault.value)
loaded.value = true
}
// Keep UI consistent if we learn that ops is disabled via feature-gated 404s.
// (event is dispatched from the axios interceptor)
let eventHandlerCleanup: (() => void) | null = null
function initializeEventListeners() {
if (eventHandlerCleanup) return
try {
const handler = () => {
setOpsMonitoringEnabledLocal(false)
}
window.addEventListener('ops-monitoring-disabled', handler)
eventHandlerCleanup = () => {
window.removeEventListener('ops-monitoring-disabled', handler)
}
} catch {
// ignore window access failures (SSR)
}
}
if (typeof window !== 'undefined') {
initializeEventListeners()
}
return {
loaded,
loading,
opsMonitoringEnabled,
opsRealtimeMonitoringEnabled,
opsQueryModeDefault,
fetch,
setOpsMonitoringEnabledLocal,
setOpsRealtimeMonitoringEnabledLocal,
setOpsQueryModeDefaultLocal
}
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment