Commit 839ab37d authored by yangjianbo's avatar yangjianbo
Browse files
parents 9dd0ef18 fd8473f2
package service
import (
"encoding/json"
"strings"
"time"
"github.com/gin-gonic/gin"
)
// Gin context keys used by Ops error logger for capturing upstream error details.
// These keys are set by gateway services and consumed by handler/ops_error_logger.go.
const (
	// OpsUpstreamStatusCodeKey holds the upstream HTTP status code (int).
	OpsUpstreamStatusCodeKey = "ops_upstream_status_code"
	// OpsUpstreamErrorMessageKey holds the trimmed upstream error message (string).
	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
	// OpsUpstreamErrorDetailKey holds the trimmed upstream error detail (string).
	OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
	// OpsUpstreamErrorsKey holds the accumulated per-request error events
	// ([]*OpsUpstreamErrorEvent) appended by appendOpsUpstreamError.
	OpsUpstreamErrorsKey = "ops_upstream_errors"
)
// setOpsUpstreamError records upstream error context (status code, message,
// detail) on the gin context for later consumption by the ops error logger.
// Non-positive status codes and blank (whitespace-only) strings are skipped,
// and a nil context is a no-op.
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
	if c == nil {
		return
	}
	if upstreamStatusCode > 0 {
		c.Set(OpsUpstreamStatusCodeKey, upstreamStatusCode)
	}
	message := strings.TrimSpace(upstreamMessage)
	if message != "" {
		c.Set(OpsUpstreamErrorMessageKey, message)
	}
	detail := strings.TrimSpace(upstreamDetail)
	if detail != "" {
		c.Set(OpsUpstreamErrorDetailKey, detail)
	}
}
// OpsUpstreamErrorEvent describes one upstream error attempt during a single gateway request.
// It is stored in ops_error_logs.upstream_errors as a JSON array.
type OpsUpstreamErrorEvent struct {
	// AtUnixMs is the event time in Unix milliseconds; appendOpsUpstreamError
	// fills it with time.Now().UnixMilli() when left zero or negative.
	AtUnixMs int64 `json:"at_unix_ms,omitempty"`
	// Context
	Platform  string `json:"platform,omitempty"`
	AccountID int64  `json:"account_id,omitempty"`
	// Outcome
	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`
	// Kind: http_error | request_error | retry_exhausted | failover
	Kind string `json:"kind,omitempty"`
	// Message is sanitized via sanitizeUpstreamErrorMessage before storage.
	Message string `json:"message,omitempty"`
	Detail  string `json:"detail,omitempty"`
}
// appendOpsUpstreamError normalizes ev (default timestamp, trimmed fields,
// sanitized message) and appends it to the per-request event list stored under
// OpsUpstreamErrorsKey on the gin context. A nil context is a no-op.
func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
	if c == nil {
		return
	}
	if ev.AtUnixMs <= 0 {
		ev.AtUnixMs = time.Now().UnixMilli()
	}
	// Trim whitespace from every free-form string field.
	for _, field := range []*string{&ev.Platform, &ev.UpstreamRequestID, &ev.Kind, &ev.Message, &ev.Detail} {
		*field = strings.TrimSpace(*field)
	}
	if ev.Message != "" {
		ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
	}
	var events []*OpsUpstreamErrorEvent
	if raw, found := c.Get(OpsUpstreamErrorsKey); found {
		if prior, castOK := raw.([]*OpsUpstreamErrorEvent); castOK {
			events = prior
		}
	}
	// ev is a value parameter, so &ev already points at a private copy.
	c.Set(OpsUpstreamErrorsKey, append(events, &ev))
}
// marshalOpsUpstreamErrors serializes the event list to a JSON array string
// for storage in ops_error_logs.upstream_errors. It returns nil for an empty
// list or on any marshal failure, so callers never persist invalid JSON.
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
	if len(events) == 0 {
		return nil
	}
	encoded, err := json.Marshal(events)
	if err != nil || len(encoded) == 0 {
		// Never store an invalid/empty JSON value; nil means "no data".
		return nil
	}
	out := string(encoded)
	return &out
}
package service
import (
"context"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
// GetWindowStats returns lightweight request/token counts for the provided window.
// It is intended for realtime sampling (e.g. WebSocket QPS push) without computing
// percentiles or peaks.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	if s.opsRepo == nil {
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	}
	return s.opsRepo.GetWindowStats(ctx, &OpsDashboardFilter{
		StartTime: startTime,
		EndTime:   endTime,
	})
}
...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc ...@@ -55,19 +55,36 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc
} }
tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody) tempMatched := s.tryTempUnschedulable(ctx, account, statusCode, responseBody)
upstreamMsg := strings.TrimSpace(extractUpstreamErrorMessage(responseBody))
upstreamMsg = sanitizeUpstreamErrorMessage(upstreamMsg)
if upstreamMsg != "" {
upstreamMsg = truncateForLog([]byte(upstreamMsg), 512)
}
switch statusCode { switch statusCode {
case 401: case 401:
// 认证失败:停止调度,记录错误 // 认证失败:停止调度,记录错误
s.handleAuthError(ctx, account, "Authentication failed (401): invalid or expired credentials") msg := "Authentication failed (401): invalid or expired credentials"
if upstreamMsg != "" {
msg = "Authentication failed (401): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 402: case 402:
// 支付要求:余额不足或计费问题,停止调度 // 支付要求:余额不足或计费问题,停止调度
s.handleAuthError(ctx, account, "Payment required (402): insufficient balance or billing issue") msg := "Payment required (402): insufficient balance or billing issue"
if upstreamMsg != "" {
msg = "Payment required (402): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 403: case 403:
// 禁止访问:停止调度,记录错误 // 禁止访问:停止调度,记录错误
s.handleAuthError(ctx, account, "Access forbidden (403): account may be suspended or lack permissions") msg := "Access forbidden (403): account may be suspended or lack permissions"
if upstreamMsg != "" {
msg = "Access forbidden (403): " + upstreamMsg
}
s.handleAuthError(ctx, account, msg)
shouldDisable = true shouldDisable = true
case 429: case 429:
s.handle429(ctx, account, headers) s.handle429(ctx, account, headers)
......
...@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -176,7 +176,7 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey updates[SettingKeyTurnstileSecretKey] = settings.TurnstileSecretKey
} }
// LinuxDo Connect OAuth 登录(终端用户 SSO) // LinuxDo Connect OAuth 登录
updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled) updates[SettingKeyLinuxDoConnectEnabled] = strconv.FormatBool(settings.LinuxDoConnectEnabled)
updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID updates[SettingKeyLinuxDoConnectClientID] = settings.LinuxDoConnectClientID
updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL updates[SettingKeyLinuxDoConnectRedirectURL] = settings.LinuxDoConnectRedirectURL
...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet ...@@ -208,6 +208,14 @@ func (s *SettingService) UpdateSettings(ctx context.Context, settings *SystemSet
updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch) updates[SettingKeyEnableIdentityPatch] = strconv.FormatBool(settings.EnableIdentityPatch)
updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt updates[SettingKeyIdentityPatchPrompt] = settings.IdentityPatchPrompt
// Ops monitoring (vNext)
updates[SettingKeyOpsMonitoringEnabled] = strconv.FormatBool(settings.OpsMonitoringEnabled)
updates[SettingKeyOpsRealtimeMonitoringEnabled] = strconv.FormatBool(settings.OpsRealtimeMonitoringEnabled)
updates[SettingKeyOpsQueryModeDefault] = string(ParseOpsQueryMode(settings.OpsQueryModeDefault))
if settings.OpsMetricsIntervalSeconds > 0 {
updates[SettingKeyOpsMetricsIntervalSeconds] = strconv.Itoa(settings.OpsMetricsIntervalSeconds)
}
err := s.settingRepo.SetMultiple(ctx, updates) err := s.settingRepo.SetMultiple(ctx, updates)
if err == nil && s.onUpdate != nil { if err == nil && s.onUpdate != nil {
s.onUpdate() // Invalidate cache after settings update s.onUpdate() // Invalidate cache after settings update
...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error { ...@@ -298,6 +306,12 @@ func (s *SettingService) InitializeDefaultSettings(ctx context.Context) error {
// Identity patch defaults // Identity patch defaults
SettingKeyEnableIdentityPatch: "true", SettingKeyEnableIdentityPatch: "true",
SettingKeyIdentityPatchPrompt: "", SettingKeyIdentityPatchPrompt: "",
// Ops monitoring defaults (vNext)
SettingKeyOpsMonitoringEnabled: "true",
SettingKeyOpsRealtimeMonitoringEnabled: "true",
SettingKeyOpsQueryModeDefault: "auto",
SettingKeyOpsMetricsIntervalSeconds: "60",
} }
return s.settingRepo.SetMultiple(ctx, defaults) return s.settingRepo.SetMultiple(ctx, defaults)
...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin ...@@ -397,100 +411,33 @@ func (s *SettingService) parseSettings(settings map[string]string) *SystemSettin
} }
result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt] result.IdentityPatchPrompt = settings[SettingKeyIdentityPatchPrompt]
return result // Ops monitoring settings (default: enabled, fail-open)
} result.OpsMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsMonitoringEnabled])
result.OpsRealtimeMonitoringEnabled = !isFalseSettingValue(settings[SettingKeyOpsRealtimeMonitoringEnabled])
// GetLinuxDoConnectOAuthConfig 返回用于登录的“最终生效” LinuxDo Connect 配置。 result.OpsQueryModeDefault = string(ParseOpsQueryMode(settings[SettingKeyOpsQueryModeDefault]))
// result.OpsMetricsIntervalSeconds = 60
// 优先级: if raw := strings.TrimSpace(settings[SettingKeyOpsMetricsIntervalSeconds]); raw != "" {
// - 若对应系统设置键存在,则覆盖 config.yaml/env 的值 if v, err := strconv.Atoi(raw); err == nil {
// - 否则回退到 config.yaml/env 的值 if v < 60 {
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) { v = 60
if s == nil || s.cfg == nil { }
return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded") if v > 3600 {
} v = 3600
}
effective := s.cfg.LinuxDo result.OpsMetricsIntervalSeconds = v
}
keys := []string{
SettingKeyLinuxDoConnectEnabled,
SettingKeyLinuxDoConnectClientID,
SettingKeyLinuxDoConnectClientSecret,
SettingKeyLinuxDoConnectRedirectURL,
}
settings, err := s.settingRepo.GetMultiple(ctx, keys)
if err != nil {
return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
}
if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
effective.Enabled = raw == "true"
}
if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
effective.ClientID = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
effective.ClientSecret = strings.TrimSpace(v)
}
if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
effective.RedirectURL = strings.TrimSpace(v)
}
if !effective.Enabled {
return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
}
// 基础健壮性校验(避免把用户重定向到一个必然失败或不安全的 OAuth 流程里)。
if strings.TrimSpace(effective.ClientID) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
}
if strings.TrimSpace(effective.AuthorizeURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
}
if strings.TrimSpace(effective.TokenURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
}
if strings.TrimSpace(effective.UserInfoURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
}
if strings.TrimSpace(effective.RedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
}
if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
} }
if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil { return result
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid") }
}
if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
}
if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
}
if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
}
method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod)) func isFalseSettingValue(value string) bool {
switch method { switch strings.ToLower(strings.TrimSpace(value)) {
case "", "client_secret_post", "client_secret_basic": case "false", "0", "off", "disabled":
if strings.TrimSpace(effective.ClientSecret) == "" { return true
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
}
case "none":
if !effective.UsePKCE {
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
}
default: default:
return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid") return false
} }
return effective, nil
} }
// getStringOrDefault 获取字符串值或默认值 // getStringOrDefault 获取字符串值或默认值
...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string) ...@@ -635,3 +582,96 @@ func (s *SettingService) GetFallbackModel(ctx context.Context, platform string)
} }
return value return value
} }
// GetLinuxDoConnectOAuthConfig returns the effective LinuxDo Connect
// configuration used for login.
//
// Precedence:
//   - if the corresponding system-setting key exists, it overrides the config.yaml/env value
//   - otherwise the config.yaml/env value is used as-is
//
// It returns a NotFound error when the feature is disabled, and an
// InternalServer OAUTH_CONFIG_INVALID error when any required field is
// missing or malformed.
func (s *SettingService) GetLinuxDoConnectOAuthConfig(ctx context.Context) (config.LinuxDoConnectConfig, error) {
	if s == nil || s.cfg == nil {
		return config.LinuxDoConnectConfig{}, infraerrors.ServiceUnavailable("CONFIG_NOT_READY", "config not loaded")
	}
	// Start from the file/env configuration; DB settings override below.
	effective := s.cfg.LinuxDo
	keys := []string{
		SettingKeyLinuxDoConnectEnabled,
		SettingKeyLinuxDoConnectClientID,
		SettingKeyLinuxDoConnectClientSecret,
		SettingKeyLinuxDoConnectRedirectURL,
	}
	settings, err := s.settingRepo.GetMultiple(ctx, keys)
	if err != nil {
		return config.LinuxDoConnectConfig{}, fmt.Errorf("get linuxdo connect settings: %w", err)
	}
	// Enabled is overridden whenever the key exists; string values are only
	// taken when non-blank so an empty DB value falls back to config/env.
	if raw, ok := settings[SettingKeyLinuxDoConnectEnabled]; ok {
		effective.Enabled = raw == "true"
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientID]; ok && strings.TrimSpace(v) != "" {
		effective.ClientID = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectClientSecret]; ok && strings.TrimSpace(v) != "" {
		effective.ClientSecret = strings.TrimSpace(v)
	}
	if v, ok := settings[SettingKeyLinuxDoConnectRedirectURL]; ok && strings.TrimSpace(v) != "" {
		effective.RedirectURL = strings.TrimSpace(v)
	}
	if !effective.Enabled {
		return config.LinuxDoConnectConfig{}, infraerrors.NotFound("OAUTH_DISABLED", "oauth login is disabled")
	}
	// Basic sanity validation (avoid redirecting the user into an OAuth flow
	// that is guaranteed to fail or is unsafe).
	if strings.TrimSpace(effective.ClientID) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client id not configured")
	}
	if strings.TrimSpace(effective.AuthorizeURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url not configured")
	}
	if strings.TrimSpace(effective.TokenURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url not configured")
	}
	if strings.TrimSpace(effective.UserInfoURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url not configured")
	}
	if strings.TrimSpace(effective.RedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url not configured")
	}
	if strings.TrimSpace(effective.FrontendRedirectURL) == "" {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url not configured")
	}
	// Structural URL validation after the presence checks above.
	if err := config.ValidateAbsoluteHTTPURL(effective.AuthorizeURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth authorize url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.TokenURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.UserInfoURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth userinfo url invalid")
	}
	if err := config.ValidateAbsoluteHTTPURL(effective.RedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth redirect url invalid")
	}
	if err := config.ValidateFrontendRedirectURL(effective.FrontendRedirectURL); err != nil {
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth frontend redirect url invalid")
	}
	// Token endpoint auth method: secret-based methods require a client secret;
	// "none" requires PKCE to be enabled.
	method := strings.ToLower(strings.TrimSpace(effective.TokenAuthMethod))
	switch method {
	case "", "client_secret_post", "client_secret_basic":
		if strings.TrimSpace(effective.ClientSecret) == "" {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth client secret not configured")
		}
	case "none":
		if !effective.UsePKCE {
			return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth pkce must be enabled when token_auth_method=none")
		}
	default:
		return config.LinuxDoConnectConfig{}, infraerrors.InternalServer("OAUTH_CONFIG_INVALID", "oauth token_auth_method invalid")
	}
	return effective, nil
}
...@@ -18,7 +18,7 @@ type SystemSettings struct { ...@@ -18,7 +18,7 @@ type SystemSettings struct {
TurnstileSecretKey string TurnstileSecretKey string
TurnstileSecretKeyConfigured bool TurnstileSecretKeyConfigured bool
// LinuxDo Connect OAuth 登录(终端用户 SSO) // LinuxDo Connect OAuth 登录
LinuxDoConnectEnabled bool LinuxDoConnectEnabled bool
LinuxDoConnectClientID string LinuxDoConnectClientID string
LinuxDoConnectClientSecret string LinuxDoConnectClientSecret string
...@@ -46,6 +46,12 @@ type SystemSettings struct { ...@@ -46,6 +46,12 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"` EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"` IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool
OpsRealtimeMonitoringEnabled bool
OpsQueryModeDefault string
OpsMetricsIntervalSeconds int
} }
type PublicSettings struct { type PublicSettings struct {
......
package service package service
import ( import (
"database/sql"
"time" "time"
"github.com/Wei-Shaw/sub2api/internal/config" "github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/wire" "github.com/google/wire"
"github.com/redis/go-redis/v9"
) )
// BuildInfo contains build information // BuildInfo contains build information
...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi ...@@ -84,6 +86,72 @@ func ProvideConcurrencyService(cache ConcurrencyCache, accountRepo AccountReposi
return svc return svc
} }
// ProvideOpsMetricsCollector creates an OpsMetricsCollector and starts it
// immediately so collection begins as soon as the dependency graph is wired.
func ProvideOpsMetricsCollector(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	accountRepo AccountRepository,
	concurrencyService *ConcurrencyService,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsMetricsCollector {
	collector := NewOpsMetricsCollector(opsRepo, settingRepo, accountRepo, concurrencyService, db, redisClient, cfg)
	collector.Start()
	return collector
}
// ProvideOpsAggregationService creates the hourly/daily pre-aggregation
// service and starts it before handing it to the wire graph.
func ProvideOpsAggregationService(
	opsRepo OpsRepository,
	settingRepo SettingRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAggregationService {
	aggregation := NewOpsAggregationService(opsRepo, settingRepo, db, redisClient, cfg)
	aggregation.Start()
	return aggregation
}
// ProvideOpsAlertEvaluatorService creates the alert-rule evaluator and starts
// it before returning it to the wire graph.
func ProvideOpsAlertEvaluatorService(
	opsService *OpsService,
	opsRepo OpsRepository,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsAlertEvaluatorService {
	evaluator := NewOpsAlertEvaluatorService(opsService, opsRepo, emailService, redisClient, cfg)
	evaluator.Start()
	return evaluator
}
// ProvideOpsCleanupService creates the cron-scheduled cleanup service and
// starts it before returning it to the wire graph.
func ProvideOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsCleanupService {
	cleanup := NewOpsCleanupService(opsRepo, db, redisClient, cfg)
	cleanup.Start()
	return cleanup
}
// ProvideOpsScheduledReportService creates the scheduled-report service and
// starts it before returning it to the wire graph.
func ProvideOpsScheduledReportService(
	opsService *OpsService,
	userService *UserService,
	emailService *EmailService,
	redisClient *redis.Client,
	cfg *config.Config,
) *OpsScheduledReportService {
	reports := NewOpsScheduledReportService(opsService, userService, emailService, redisClient, cfg)
	reports.Start()
	return reports
}
// ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力 // ProvideAPIKeyAuthCacheInvalidator 提供 API Key 认证缓存失效能力
func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator { func ProvideAPIKeyAuthCacheInvalidator(apiKeyService *APIKeyService) APIKeyAuthCacheInvalidator {
return apiKeyService return apiKeyService
...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet( ...@@ -122,6 +190,12 @@ var ProviderSet = wire.NewSet(
NewAccountUsageService, NewAccountUsageService,
NewAccountTestService, NewAccountTestService,
NewSettingService, NewSettingService,
NewOpsService,
ProvideOpsMetricsCollector,
ProvideOpsAggregationService,
ProvideOpsAlertEvaluatorService,
ProvideOpsCleanupService,
ProvideOpsScheduledReportService,
NewEmailService, NewEmailService,
ProvideEmailQueueService, ProvideEmailQueueService,
NewTurnstileService, NewTurnstileService,
......
-- Ops Monitoring (vNext): squashed migration (030)
--
-- This repository originally planned Ops vNext as migrations 030-036:
-- 030 drop legacy ops tables
-- 031 core schema
-- 032 pre-aggregation tables
-- 033 indexes + optional extensions
-- 034 add avg/max to preagg
-- 035 add notify_email to alert rules
-- 036 seed default alert rules
--
-- Since these migrations have NOT been applied to any environment yet, we squash them
-- into a single 030 migration for easier review and a cleaner migration history.
--
-- Notes:
-- - This is intentionally destructive for ops_* data (error logs / metrics / alerts).
-- - It is idempotent (DROP/CREATE/ALTER IF EXISTS/IF NOT EXISTS), but will wipe ops_* data if re-run.
-- =====================================================================
-- 030_ops_drop_legacy_ops_tables.sql
-- =====================================================================
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Legacy pre-aggregation tables (from 026 and/or previous branches)
DROP TABLE IF EXISTS ops_metrics_daily CASCADE;
DROP TABLE IF EXISTS ops_metrics_hourly CASCADE;
-- Core ops tables that may exist in some deployments / branches
DROP TABLE IF EXISTS ops_system_metrics CASCADE;
DROP TABLE IF EXISTS ops_error_logs CASCADE;
DROP TABLE IF EXISTS ops_alert_events CASCADE;
DROP TABLE IF EXISTS ops_alert_rules CASCADE;
DROP TABLE IF EXISTS ops_job_heartbeats CASCADE;
DROP TABLE IF EXISTS ops_retry_attempts CASCADE;
-- Optional legacy tables (best-effort cleanup)
DROP TABLE IF EXISTS ops_scheduled_reports CASCADE;
DROP TABLE IF EXISTS ops_group_availability_configs CASCADE;
DROP TABLE IF EXISTS ops_group_availability_events CASCADE;
-- Optional legacy views/indexes
DROP VIEW IF EXISTS ops_latest_metrics CASCADE;
-- =====================================================================
-- 031_ops_core_schema.sql
-- =====================================================================
-- Ops Monitoring (vNext): core schema (errors / retries / metrics / jobs / alerts)
--
-- Design goals:
-- - Support global filtering (time/platform/group) across all ops modules.
-- - Persist enough context for two retry modes (client retry / pinned upstream retry).
-- - Make ops background jobs observable via job heartbeats.
-- - Keep schema stable and indexes targeted (high-write tables).
--
-- Notes:
-- - This migration is idempotent.
-- - ops_* tables intentionally avoid strict foreign keys to reduce write amplification/locks.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: error log details (high-write)
-- ============================================
-- Intentionally no foreign keys (high-write table); *_id columns are plain
-- BIGINTs correlated in the application layer.
CREATE TABLE IF NOT EXISTS ops_error_logs (
    id BIGSERIAL PRIMARY KEY,
    -- Correlation / identities
    request_id VARCHAR(64),
    client_request_id VARCHAR(64),
    user_id BIGINT,
    api_key_id BIGINT,
    account_id BIGINT,
    group_id BIGINT,
    client_ip inet,
    -- Dimensions for global filtering
    platform VARCHAR(32),
    -- Request metadata
    model VARCHAR(100),
    request_path VARCHAR(256),
    stream BOOLEAN NOT NULL DEFAULT false,
    user_agent TEXT,
    -- Core error classification
    error_phase VARCHAR(32) NOT NULL,
    error_type VARCHAR(64) NOT NULL,
    severity VARCHAR(8) NOT NULL DEFAULT 'P2',
    status_code INT,
    -- vNext metric semantics
    is_business_limited BOOLEAN NOT NULL DEFAULT false,
    -- Error details (sanitized/truncated at ingest time)
    error_message TEXT,
    error_body TEXT,
    -- Provider/upstream details (optional; useful for trends & account health)
    error_source VARCHAR(64),
    error_owner VARCHAR(32),
    account_status VARCHAR(50),
    upstream_status_code INT,
    upstream_error_message TEXT,
    upstream_error_detail TEXT,
    provider_error_code VARCHAR(64),
    provider_error_type VARCHAR(64),
    network_error_type VARCHAR(50),
    retry_after_seconds INT,
    -- Timings (ms) - optional
    duration_ms INT,
    time_to_first_token_ms BIGINT,
    auth_latency_ms BIGINT,
    routing_latency_ms BIGINT,
    upstream_latency_ms BIGINT,
    response_latency_ms BIGINT,
    -- Retry context (only stored for error requests)
    request_body JSONB,
    request_headers JSONB,
    request_body_truncated BOOLEAN NOT NULL DEFAULT false,
    request_body_bytes INT,
    -- Retryability flags (best-effort classification)
    is_retryable BOOLEAN NOT NULL DEFAULT false,
    retry_count INT NOT NULL DEFAULT 0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_error_logs IS 'Ops error logs (vNext). Stores sanitized error details and request_body for retries (errors only).';
-- ============================================
-- 2) ops_retry_attempts: audit log for retries
-- ============================================
CREATE TABLE IF NOT EXISTS ops_retry_attempts (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    requested_by_user_id BIGINT,
    -- ops_error_logs.id that this retry originated from (no FK by design)
    source_error_id BIGINT,
    -- client|upstream
    mode VARCHAR(16) NOT NULL,
    -- account forced for "upstream" mode retries
    pinned_account_id BIGINT,
    -- queued|running|succeeded|failed
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    started_at TIMESTAMPTZ,
    finished_at TIMESTAMPTZ,
    duration_ms BIGINT,
    -- Optional result correlation
    result_request_id VARCHAR(64),
    result_error_id BIGINT,
    result_usage_request_id VARCHAR(64),
    error_message TEXT
);
COMMENT ON TABLE ops_retry_attempts IS 'Audit table for ops retries (client retry / pinned upstream retry).';
-- ============================================
-- 3) ops_system_metrics: system + request window snapshots
-- ============================================
CREATE TABLE IF NOT EXISTS ops_system_metrics (
    id BIGSERIAL PRIMARY KEY,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Length of the aggregation window this row summarizes
    window_minutes INT NOT NULL DEFAULT 1,
    -- Optional dimensions (only if collector chooses to write per-dimension snapshots)
    platform VARCHAR(32),
    group_id BIGINT,
    -- Core counts
    success_count BIGINT NOT NULL DEFAULT 0,
    error_count_total BIGINT NOT NULL DEFAULT 0,
    business_limited_count BIGINT NOT NULL DEFAULT 0,
    error_count_sla BIGINT NOT NULL DEFAULT 0,
    upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
    upstream_429_count BIGINT NOT NULL DEFAULT 0,
    upstream_529_count BIGINT NOT NULL DEFAULT 0,
    token_consumed BIGINT NOT NULL DEFAULT 0,
    -- Rates
    qps DOUBLE PRECISION,
    tps DOUBLE PRECISION,
    -- Duration percentiles (ms) - success requests
    duration_p50_ms INT,
    duration_p90_ms INT,
    duration_p95_ms INT,
    duration_p99_ms INT,
    duration_avg_ms DOUBLE PRECISION,
    duration_max_ms INT,
    -- TTFT percentiles (ms) - success requests (streaming)
    ttft_p50_ms INT,
    ttft_p90_ms INT,
    ttft_p95_ms INT,
    ttft_p99_ms INT,
    ttft_avg_ms DOUBLE PRECISION,
    ttft_max_ms INT,
    -- System resources
    cpu_usage_percent DOUBLE PRECISION,
    memory_used_mb BIGINT,
    memory_total_mb BIGINT,
    memory_usage_percent DOUBLE PRECISION,
    -- Dependency health (best-effort)
    db_ok BOOLEAN,
    redis_ok BOOLEAN,
    -- DB pool & runtime
    db_conn_active INT,
    db_conn_idle INT,
    db_conn_waiting INT,
    goroutine_count INT,
    -- Queue / concurrency
    concurrency_queue_depth INT
);
COMMENT ON TABLE ops_system_metrics IS 'Ops system/request metrics snapshots (vNext). Used for dashboard overview and realtime rates.';
-- ============================================
-- 4) ops_job_heartbeats: background jobs health
-- ============================================
-- One row per job, keyed by job name; upserted on each run.
CREATE TABLE IF NOT EXISTS ops_job_heartbeats (
    job_name VARCHAR(64) PRIMARY KEY,
    last_run_at TIMESTAMPTZ,
    last_success_at TIMESTAMPTZ,
    last_error_at TIMESTAMPTZ,
    last_error TEXT,
    last_duration_ms BIGINT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
COMMENT ON TABLE ops_job_heartbeats IS 'Ops background jobs heartbeats (vNext).';
-- ============================================
-- 5) ops_alert_rules / ops_alert_events
-- ============================================
CREATE TABLE IF NOT EXISTS ops_alert_rules (
    id BIGSERIAL PRIMARY KEY,
    name VARCHAR(128) NOT NULL,
    description TEXT,
    enabled BOOLEAN NOT NULL DEFAULT true,
    severity VARCHAR(16) NOT NULL DEFAULT 'warning',
    -- Metric definition
    metric_type VARCHAR(64) NOT NULL,
    operator VARCHAR(8) NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,
    window_minutes INT NOT NULL DEFAULT 5,
    sustained_minutes INT NOT NULL DEFAULT 5,
    cooldown_minutes INT NOT NULL DEFAULT 10,
    -- Optional scoping: platform/group filters etc.
    filters JSONB,
    last_triggered_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Rule names are unique so seeding/upserts can key on name.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_alert_rules_name_unique
ON ops_alert_rules (name);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled
ON ops_alert_rules (enabled);
-- Fired/resolved alert occurrences produced by evaluating ops_alert_rules.
CREATE TABLE IF NOT EXISTS ops_alert_events (
    id BIGSERIAL PRIMARY KEY,
    -- ops_alert_rules.id (no FK by design)
    rule_id BIGINT,
    severity VARCHAR(16) NOT NULL,
    status VARCHAR(16) NOT NULL DEFAULT 'firing',
    title VARCHAR(200),
    description TEXT,
    metric_value DOUBLE PRECISION,
    threshold_value DOUBLE PRECISION,
    dimensions JSONB,
    fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    resolved_at TIMESTAMPTZ,
    email_sent BOOLEAN NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status
ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at
ON ops_alert_events (fired_at DESC);
-- =====================================================================
-- 032_ops_preaggregation_tables.sql
-- =====================================================================
-- Ops Monitoring (vNext): pre-aggregation tables
--
-- Purpose:
-- - Provide stable query performance for 1–24h windows (and beyond), avoiding expensive
-- percentile_cont scans on raw logs for every dashboard refresh.
-- - Support global filter dimensions: overall / platform / group.
--
-- Design note:
-- - We keep a single table with nullable platform/group_id, and enforce uniqueness via a
-- COALESCE-based unique index (because UNIQUE with NULLs allows duplicates in Postgres).
-- NOTE(review): SET LOCAL is a no-op outside a transaction block — confirm the migration runner wraps each file in one.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_metrics_hourly
-- ============================================
CREATE TABLE IF NOT EXISTS ops_metrics_hourly (
id BIGSERIAL PRIMARY KEY,
bucket_start TIMESTAMPTZ NOT NULL,
-- Dimension columns (see design note): both NULL = overall; platform set = per-platform; group_id set = per-group.
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
-- Duration percentiles (ms)
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
-- TTFT percentiles (ms)
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Uniqueness across three “dimension modes” (overall / platform / group).
-- Postgres UNIQUE treats NULLs as distinct, so we enforce uniqueness via COALESCE.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_hourly_unique_dim
ON ops_metrics_hourly (
bucket_start,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_bucket
ON ops_metrics_hourly (bucket_start DESC);
-- Partial indexes matching the two non-overall dimension modes.
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_platform_bucket
ON ops_metrics_hourly (platform, bucket_start DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_hourly_group_bucket
ON ops_metrics_hourly (group_id, bucket_start DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_hourly IS 'vNext hourly pre-aggregated ops metrics (overall/platform/group).';
-- ============================================
-- 2) ops_metrics_daily (optional; for longer windows)
-- ============================================
-- Same counters and dimension rules as ops_metrics_hourly, bucketed by calendar date.
CREATE TABLE IF NOT EXISTS ops_metrics_daily (
id BIGSERIAL PRIMARY KEY,
bucket_date DATE NOT NULL,
platform VARCHAR(32),
group_id BIGINT,
success_count BIGINT NOT NULL DEFAULT 0,
error_count_total BIGINT NOT NULL DEFAULT 0,
business_limited_count BIGINT NOT NULL DEFAULT 0,
error_count_sla BIGINT NOT NULL DEFAULT 0,
upstream_error_count_excl_429_529 BIGINT NOT NULL DEFAULT 0,
upstream_429_count BIGINT NOT NULL DEFAULT 0,
upstream_529_count BIGINT NOT NULL DEFAULT 0,
token_consumed BIGINT NOT NULL DEFAULT 0,
duration_p50_ms INT,
duration_p90_ms INT,
duration_p95_ms INT,
duration_p99_ms INT,
ttft_p50_ms INT,
ttft_p90_ms INT,
ttft_p95_ms INT,
ttft_p99_ms INT,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- COALESCE-based uniqueness (NULL dims normalize to ''/0), same rationale as the hourly table.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_metrics_daily_unique_dim
ON ops_metrics_daily (
bucket_date,
COALESCE(platform, ''),
COALESCE(group_id, 0)
);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_bucket
ON ops_metrics_daily (bucket_date DESC);
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_platform_bucket
ON ops_metrics_daily (platform, bucket_date DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_metrics_daily_group_bucket
ON ops_metrics_daily (group_id, bucket_date DESC)
WHERE group_id IS NOT NULL AND group_id <> 0;
COMMENT ON TABLE ops_metrics_daily IS 'vNext daily pre-aggregated ops metrics (overall/platform/group).';
-- =====================================================================
-- 033_ops_indexes_and_extensions.sql
-- =====================================================================
-- Ops Monitoring (vNext): indexes and optional extensions
--
-- This migration intentionally keeps "optional" objects (like pg_trgm) best-effort,
-- so environments without extension privileges won't fail the whole migration chain.
-- NOTE(review): SET LOCAL is a no-op outside a transaction block — confirm the migration runner wraps each file in one.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) Core btree indexes (always safe)
-- ============================================
-- ops_error_logs: time-ordered listing plus per-dimension filtered scans.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at
ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform_time
ON ops_error_logs (platform, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_group_time
ON ops_error_logs (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_account_time
ON ops_error_logs (account_id, created_at DESC)
WHERE account_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_status_time
ON ops_error_logs (status_code, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_time
ON ops_error_logs (error_phase, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_type_time
ON ops_error_logs (error_type, created_at DESC);
-- Exact-match lookups by correlation ids (trigram variants for fuzzy search are created below).
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id
ON ops_error_logs (request_id);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id
ON ops_error_logs (client_request_id);
-- ops_system_metrics
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at
ON ops_system_metrics (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
ON ops_system_metrics (window_minutes, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_platform_time
ON ops_system_metrics (platform, created_at DESC)
WHERE platform IS NOT NULL AND platform <> '' AND group_id IS NULL;
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_group_time
ON ops_system_metrics (group_id, created_at DESC)
WHERE group_id IS NOT NULL;
-- ops_retry_attempts
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_created_at
ON ops_retry_attempts (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_source_error
ON ops_retry_attempts (source_error_id, created_at DESC)
WHERE source_error_id IS NOT NULL;
-- Prevent concurrent retries for the same ops_error_logs row (race-free, multi-instance safe).
-- Partial unique index: at most one 'queued'/'running' attempt per source error at a time.
CREATE UNIQUE INDEX IF NOT EXISTS idx_ops_retry_attempts_unique_active
ON ops_retry_attempts (source_error_id)
WHERE source_error_id IS NOT NULL AND status IN ('queued', 'running');
-- ============================================
-- 2) Optional: pg_trgm + trigram indexes for fuzzy search
-- ============================================
-- Best-effort: extension creation failures are downgraded to a NOTICE, and the trigram
-- indexes are only created when pg_trgm is confirmed installed.
DO $$
BEGIN
BEGIN
CREATE EXTENSION IF NOT EXISTS pg_trgm;
EXCEPTION WHEN OTHERS THEN
-- Missing privileges or extension package should not block migrations.
RAISE NOTICE 'pg_trgm extension not created: %', SQLERRM;
END;
IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_trgm') THEN
-- request_id / client_request_id fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_request_id_trgm
ON ops_error_logs USING gin (request_id gin_trgm_ops)';
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_client_request_id_trgm
ON ops_error_logs USING gin (client_request_id gin_trgm_ops)';
-- error_message fuzzy search
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_ops_error_logs_error_message_trgm
ON ops_error_logs USING gin (error_message gin_trgm_ops)';
END IF;
END $$;
-- =====================================================================
-- 034_ops_preaggregation_add_avg_max.sql
-- =====================================================================
-- Ops Monitoring (vNext): extend pre-aggregation tables with avg/max latency fields
--
-- Why:
-- - The dashboard overview returns avg/max for duration/TTFT.
-- - Hourly/daily pre-aggregation tables originally stored only p50/p90/p95/p99, which makes
-- it impossible to answer avg/max in preagg mode without falling back to raw scans.
--
-- This migration is idempotent and safe to run multiple times.
--
-- NOTE: We keep the existing p50/p90/p95/p99 columns as-is; these are still used for
-- approximate long-window summaries.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Hourly table: averages are fractional (DOUBLE PRECISION); maxima are whole milliseconds (INT).
ALTER TABLE ops_metrics_hourly
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- Daily table
ALTER TABLE ops_metrics_daily
ADD COLUMN IF NOT EXISTS duration_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS duration_max_ms INT,
ADD COLUMN IF NOT EXISTS ttft_avg_ms DOUBLE PRECISION,
ADD COLUMN IF NOT EXISTS ttft_max_ms INT;
-- =====================================================================
-- 035_ops_alert_rules_notify_email.sql
-- =====================================================================
-- Ops Monitoring (vNext): alert rule notify settings
--
-- Adds notify_email flag to ops_alert_rules to keep UI parity with the backup Ops dashboard.
-- Migration is idempotent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- Default true: existing rules keep email notifications enabled after upgrade.
ALTER TABLE ops_alert_rules
ADD COLUMN IF NOT EXISTS notify_email BOOLEAN NOT NULL DEFAULT true;
-- =====================================================================
-- 036_ops_seed_default_alert_rules.sql
-- =====================================================================
-- Ops Monitoring (vNext): seed default alert rules (idempotent)
--
-- Goal:
-- - Provide "out of the box" alert rules so the Ops dashboard can immediately show alert events.
-- - Keep inserts idempotent via ON CONFLICT (name) DO NOTHING.
--
-- Notes:
-- - Thresholds are intentionally conservative defaults and should be tuned per deployment.
-- - Metric semantics follow vNext:
-- - success_rate / error_rate are based on SLA-scope counts (exclude is_business_limited).
-- - upstream_error_rate excludes 429/529.
-- NOTE(review): severities seeded here are 'P0'/'P1'/'P2' while ops_alert_rules.severity
-- defaults to 'warning' — confirm the accepted severity value set is consistent.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- 1) High error rate (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'错误率过高',
'当错误率超过 5% 且持续 5 分钟时触发告警',
true, 'error_rate', '>', 5.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 2) Low success rate (P0)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'成功率过低',
'当成功率低于 95% 且持续 5 分钟时触发告警(服务可用性下降)',
true, 'success_rate', '<', 95.0, 5, 5, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 3) P99 latency too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'P99延迟过高',
'当 P99 延迟超过 3000ms 且持续 10 分钟时触发告警',
true, 'p99_latency_ms', '>', 3000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 4) P95 latency too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'P95延迟过高',
'当 P95 延迟超过 2000ms 且持续 10 分钟时触发告警',
true, 'p95_latency_ms', '>', 2000.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 5) CPU usage too high (P2)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'CPU使用率过高',
'当 CPU 使用率超过 85% 且持续 10 分钟时触发告警',
true, 'cpu_usage_percent', '>', 85.0, 5, 10, 'P2', true, 30, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 6) Memory usage too high (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'内存使用率过高',
'当内存使用率超过 90% 且持续 10 分钟时触发告警(可能导致 OOM)',
true, 'memory_usage_percent', '>', 90.0, 5, 10, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 7) Concurrency queue buildup (P1)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'并发队列积压',
'当并发队列深度超过 100 且持续 5 分钟时触发告警(系统处理能力不足)',
true, 'concurrency_queue_depth', '>', 100.0, 5, 5, 'P1', true, 20, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- 8) Extremely high error rate (P0)
INSERT INTO ops_alert_rules (
name, description, enabled, metric_type, operator, threshold,
window_minutes, sustained_minutes, severity, notify_email, cooldown_minutes,
created_at, updated_at
) VALUES (
'错误率极高',
'当错误率超过 20% 且持续 1 分钟时触发告警(服务严重异常)',
true, 'error_rate', '>', 20.0, 1, 1, 'P0', true, 15, NOW(), NOW()
) ON CONFLICT (name) DO NOTHING;
-- Two small follow-up migrations: Redis pool stats columns, then upstream error events column.
-- Ops Monitoring vNext: add Redis pool stats fields to system metrics snapshots.
-- This migration is intentionally idempotent.
ALTER TABLE ops_system_metrics
ADD COLUMN IF NOT EXISTS redis_conn_total INT,
ADD COLUMN IF NOT EXISTS redis_conn_idle INT;
COMMENT ON COLUMN ops_system_metrics.redis_conn_total IS 'Redis pool total connections (go-redis PoolStats.TotalConns).';
COMMENT ON COLUMN ops_system_metrics.redis_conn_idle IS 'Redis pool idle connections (go-redis PoolStats.IdleConns).';
-- Add upstream error events list (JSONB) to ops_error_logs for per-request correlation.
--
-- This is intentionally idempotent.
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS upstream_errors JSONB;
COMMENT ON COLUMN ops_error_logs.upstream_errors IS
'Sanitized upstream error events list (JSON array), correlated per gateway request (request_id/client_request_id); used for per-request upstream debugging.';
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,41 @@ redis: ...@@ -302,6 +302,41 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Hard switch: disable all ops background jobs and APIs when false
# 硬开关:为 false 时禁用所有 Ops 后台任务与接口
enabled: true
# Prefer pre-aggregated tables (ops_metrics_hourly/ops_metrics_daily) for long-window dashboard queries.
# 优先使用预聚合表(用于长时间窗口查询性能)
use_preaggregated_tables: false
# Data cleanup configuration
# 数据清理配置(vNext 默认统一保留 30 天)
cleanup:
enabled: true
# Cron expression (minute hour dom month dow), e.g. "0 2 * * *" = daily at 2 AM
# Cron 表达式(分 时 日 月 周),例如 "0 2 * * *" = 每天凌晨 2 点
schedule: "0 2 * * *"
error_log_retention_days: 30
minute_metrics_retention_days: 30
hourly_metrics_retention_days: 30
# Pre-aggregation configuration
# 预聚合任务配置
aggregation:
enabled: true
# OpsMetricsCollector Redis cache (reduces duplicate expensive window aggregation in multi-replica deployments)
# 指标采集 Redis 缓存(多副本部署时减少重复计算)
metrics_collector_cache:
enabled: true
ttl: 65s
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES= ...@@ -151,6 +151,15 @@ GEMINI_OAUTH_SCOPES=
# GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}} # GEMINI_QUOTA_POLICY={"tiers":{"LEGACY":{"pro_rpd":50,"flash_rpd":1500,"cooldown_minutes":30},"PRO":{"pro_rpd":1500,"flash_rpd":4000,"cooldown_minutes":5},"ULTRA":{"pro_rpd":2000,"flash_rpd":0,"cooldown_minutes":5}}}
GEMINI_QUOTA_POLICY= GEMINI_QUOTA_POLICY=
# -----------------------------------------------------------------------------
# Ops Monitoring Configuration (运维监控配置)
# -----------------------------------------------------------------------------
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
OPS_ENABLED=true
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Update Configuration (在线更新配置) # Update Configuration (在线更新配置)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
......
...@@ -159,7 +159,7 @@ gateway: ...@@ -159,7 +159,7 @@ gateway:
max_line_size: 41943040 max_line_size: 41943040
# Log upstream error response body summary (safe/truncated; does not log request content) # Log upstream error response body summary (safe/truncated; does not log request content)
# 记录上游错误响应体摘要(安全/截断;不记录请求内容) # 记录上游错误响应体摘要(安全/截断;不记录请求内容)
log_upstream_error_body: false log_upstream_error_body: true
# Max bytes to log from upstream error body # Max bytes to log from upstream error body
# 记录上游错误响应体的最大字节数 # 记录上游错误响应体的最大字节数
log_upstream_error_body_max_bytes: 2048 log_upstream_error_body_max_bytes: 2048
...@@ -302,6 +302,19 @@ redis: ...@@ -302,6 +302,19 @@ redis:
# 数据库编号(0-15) # 数据库编号(0-15)
db: 0 db: 0
# =============================================================================
# Ops Monitoring (Optional)
# 运维监控 (可选)
# =============================================================================
ops:
# Enable ops monitoring features (background jobs and APIs)
# 是否启用运维监控功能(后台任务和接口)
# Set to false to hide ops menu in sidebar and disable all ops features
# 设置为 false 可在左侧栏隐藏运维监控菜单并禁用所有运维监控功能
# Other detailed settings (cleanup, aggregation, etc.) are configured in ops settings dialog
# 其他详细设置(数据清理、预聚合等)在运维监控设置对话框中配置
enabled: true
# ============================================================================= # =============================================================================
# JWT Configuration # JWT Configuration
# JWT 配置 # JWT 配置
......
...@@ -17,6 +17,7 @@ import usageAPI from './usage' ...@@ -17,6 +17,7 @@ import usageAPI from './usage'
import geminiAPI from './gemini' import geminiAPI from './gemini'
import antigravityAPI from './antigravity' import antigravityAPI from './antigravity'
import userAttributesAPI from './userAttributes' import userAttributesAPI from './userAttributes'
import opsAPI from './ops'
/** /**
* Unified admin API object for convenient access * Unified admin API object for convenient access
...@@ -35,7 +36,8 @@ export const adminAPI = { ...@@ -35,7 +36,8 @@ export const adminAPI = {
usage: usageAPI, usage: usageAPI,
gemini: geminiAPI, gemini: geminiAPI,
antigravity: antigravityAPI, antigravity: antigravityAPI,
userAttributes: userAttributesAPI userAttributes: userAttributesAPI,
ops: opsAPI
} }
export { export {
...@@ -52,7 +54,8 @@ export { ...@@ -52,7 +54,8 @@ export {
usageAPI, usageAPI,
geminiAPI, geminiAPI,
antigravityAPI, antigravityAPI,
userAttributesAPI userAttributesAPI,
opsAPI
} }
export default adminAPI export default adminAPI
/**
* Admin Ops API endpoints (vNext)
* - Error logs list/detail + retry (client/upstream)
* - Dashboard overview (raw path)
*/
import { apiClient } from '../client'
import type { PaginatedResponse } from '@/types'
/** Retry mode accepted by the ops retry endpoints. */
export type OpsRetryMode = 'client' | 'upstream'
/** Dashboard query mode: automatic selection, raw-log scan, or pre-aggregated tables. */
export type OpsQueryMode = 'auto' | 'raw' | 'preagg'
/** Common per-request options (e.g. cancellation via AbortSignal). */
export interface OpsRequestOptions {
signal?: AbortSignal
}
/** Request body for a retry: which mode, optionally pinning a specific account. */
export interface OpsRetryRequest {
mode: OpsRetryMode
pinned_account_id?: number
}
/** Outcome of a single retry attempt as returned by the backend. */
export interface OpsRetryResult {
attempt_id: number
mode: OpsRetryMode
status: 'running' | 'succeeded' | 'failed' | string
pinned_account_id?: number | null
used_account_id?: number | null
http_status_code: number
upstream_request_id: string
// Truncated preview of the response body (response_truncated tells whether it was cut).
response_preview: string
response_truncated: boolean
error_message: string
started_at: string
finished_at: string
duration_ms: number
}
/** Aggregated dashboard overview for a time window, optionally scoped to platform/group. */
export interface OpsDashboardOverview {
start_time: string
end_time: string
platform: string
group_id?: number | null
health_score?: number
system_metrics?: OpsSystemMetricsSnapshot | null
job_heartbeats?: OpsJobHeartbeat[] | null
success_count: number
error_count_total: number
// Business-limited requests are tracked separately and excluded from SLA-scope counts.
business_limited_count: number
error_count_sla: number
request_count_total: number
request_count_sla: number
token_consumed: number
sla: number
error_rate: number
// Upstream error rate excludes 429/529, which are counted separately below.
upstream_error_rate: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
qps: {
current: number
peak: number
avg: number
}
tps: {
current: number
peak: number
avg: number
}
duration: OpsPercentiles
ttft: OpsPercentiles
}
/** Latency summary in milliseconds; fields are null/absent when not computed. */
export interface OpsPercentiles {
p50_ms?: number | null
p90_ms?: number | null
p95_ms?: number | null
p99_ms?: number | null
avg_ms?: number | null
max_ms?: number | null
}
/** One throughput trend bucket. */
export interface OpsThroughputTrendPoint {
bucket_start: string
request_count: number
token_consumed: number
qps: number
tps: number
}
/** Per-platform throughput breakdown row. */
export interface OpsThroughputPlatformBreakdownItem {
platform: string
request_count: number
token_consumed: number
}
/** Per-group throughput breakdown row. */
export interface OpsThroughputGroupBreakdownItem {
group_id: number
group_name: string
request_count: number
token_consumed: number
}
/** Throughput trend response: bucket granularity label, points, optional breakdowns. */
export interface OpsThroughputTrendResponse {
bucket: string
points: OpsThroughputTrendPoint[]
by_platform?: OpsThroughputPlatformBreakdownItem[]
top_groups?: OpsThroughputGroupBreakdownItem[]
}
/** Outcome kind of a request detail row. */
export type OpsRequestKind = 'success' | 'error'
/** Kind filter for the request-details endpoint ('all' = both kinds). */
export type OpsRequestDetailsKind = OpsRequestKind | 'all'
/** Sort orders supported by the request-details endpoint. */
export type OpsRequestDetailsSort = 'created_at_desc' | 'duration_desc'
/** One request detail row (success or error). */
export interface OpsRequestDetail {
kind: OpsRequestKind
created_at: string
request_id: string
platform?: string
model?: string
duration_ms?: number | null
status_code?: number | null
// Present on error rows; references the ops_error_logs entry for drill-down/retry.
error_id?: number | null
phase?: string
severity?: string
message?: string
user_id?: number | null
api_key_id?: number | null
account_id?: number | null
group_id?: number | null
stream?: boolean
}
/** Query parameters for the request-details list endpoint. */
export interface OpsRequestDetailsParams {
// Preset window shortcut; explicit start_time/end_time may be supplied instead — precedence is server-side.
time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
start_time?: string
end_time?: string
kind?: OpsRequestDetailsKind
platform?: string
group_id?: number | null
user_id?: number
api_key_id?: number
account_id?: number
model?: string
request_id?: string
// Free-text search query.
q?: string
min_duration_ms?: number
max_duration_ms?: number
sort?: OpsRequestDetailsSort
page?: number
page_size?: number
}
/** Paginated request-details response. */
export type OpsRequestDetailsResponse = PaginatedResponse<OpsRequestDetail>
/** One latency histogram bucket: human-readable range label plus request count. */
export interface OpsLatencyHistogramBucket {
range: string
count: number
}
/** Latency histogram for a time window. */
export interface OpsLatencyHistogramResponse {
start_time: string
end_time: string
platform: string
group_id?: number | null
total_requests: number
buckets: OpsLatencyHistogramBucket[]
}
/** Error trend bucket with the vNext error taxonomy counters. */
export interface OpsErrorTrendPoint {
bucket_start: string
error_count_total: number
business_limited_count: number
error_count_sla: number
upstream_error_count_excl_429_529: number
upstream_429_count: number
upstream_529_count: number
}
/** Error trend response: bucket granularity label plus points. */
export interface OpsErrorTrendResponse {
bucket: string
points: OpsErrorTrendPoint[]
}
/** Error counts grouped by HTTP status code. */
export interface OpsErrorDistributionItem {
status_code: number
total: number
sla: number
business_limited: number
}
/** Error distribution response. */
export interface OpsErrorDistributionResponse {
total: number
items: OpsErrorDistributionItem[]
}
/** Point-in-time system metrics snapshot (mirrors ops_system_metrics rows). */
export interface OpsSystemMetricsSnapshot {
id: number
created_at: string
window_minutes: number
cpu_usage_percent?: number | null
memory_used_mb?: number | null
memory_total_mb?: number | null
memory_usage_percent?: number | null
db_ok?: boolean | null
redis_ok?: boolean | null
// Config-derived limits (best-effort) for rendering "current vs max".
db_max_open_conns?: number | null
redis_pool_size?: number | null
// Redis pool stats (go-redis PoolStats total/idle connections).
redis_conn_total?: number | null
redis_conn_idle?: number | null
db_conn_active?: number | null
db_conn_idle?: number | null
db_conn_waiting?: number | null
goroutine_count?: number | null
concurrency_queue_depth?: number | null
}
/** Background job heartbeat (mirrors ops_job_heartbeats rows). */
export interface OpsJobHeartbeat {
job_name: string
last_run_at?: string | null
last_success_at?: string | null
last_error_at?: string | null
last_error?: string | null
last_duration_ms?: number | null
updated_at: string
}
/** Per-platform concurrency usage. */
export interface PlatformConcurrencyInfo {
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Per-group concurrency usage. */
export interface GroupConcurrencyInfo {
group_id: number
group_name: string
platform: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Per-account concurrency usage. */
export interface AccountConcurrencyInfo {
account_id: number
account_name?: string
platform: string
group_id: number
group_name: string
current_in_use: number
max_capacity: number
load_percentage: number
waiting_in_queue: number
}
/** Concurrency stats maps; keys are strings (presumably platform name / group id / account id — confirm with backend). */
export interface OpsConcurrencyStatsResponse {
enabled: boolean
platform: Record<string, PlatformConcurrencyInfo>
group: Record<string, GroupConcurrencyInfo>
account: Record<string, AccountConcurrencyInfo>
timestamp?: string
}
/**
 * Fetch realtime concurrency statistics (platform/group/account maps).
 *
 * @param platform Optional platform filter; omitted when falsy.
 * @param groupId  Optional group filter; sent only when a positive number.
 */
export async function getConcurrencyStats(platform?: string, groupId?: number | null): Promise<OpsConcurrencyStatsResponse> {
  const params: Record<string, any> = {}
  if (platform) params.platform = platform
  const hasGroupFilter = typeof groupId === 'number' && groupId > 0
  if (hasGroupFilter) params.group_id = groupId
  const response = await apiClient.get<OpsConcurrencyStatsResponse>('/admin/ops/concurrency', { params })
  return response.data
}
/** Per-platform account availability summary. */
export interface PlatformAvailability {
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
/** Per-group account availability summary. */
export interface GroupAvailability {
group_id: number
group_name: string
platform: string
total_accounts: number
available_count: number
rate_limit_count: number
error_count: number
}
/** Per-account availability detail: rate-limit, overload, and error state. */
export interface AccountAvailability {
account_id: number
account_name: string
platform: string
group_id: number
group_name: string
status: string
is_available: boolean
is_rate_limited: boolean
rate_limit_reset_at?: string
rate_limit_remaining_sec?: number
is_overloaded: boolean
overload_until?: string
overload_remaining_sec?: number
has_error: boolean
error_message?: string
}
/** Availability stats maps; keys are strings (presumably platform name / group id / account id — confirm with backend). */
export interface OpsAccountAvailabilityStatsResponse {
enabled: boolean
platform: Record<string, PlatformAvailability>
group: Record<string, GroupAvailability>
account: Record<string, AccountAvailability>
timestamp?: string
}
/**
 * Fetch account availability statistics (platform/group/account maps).
 *
 * @param platform Optional platform filter; omitted when falsy.
 * @param groupId  Optional group filter; sent only when a positive number.
 */
export async function getAccountAvailabilityStats(platform?: string, groupId?: number | null): Promise<OpsAccountAvailabilityStatsResponse> {
  const params: Record<string, any> = {}
  if (platform) params.platform = platform
  if (typeof groupId === 'number' && groupId > 0) params.group_id = groupId
  const res = await apiClient.get<OpsAccountAvailabilityStatsResponse>('/admin/ops/account-availability', { params })
  return res.data
}
/**
 * Subscribe to realtime QPS updates via WebSocket.
 *
 * Note: browsers cannot set Authorization headers for WebSockets.
 * We authenticate via Sec-WebSocket-Protocol using a prefixed token item:
 * ["sub2api-admin", "jwt.<token>"]
 */
export interface SubscribeQPSOptions {
// Admin JWT; subscribeQPS falls back to localStorage 'auth_token' when omitted.
token?: string | null
onOpen?: () => void
onClose?: (event: CloseEvent) => void
onError?: (event: Event) => void
/**
 * Called when the server closes with an application close code that indicates
 * reconnecting is not useful (e.g. feature flag disabled).
 */
onFatalClose?: (event: CloseEvent) => void
/**
 * More granular status updates for UI (connecting/reconnecting/offline/etc).
 */
onStatusChange?: (status: OpsWSStatus) => void
/**
 * Called when a reconnect is scheduled (helps display "retry in Xs").
 */
onReconnectScheduled?: (info: { attempt: number, delayMs: number }) => void
// Host override; defaults to VITE_WS_BASE_URL or window.location.host.
wsBaseUrl?: string
/**
 * Maximum reconnect attempts. Defaults to Infinity to keep the dashboard live.
 * Set to 0 to disable reconnect.
 */
maxReconnectAttempts?: number
// Backoff tuning: base delay (default 1000ms) and cap (default 30000ms).
reconnectBaseDelayMs?: number
reconnectMaxDelayMs?: number
/**
 * Stale connection detection (heartbeat-by-observation).
 * If no messages are received within this window, the socket is closed to trigger a reconnect.
 * Set to 0 to disable.
 */
staleTimeoutMs?: number
/**
 * How often to check staleness. Only used when `staleTimeoutMs > 0`.
 */
staleCheckIntervalMs?: number
}
/** Connection lifecycle status reported via SubscribeQPSOptions.onStatusChange. */
export type OpsWSStatus = 'connecting' | 'connected' | 'reconnecting' | 'offline' | 'closed'
/** Application-level WebSocket close codes recognized by the client. */
export const OPS_WS_CLOSE_CODES = {
// Server indicates realtime is disabled; the client stops reconnecting on this code.
REALTIME_DISABLED: 4001
} as const
// First subprotocol item; the optional "jwt.<token>" item is appended after it.
const OPS_WS_BASE_PROTOCOL = 'sub2api-admin'
export function subscribeQPS(onMessage: (data: any) => void, options: SubscribeQPSOptions = {}): () => void {
let ws: WebSocket | null = null
let reconnectAttempts = 0
const maxReconnectAttempts = Number.isFinite(options.maxReconnectAttempts as number)
? (options.maxReconnectAttempts as number)
: Infinity
const baseDelayMs = options.reconnectBaseDelayMs ?? 1000
const maxDelayMs = options.reconnectMaxDelayMs ?? 30000
let reconnectTimer: ReturnType<typeof setTimeout> | null = null
let shouldReconnect = true
let isConnecting = false
let hasConnectedOnce = false
let lastMessageAt = 0
const staleTimeoutMs = options.staleTimeoutMs ?? 120_000
const staleCheckIntervalMs = options.staleCheckIntervalMs ?? 30_000
let staleTimer: ReturnType<typeof setInterval> | null = null
const setStatus = (status: OpsWSStatus) => {
options.onStatusChange?.(status)
}
const clearReconnectTimer = () => {
if (reconnectTimer) {
clearTimeout(reconnectTimer)
reconnectTimer = null
}
}
const clearStaleTimer = () => {
if (staleTimer) {
clearInterval(staleTimer)
staleTimer = null
}
}
const startStaleTimer = () => {
clearStaleTimer()
if (!staleTimeoutMs || staleTimeoutMs <= 0) return
staleTimer = setInterval(() => {
if (!shouldReconnect) return
if (!ws || ws.readyState !== WebSocket.OPEN) return
if (!lastMessageAt) return
const ageMs = Date.now() - lastMessageAt
if (ageMs > staleTimeoutMs) {
// Treat as a half-open connection; closing triggers the normal reconnect path.
ws.close()
}
}, staleCheckIntervalMs)
}
const scheduleReconnect = () => {
if (!shouldReconnect) return
if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
// If we're offline, wait for the browser to come back online.
if (typeof navigator !== 'undefined' && 'onLine' in navigator && !navigator.onLine) {
setStatus('offline')
return
}
const expDelay = baseDelayMs * Math.pow(2, reconnectAttempts)
const delay = Math.min(expDelay, maxDelayMs)
const jitter = Math.floor(Math.random() * 250)
clearReconnectTimer()
reconnectTimer = setTimeout(() => {
reconnectAttempts++
connect()
}, delay + jitter)
options.onReconnectScheduled?.({ attempt: reconnectAttempts + 1, delayMs: delay + jitter })
}
const handleOnline = () => {
if (!shouldReconnect) return
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
connect()
}
const handleOffline = () => {
setStatus('offline')
}
// Open a WebSocket to the admin QPS stream and wire up all lifecycle handlers.
// No-op when reconnection is disabled, an attempt is already in flight, a
// socket is already open/connecting, or the reconnect budget is exhausted.
const connect = () => {
  if (!shouldReconnect) return
  if (isConnecting) return
  if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return
  if (hasConnectedOnce && reconnectAttempts >= maxReconnectAttempts) return
  isConnecting = true
  setStatus(hasConnectedOnce ? 'reconnecting' : 'connecting')
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
  const wsBaseUrl = options.wsBaseUrl || import.meta.env.VITE_WS_BASE_URL || window.location.host
  const wsURL = new URL(`${protocol}//${wsBaseUrl}/api/v1/admin/ops/ws/qps`)
  // Do NOT put admin JWT in the URL query string (it can leak via access logs, proxies, etc).
  // Browsers cannot set Authorization headers for WebSockets, so we pass the token via
  // Sec-WebSocket-Protocol (subprotocol list): ["sub2api-admin", "jwt.<token>"].
  const rawToken = String(options.token ?? localStorage.getItem('auth_token') ?? '').trim()
  const protocols: string[] = [OPS_WS_BASE_PROTOCOL]
  if (rawToken) protocols.push(`jwt.${rawToken}`)
  try {
    ws = new WebSocket(wsURL.toString(), protocols)
  } catch (err) {
    // FIX: the WebSocket constructor can throw synchronously (SyntaxError for
    // an invalid URL or an illegal subprotocol token). Previously that left
    // `isConnecting = true` forever, permanently blocking every future
    // connect/reconnect attempt. Reset the flag and fall back to the normal
    // backoff path instead.
    isConnecting = false
    console.error('[OpsWS] Failed to create WebSocket:', err)
    scheduleReconnect()
    return
  }
  ws.onopen = () => {
    // Successful connection: reset backoff state and arm the staleness watchdog.
    reconnectAttempts = 0
    isConnecting = false
    hasConnectedOnce = true
    clearReconnectTimer()
    lastMessageAt = Date.now()
    startStaleTimer()
    setStatus('connected')
    options.onOpen?.()
  }
  ws.onmessage = (e) => {
    try {
      const data = JSON.parse(e.data)
      lastMessageAt = Date.now()
      onMessage(data)
    } catch (err) {
      // Malformed frame: log and drop; do not tear down the connection.
      console.warn('[OpsWS] Failed to parse message:', err)
    }
  }
  ws.onerror = (error) => {
    // onerror is always followed by onclose, which owns the reconnect logic.
    console.error('[OpsWS] Connection error:', error)
    options.onError?.(error)
  }
  ws.onclose = (event) => {
    isConnecting = false
    options.onClose?.(event)
    clearStaleTimer()
    ws = null
    // If the server explicitly tells us to stop reconnecting, honor it.
    if (event && typeof event.code === 'number' && event.code === OPS_WS_CLOSE_CODES.REALTIME_DISABLED) {
      shouldReconnect = false
      clearReconnectTimer()
      setStatus('closed')
      options.onFatalClose?.(event)
      return
    }
    scheduleReconnect()
  }
}
// Start listening for connectivity changes, then kick off the first connection.
window.addEventListener('online', handleOnline)
window.addEventListener('offline', handleOffline)
connect()
// Unsubscribe function: disables reconnection, removes listeners, cancels all
// timers, closes any live socket, and reports a final 'closed' status.
return () => {
shouldReconnect = false
window.removeEventListener('online', handleOnline)
window.removeEventListener('offline', handleOffline)
clearReconnectTimer()
clearStaleTimer()
if (ws) ws.close()
ws = null
setStatus('closed')
}
}
// Severity/phase values are server-defined open enums; kept as plain strings
// so new server values don't break the client.
export type OpsSeverity = string
export type OpsPhase = string
// Severity levels the alert/email configuration understands.
export type AlertSeverity = 'critical' | 'warning' | 'info'
// How a threshold is interpreted (absolute count, percentage, or both).
export type ThresholdMode = 'count' | 'percentage' | 'both'
// Metric identifiers an AlertRule can evaluate against.
export type MetricType =
| 'success_rate'
| 'error_rate'
| 'upstream_error_rate'
| 'p95_latency_ms'
| 'p99_latency_ms'
| 'cpu_usage_percent'
| 'memory_usage_percent'
| 'concurrency_queue_depth'
| 'group_available_accounts'
| 'group_available_ratio'
| 'group_rate_limit_ratio'
| 'account_rate_limited_count'
| 'account_error_count'
| 'account_error_ratio'
| 'overload_account_count'
// Comparison operators usable in alert-rule thresholds.
export type Operator = '>' | '>=' | '<' | '<=' | '==' | '!='
// An alert rule as exchanged with /admin/ops/alert-rules.
// Server-assigned fields (id, timestamps) are optional and absent on create.
export interface AlertRule {
id?: number
name: string
description?: string
enabled: boolean
// Which metric to evaluate, and how to compare it against `threshold`.
metric_type: MetricType
operator: Operator
threshold: number
// Evaluation window and sustain duration — presumably "condition must hold
// for sustained_minutes within window_minutes"; semantics are server-defined.
window_minutes: number
sustained_minutes: number
severity: OpsSeverity
// cooldown between firings — presumably minimum minutes before re-firing.
cooldown_minutes: number
notify_email: boolean
// Free-form dimension filters; schema is defined server-side.
filters?: Record<string, any>
created_at?: string
updated_at?: string
last_triggered_at?: string | null
}
// One firing/resolution record produced by an alert-rule evaluation.
export interface AlertEvent {
id: number
rule_id: number
severity: OpsSeverity | string
// 'firing' | 'resolved', widened with string for forward compatibility.
status: 'firing' | 'resolved' | string
title?: string
description?: string
// Observed metric value and the configured threshold it was compared against.
metric_value?: number
threshold_value?: number
dimensions?: Record<string, any>
fired_at: string
resolved_at?: string | null
email_sent: boolean
created_at: string
}
// Email notification settings, split into real-time alert mail and periodic
// report mail (as served by /admin/ops/email-notification/config).
export interface EmailNotificationConfig {
alert: {
enabled: boolean
recipients: string[]
// Minimum severity to mail about; '' presumably means "no filter" — confirm server-side.
min_severity: AlertSeverity | ''
rate_limit_per_hour: number
batching_window_seconds: number
include_resolved_alerts: boolean
}
report: {
enabled: boolean
recipients: string[]
// Each report type has an enabled flag plus a schedule string
// (format is server-defined; looks cron-like — confirm with backend).
daily_summary_enabled: boolean
daily_summary_schedule: string
weekly_summary_enabled: boolean
weekly_summary_schedule: string
error_digest_enabled: boolean
error_digest_schedule: string
error_digest_min_count: number
account_health_enabled: boolean
account_health_schedule: string
account_health_error_rate_threshold: number
}
}
// Distributed-lock settings for alert evaluation (key + TTL in seconds).
export interface OpsDistributedLockSettings {
enabled: boolean
key: string
ttl_seconds: number
}
// Runtime settings for the alert evaluator (DB-backed; /admin/ops/runtime/alert).
export interface OpsAlertRuntimeSettings {
evaluation_interval_seconds: number
distributed_lock: OpsDistributedLockSettings
// Alert silencing: a global mute window plus optional per-rule/per-severity entries.
silencing: {
enabled: boolean
global_until_rfc3339: string
global_reason: string
entries?: Array<{
rule_id?: number
severities?: Array<OpsSeverity | string>
until_rfc3339: string
reason: string
}>
}
}
// Advanced ops settings bundle (DB-backed; /admin/ops/advanced-settings).
export interface OpsAdvancedSettings {
data_retention: OpsDataRetentionSettings
aggregation: OpsAggregationSettings
}
// Retention windows (in days) for the various ops tables, plus the cleanup
// job toggle and its schedule string (format server-defined).
export interface OpsDataRetentionSettings {
cleanup_enabled: boolean
cleanup_schedule: string
error_log_retention_days: number
minute_metrics_retention_days: number
hourly_metrics_retention_days: number
}
// Toggle for the metrics pre-aggregation job.
export interface OpsAggregationSettings {
aggregation_enabled: boolean
}
// One row of the ops error log listing (/admin/ops/errors).
// Nullable/optional fields are absent when the dimension doesn't apply.
export interface OpsErrorLog {
id: number
created_at: string
// Processing phase and error classification (server-defined open enums).
phase: OpsPhase
type: string
severity: OpsSeverity
status_code: number
platform: string
model: string
latency_ms?: number | null
// Client-supplied request id vs. the gateway's own request id.
client_request_id: string
request_id: string
message: string
// Associated entities, when known.
user_id?: number | null
api_key_id?: number | null
account_id?: number | null
group_id?: number | null
client_ip?: string | null
request_path?: string
// True when the failed request was a streaming request.
stream?: boolean
}
// Full error detail (/admin/ops/errors/:id): the listing row plus bodies,
// upstream context and a latency breakdown.
export interface OpsErrorDetail extends OpsErrorLog {
error_body: string
user_agent: string
// Upstream context (optional; enriched by gateway services)
upstream_status_code?: number | null
upstream_error_message?: string
upstream_error_detail?: string
// JSON-encoded array of upstream error events (one entry per upstream attempt).
upstream_errors?: string
// Latency breakdown in ms; field names suggest per-phase timings
// (auth/routing/upstream/response/TTFT) — null when not measured.
auth_latency_ms?: number | null
routing_latency_ms?: number | null
upstream_latency_ms?: number | null
response_latency_ms?: number | null
time_to_first_token_ms?: number | null
request_body: string
// True when request_body was truncated before storage; original size below.
request_body_truncated: boolean
request_body_bytes?: number | null
is_business_limited: boolean
}
// Paginated wrapper over error-log rows.
export type OpsErrorLogsResponse = PaginatedResponse<OpsErrorLog>
/** GET /admin/ops/dashboard/overview — aggregated dashboard overview for the given window/filters. */
export async function getDashboardOverview(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsDashboardOverview> {
  const response = await apiClient.get<OpsDashboardOverview>('/admin/ops/dashboard/overview', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/throughput-trend — throughput time series for the given window/filters. */
export async function getThroughputTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsThroughputTrendResponse> {
  const response = await apiClient.get<OpsThroughputTrendResponse>('/admin/ops/dashboard/throughput-trend', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/latency-histogram — latency distribution for the given window/filters. */
export async function getLatencyHistogram(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsLatencyHistogramResponse> {
  const response = await apiClient.get<OpsLatencyHistogramResponse>('/admin/ops/dashboard/latency-histogram', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/error-trend — error-count time series for the given window/filters. */
export async function getErrorTrend(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorTrendResponse> {
  const response = await apiClient.get<OpsErrorTrendResponse>('/admin/ops/dashboard/error-trend', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/dashboard/error-distribution — error breakdown for the given window/filters. */
export async function getErrorDistribution(
  params: {
    time_range?: '5m' | '30m' | '1h' | '6h' | '24h'
    start_time?: string
    end_time?: string
    platform?: string
    group_id?: number | null
    mode?: OpsQueryMode
  },
  options: OpsRequestOptions = {}
): Promise<OpsErrorDistributionResponse> {
  const response = await apiClient.get<OpsErrorDistributionResponse>('/admin/ops/dashboard/error-distribution', {
    params,
    signal: options.signal
  })
  return response.data
}
/** GET /admin/ops/errors — paginated error-log listing with optional filters. */
export async function listErrorLogs(params: {
  page?: number
  page_size?: number
  time_range?: string
  start_time?: string
  end_time?: string
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
  q?: string
  status_codes?: string
}): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return response.data
}
/** GET /admin/ops/errors/:id — full detail for one error-log entry. */
export async function getErrorLogDetail(id: number): Promise<OpsErrorDetail> {
  const response = await apiClient.get<OpsErrorDetail>(`/admin/ops/errors/${id}`)
  return response.data
}
/** POST /admin/ops/errors/:id/retry — replay the failed request with the given options. */
export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promise<OpsRetryResult> {
  const response = await apiClient.post<OpsRetryResult>(`/admin/ops/errors/${id}/retry`, req)
  return response.data
}
/** GET /admin/ops/requests — paginated per-request detail listing. */
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const response = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return response.data
}
// Alert rules
/** GET /admin/ops/alert-rules — list all alert rules. */
export async function listAlertRules(): Promise<AlertRule[]> {
  const response = await apiClient.get<AlertRule[]>('/admin/ops/alert-rules')
  return response.data
}
/** POST /admin/ops/alert-rules — create a rule; returns the server-assigned copy. */
export async function createAlertRule(rule: AlertRule): Promise<AlertRule> {
  const response = await apiClient.post<AlertRule>('/admin/ops/alert-rules', rule)
  return response.data
}
/** PUT /admin/ops/alert-rules/:id — partial update; returns the updated rule. */
export async function updateAlertRule(id: number, rule: Partial<AlertRule>): Promise<AlertRule> {
  const response = await apiClient.put<AlertRule>(`/admin/ops/alert-rules/${id}`, rule)
  return response.data
}
/** DELETE /admin/ops/alert-rules/:id — remove an alert rule. Resolves with no payload. */
export async function deleteAlertRule(id: number): Promise<void> {
await apiClient.delete(`/admin/ops/alert-rules/${id}`)
}
/** GET /admin/ops/alert-events — most recent alert events (default limit: 100). */
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } })
  return response.data
}
// Email notification config
/** GET /admin/ops/email-notification/config — current email notification settings. */
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const response = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
  return response.data
}
/** PUT /admin/ops/email-notification/config — replace email notification settings. */
export async function updateEmailNotificationConfig(config: EmailNotificationConfig): Promise<EmailNotificationConfig> {
  const response = await apiClient.put<EmailNotificationConfig>('/admin/ops/email-notification/config', config)
  return response.data
}
// Runtime settings (DB-backed)
/** GET /admin/ops/runtime/alert — alert-evaluator runtime settings. */
export async function getAlertRuntimeSettings(): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.get<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert')
  return response.data
}
/** PUT /admin/ops/runtime/alert — replace alert-evaluator runtime settings. */
export async function updateAlertRuntimeSettings(config: OpsAlertRuntimeSettings): Promise<OpsAlertRuntimeSettings> {
  const response = await apiClient.put<OpsAlertRuntimeSettings>('/admin/ops/runtime/alert', config)
  return response.data
}
// Advanced settings (DB-backed)
/** GET /admin/ops/advanced-settings — data retention + aggregation settings. */
export async function getAdvancedSettings(): Promise<OpsAdvancedSettings> {
  const response = await apiClient.get<OpsAdvancedSettings>('/admin/ops/advanced-settings')
  return response.data
}
/** PUT /admin/ops/advanced-settings — replace data retention + aggregation settings. */
export async function updateAdvancedSettings(config: OpsAdvancedSettings): Promise<OpsAdvancedSettings> {
  const response = await apiClient.put<OpsAdvancedSettings>('/admin/ops/advanced-settings', config)
  return response.data
}
// Aggregate export so consumers can import the whole ops API surface as one
// object (named `opsAPI` or as the module default) instead of per-function.
export const opsAPI = {
getDashboardOverview,
getThroughputTrend,
getLatencyHistogram,
getErrorTrend,
getErrorDistribution,
getConcurrencyStats,
getAccountAvailabilityStats,
subscribeQPS,
listErrorLogs,
getErrorLogDetail,
retryErrorRequest,
listRequestDetails,
listAlertRules,
createAlertRule,
updateAlertRule,
deleteAlertRule,
listAlertEvents,
getEmailNotificationConfig,
updateEmailNotificationConfig,
getAlertRuntimeSettings,
updateAlertRuntimeSettings,
getAdvancedSettings,
updateAdvancedSettings
}
export default opsAPI
...@@ -35,14 +35,29 @@ export interface SystemSettings { ...@@ -35,14 +35,29 @@ export interface SystemSettings {
turnstile_enabled: boolean turnstile_enabled: boolean
turnstile_site_key: string turnstile_site_key: string
turnstile_secret_key_configured: boolean turnstile_secret_key_configured: boolean
// LinuxDo Connect OAuth 登录(终端用户 SSO)
// LinuxDo Connect OAuth settings
linuxdo_connect_enabled: boolean linuxdo_connect_enabled: boolean
linuxdo_connect_client_id: string linuxdo_connect_client_id: string
linuxdo_connect_client_secret_configured: boolean linuxdo_connect_client_secret_configured: boolean
linuxdo_connect_redirect_url: string linuxdo_connect_redirect_url: string
// Model fallback configuration
enable_model_fallback: boolean
fallback_model_anthropic: string
fallback_model_openai: string
fallback_model_gemini: string
fallback_model_antigravity: string
// Identity patch configuration (Claude -> Gemini) // Identity patch configuration (Claude -> Gemini)
enable_identity_patch: boolean enable_identity_patch: boolean
identity_patch_prompt: string identity_patch_prompt: string
// Ops Monitoring (vNext)
ops_monitoring_enabled: boolean
ops_realtime_monitoring_enabled: boolean
ops_query_mode_default: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds: number
} }
export interface UpdateSettingsRequest { export interface UpdateSettingsRequest {
...@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest { ...@@ -71,8 +86,17 @@ export interface UpdateSettingsRequest {
linuxdo_connect_client_id?: string linuxdo_connect_client_id?: string
linuxdo_connect_client_secret?: string linuxdo_connect_client_secret?: string
linuxdo_connect_redirect_url?: string linuxdo_connect_redirect_url?: string
enable_model_fallback?: boolean
fallback_model_anthropic?: string
fallback_model_openai?: string
fallback_model_gemini?: string
fallback_model_antigravity?: string
enable_identity_patch?: boolean enable_identity_patch?: boolean
identity_patch_prompt?: string identity_patch_prompt?: string
ops_monitoring_enabled?: boolean
ops_realtime_monitoring_enabled?: boolean
ops_query_mode_default?: 'auto' | 'raw' | 'preagg' | string
ops_metrics_interval_seconds?: number
} }
/** /**
......
...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use( ...@@ -80,9 +80,45 @@ apiClient.interceptors.response.use(
return response return response
}, },
(error: AxiosError<ApiResponse<unknown>>) => { (error: AxiosError<ApiResponse<unknown>>) => {
// Request cancellation: keep the original axios cancellation error so callers can ignore it.
// Otherwise we'd misclassify it as a generic "network error".
if (error.code === 'ERR_CANCELED' || axios.isCancel(error)) {
return Promise.reject(error)
}
// Handle common errors // Handle common errors
if (error.response) { if (error.response) {
const { status, data } = error.response const { status, data } = error.response
const url = String(error.config?.url || '')
// Validate `data` shape to avoid HTML error pages breaking our error handling.
const apiData = (typeof data === 'object' && data !== null ? data : {}) as Record<string, any>
// Ops monitoring disabled: treat as feature-flagged 404, and proactively redirect away
// from ops pages to avoid broken UI states.
if (status === 404 && apiData.message === 'Ops monitoring is disabled') {
try {
localStorage.setItem('ops_monitoring_enabled_cached', 'false')
} catch {
// ignore localStorage failures
}
try {
window.dispatchEvent(new CustomEvent('ops-monitoring-disabled'))
} catch {
// ignore event failures
}
if (window.location.pathname.startsWith('/admin/ops')) {
window.location.href = '/admin/settings'
}
return Promise.reject({
status,
code: 'OPS_DISABLED',
message: apiData.message || error.message,
url
})
}
// 401: Unauthorized - clear token and redirect to login // 401: Unauthorized - clear token and redirect to login
if (status === 401) { if (status === 401) {
...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use( ...@@ -113,8 +149,8 @@ apiClient.interceptors.response.use(
// Return structured error // Return structured error
return Promise.reject({ return Promise.reject({
status, status,
code: data?.code, code: apiData.code,
message: data?.message || error.message message: apiData.message || apiData.detail || error.message
}) })
} }
......
<script setup lang="ts">
// Hover "info" popover: shows `content` (or the default slot) in a tooltip
// above a trigger icon while the pointer is over the wrapper element.
import { ref } from 'vue'
defineProps<{
// Plain-text tooltip body; ignored when the default slot is provided.
content?: string
}>()
// Tooltip visibility, toggled by mouseenter/mouseleave on the wrapper below.
const show = ref(false)
</script>
<template>
<div
class="group relative ml-1 inline-flex items-center align-middle"
@mouseenter="show = true"
@mouseleave="show = false"
>
<!-- Trigger Icon (overridable via the `trigger` slot; defaults to a "?" circle) -->
<slot name="trigger">
<svg
class="h-4 w-4 cursor-help text-gray-400 transition-colors hover:text-primary-600 dark:text-gray-500 dark:hover:text-primary-400"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
/>
</svg>
</slot>
<!-- Popover Content -->
<!-- NOTE(review): visibility combines v-show (the `show` flag) with
     opacity-0 + group-hover:opacity-100 (a CSS fade). Both mechanisms are
     present; confirm the redundancy is intentional (v-show also removes the
     element from hit-testing while hidden). -->
<div
v-show="show"
class="absolute bottom-full left-1/2 z-50 mb-2 w-64 -translate-x-1/2 rounded-lg bg-gray-900 p-3 text-xs leading-relaxed text-white shadow-xl ring-1 ring-white/10 opacity-0 transition-opacity duration-200 group-hover:opacity-100 dark:bg-gray-800"
>
<slot>{{ content }}</slot>
<!-- Arrow pointing down at the trigger icon -->
<div class="absolute -bottom-1 left-1/2 h-2 w-2 -translate-x-1/2 rotate-45 bg-gray-900 dark:bg-gray-800"></div>
</div>
</div>
</template>
...@@ -67,12 +67,13 @@ ...@@ -67,12 +67,13 @@
:aria-selected="isSelected(option)" :aria-selected="isSelected(option)"
:aria-disabled="isOptionDisabled(option)" :aria-disabled="isOptionDisabled(option)"
@click.stop="!isOptionDisabled(option) && selectOption(option)" @click.stop="!isOptionDisabled(option) && selectOption(option)"
@mouseenter="focusedIndex = index" @mouseenter="handleOptionMouseEnter(option, index)"
:class="[ :class="[
'select-option', 'select-option',
isGroupHeaderOption(option) && 'select-option-group',
isSelected(option) && 'select-option-selected', isSelected(option) && 'select-option-selected',
isOptionDisabled(option) && 'select-option-disabled', isOptionDisabled(option) && !isGroupHeaderOption(option) && 'select-option-disabled',
focusedIndex === index && 'select-option-focused' focusedIndex === index && !isGroupHeaderOption(option) && 'select-option-focused'
]" ]"
> >
<slot name="option" :option="option" :selected="isSelected(option)"> <slot name="option" :option="option" :selected="isSelected(option)">
...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => { ...@@ -201,6 +202,13 @@ const isOptionDisabled = (option: any): boolean => {
return false return false
} }
const isGroupHeaderOption = (option: any): boolean => {
if (typeof option === 'object' && option !== null) {
return option.kind === 'group'
}
return false
}
const selectedOption = computed(() => { const selectedOption = computed(() => {
return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null return props.options.find((opt) => getOptionValue(opt) === props.modelValue) || null
}) })
...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => { ...@@ -225,6 +233,31 @@ const isSelected = (option: any): boolean => {
return getOptionValue(option) === props.modelValue return getOptionValue(option) === props.modelValue
} }
const findNextEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex + offset) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const findPrevEnabledIndex = (startIndex: number): number => {
const opts = filteredOptions.value
if (opts.length === 0) return -1
for (let offset = 0; offset < opts.length; offset++) {
const idx = (startIndex - offset + opts.length) % opts.length
if (!isOptionDisabled(opts[idx])) return idx
}
return -1
}
const handleOptionMouseEnter = (option: any, index: number) => {
if (isOptionDisabled(option) || isGroupHeaderOption(option)) return
focusedIndex.value = index
}
// Update trigger rect periodically while open to follow scroll/resize // Update trigger rect periodically while open to follow scroll/resize
const updateTriggerRect = () => { const updateTriggerRect = () => {
if (containerRef.value) { if (containerRef.value) {
...@@ -259,8 +292,15 @@ watch(isOpen, (open) => { ...@@ -259,8 +292,15 @@ watch(isOpen, (open) => {
if (open) { if (open) {
calculateDropdownPosition() calculateDropdownPosition()
// Reset focused index to current selection or first item // Reset focused index to current selection or first item
const selectedIdx = filteredOptions.value.findIndex(isSelected) if (filteredOptions.value.length === 0) {
focusedIndex.value = selectedIdx >= 0 ? selectedIdx : 0 focusedIndex.value = -1
} else {
const selectedIdx = filteredOptions.value.findIndex(isSelected)
const initialIdx = selectedIdx >= 0 ? selectedIdx : 0
focusedIndex.value = isOptionDisabled(filteredOptions.value[initialIdx])
? findNextEnabledIndex(initialIdx + 1)
: initialIdx
}
if (props.searchable) { if (props.searchable) {
nextTick(() => searchInputRef.value?.focus()) nextTick(() => searchInputRef.value?.focus())
...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => { ...@@ -295,13 +335,13 @@ const onDropdownKeyDown = (e: KeyboardEvent) => {
switch (e.key) { switch (e.key) {
case 'ArrowDown': case 'ArrowDown':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value + 1) % filteredOptions.value.length focusedIndex.value = findNextEnabledIndex(focusedIndex.value + 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'ArrowUp': case 'ArrowUp':
e.preventDefault() e.preventDefault()
focusedIndex.value = (focusedIndex.value - 1 + filteredOptions.value.length) % filteredOptions.value.length focusedIndex.value = findPrevEnabledIndex(focusedIndex.value - 1)
scrollToFocused() if (focusedIndex.value >= 0) scrollToFocused()
break break
case 'Enter': case 'Enter':
e.preventDefault() e.preventDefault()
...@@ -441,6 +481,17 @@ onUnmounted(() => { ...@@ -441,6 +481,17 @@ onUnmounted(() => {
@apply cursor-not-allowed opacity-40; @apply cursor-not-allowed opacity-40;
} }
.select-dropdown-portal .select-option-group {
@apply cursor-default select-none;
@apply bg-gray-50 dark:bg-dark-900;
@apply text-[11px] font-bold uppercase tracking-wider;
@apply text-gray-500 dark:text-gray-400;
}
.select-dropdown-portal .select-option-group:hover {
@apply bg-gray-50 dark:bg-dark-900;
}
.select-dropdown-portal .select-option-label { .select-dropdown-portal .select-option-label {
@apply flex-1 min-w-0 truncate text-left; @apply flex-1 min-w-0 truncate text-left;
} }
......
...@@ -28,8 +28,8 @@ ...@@ -28,8 +28,8 @@
{{ platformDescription }} {{ platformDescription }}
</p> </p>
<!-- Client Tabs (only for Antigravity platform) --> <!-- Client Tabs -->
<div v-if="platform === 'antigravity'" class="border-b border-gray-200 dark:border-dark-700"> <div v-if="clientTabs.length" class="border-b border-gray-200 dark:border-dark-700">
<nav class="-mb-px flex space-x-6" aria-label="Client"> <nav class="-mb-px flex space-x-6" aria-label="Client">
<button <button
v-for="tab in clientTabs" v-for="tab in clientTabs"
...@@ -51,7 +51,7 @@ ...@@ -51,7 +51,7 @@
</div> </div>
<!-- OS/Shell Tabs --> <!-- OS/Shell Tabs -->
<div class="border-b border-gray-200 dark:border-dark-700"> <div v-if="showShellTabs" class="border-b border-gray-200 dark:border-dark-700">
<nav class="-mb-px flex space-x-4" aria-label="Tabs"> <nav class="-mb-px flex space-x-4" aria-label="Tabs">
<button <button
v-for="tab in currentTabs" v-for="tab in currentTabs"
...@@ -111,7 +111,7 @@ ...@@ -111,7 +111,7 @@
</div> </div>
<!-- Usage Note --> <!-- Usage Note -->
<div class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800"> <div v-if="showPlatformNote" class="flex items-start gap-3 p-3 rounded-lg bg-blue-50 dark:bg-blue-900/20 border border-blue-100 dark:border-blue-800">
<Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" /> <Icon name="infoCircle" size="md" class="text-blue-500 flex-shrink-0 mt-0.5" />
<p class="text-sm text-blue-700 dark:text-blue-300"> <p class="text-sm text-blue-700 dark:text-blue-300">
{{ platformNote }} {{ platformNote }}
...@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard() ...@@ -173,17 +173,28 @@ const { copyToClipboard: clipboardCopy } = useClipboard()
const copiedIndex = ref<number | null>(null) const copiedIndex = ref<number | null>(null)
const activeTab = ref<string>('unix') const activeTab = ref<string>('unix')
const activeClientTab = ref<string>('claude') // Level 1 tab for antigravity platform const activeClientTab = ref<string>('claude')
// Reset tabs when platform changes // Reset tabs when platform changes
watch(() => props.platform, (newPlatform) => { const defaultClientTab = computed(() => {
activeTab.value = 'unix' switch (props.platform) {
if (newPlatform === 'antigravity') { case 'openai':
activeClientTab.value = 'claude' return 'codex'
case 'gemini':
return 'gemini'
case 'antigravity':
return 'claude'
default:
return 'claude'
} }
}) })
// Reset shell tab when client changes (for antigravity) watch(() => props.platform, () => {
activeTab.value = 'unix'
activeClientTab.value = defaultClientTab.value
}, { immediate: true })
// Reset shell tab when client changes
watch(activeClientTab, () => { watch(activeClientTab, () => {
activeTab.value = 'unix' activeTab.value = 'unix'
}) })
...@@ -251,11 +262,32 @@ const SparkleIcon = { ...@@ -251,11 +262,32 @@ const SparkleIcon = {
} }
} }
// Client tabs for Antigravity platform (Level 1) const clientTabs = computed((): TabConfig[] => {
const clientTabs = computed((): TabConfig[] => [ if (!props.platform) return []
{ id: 'claude', label: t('keys.useKeyModal.antigravity.claudeCode'), icon: TerminalIcon }, switch (props.platform) {
{ id: 'gemini', label: t('keys.useKeyModal.antigravity.geminiCli'), icon: SparkleIcon } case 'openai':
]) return [
{ id: 'codex', label: t('keys.useKeyModal.cliTabs.codexCli'), icon: TerminalIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
case 'gemini':
return [
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
case 'antigravity':
return [
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
{ id: 'gemini', label: t('keys.useKeyModal.cliTabs.geminiCli'), icon: SparkleIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
default:
return [
{ id: 'claude', label: t('keys.useKeyModal.cliTabs.claudeCode'), icon: TerminalIcon },
{ id: 'opencode', label: t('keys.useKeyModal.cliTabs.opencode'), icon: TerminalIcon }
]
}
})
// Shell tabs (3 types for environment variable based configs) // Shell tabs (3 types for environment variable based configs)
const shellTabs: TabConfig[] = [ const shellTabs: TabConfig[] = [
...@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [ ...@@ -270,11 +302,13 @@ const openaiTabs: TabConfig[] = [
{ id: 'windows', label: 'Windows', icon: WindowsIcon } { id: 'windows', label: 'Windows', icon: WindowsIcon }
] ]
const showShellTabs = computed(() => activeClientTab.value !== 'opencode')
const currentTabs = computed(() => { const currentTabs = computed(() => {
if (!showShellTabs.value) return []
if (props.platform === 'openai') { if (props.platform === 'openai') {
return openaiTabs // 2 tabs: unix, windows return openaiTabs
} }
// All other platforms (anthropic, gemini, antigravity) use shell tabs
return shellTabs return shellTabs
}) })
...@@ -308,6 +342,8 @@ const platformNote = computed(() => { ...@@ -308,6 +342,8 @@ const platformNote = computed(() => {
} }
}) })
const showPlatformNote = computed(() => activeClientTab.value !== 'opencode')
const escapeHtml = (value: string) => value const escapeHtml = (value: string) => value
.replace(/&/g, '&amp;') .replace(/&/g, '&amp;')
.replace(/</g, '&lt;') .replace(/</g, '&lt;')
...@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value) ...@@ -329,6 +365,35 @@ const comment = (value: string) => wrapToken('text-slate-500', value)
const currentFiles = computed((): FileConfig[] => { const currentFiles = computed((): FileConfig[] => {
const baseUrl = props.baseUrl || window.location.origin const baseUrl = props.baseUrl || window.location.origin
const apiKey = props.apiKey const apiKey = props.apiKey
const baseRoot = baseUrl.replace(/\/v1\/?$/, '').replace(/\/+$/, '')
const ensureV1 = (value: string) => {
const trimmed = value.replace(/\/+$/, '')
return trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`
}
const apiBase = ensureV1(baseRoot)
const antigravityBase = ensureV1(`${baseRoot}/antigravity`)
const antigravityGeminiBase = (() => {
const trimmed = `${baseRoot}/antigravity`.replace(/\/+$/, '')
return trimmed.endsWith('/v1beta') ? trimmed : `${trimmed}/v1beta`
})()
if (activeClientTab.value === 'opencode') {
switch (props.platform) {
case 'anthropic':
return [generateOpenCodeConfig('anthropic', apiBase, apiKey)]
case 'openai':
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
case 'gemini':
return [generateOpenCodeConfig('gemini', apiBase, apiKey)]
case 'antigravity':
return [
generateOpenCodeConfig('antigravity-claude', antigravityBase, apiKey, 'opencode.json (Claude)'),
generateOpenCodeConfig('antigravity-gemini', antigravityGeminiBase, apiKey, 'opencode.json (Gemini)')
]
default:
return [generateOpenCodeConfig('openai', apiBase, apiKey)]
}
}
switch (props.platform) { switch (props.platform) {
case 'openai': case 'openai':
...@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => { ...@@ -336,12 +401,11 @@ const currentFiles = computed((): FileConfig[] => {
case 'gemini': case 'gemini':
return [generateGeminiCliContent(baseUrl, apiKey)] return [generateGeminiCliContent(baseUrl, apiKey)]
case 'antigravity': case 'antigravity':
// Both Claude Code and Gemini CLI need /antigravity suffix for antigravity platform if (activeClientTab.value === 'gemini') {
if (activeClientTab.value === 'claude') { return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)]
return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
} }
return [generateGeminiCliContent(`${baseUrl}/antigravity`, apiKey)] return generateAnthropicFiles(`${baseUrl}/antigravity`, apiKey)
default: // anthropic default:
return generateAnthropicFiles(baseUrl, apiKey) return generateAnthropicFiles(baseUrl, apiKey)
} }
}) })
...@@ -456,6 +520,76 @@ requires_openai_auth = true` ...@@ -456,6 +520,76 @@ requires_openai_auth = true`
] ]
} }
function generateOpenCodeConfig(platform: string, baseUrl: string, apiKey: string, pathLabel?: string): FileConfig {
  // Model catalogs per upstream platform (display names shown in OpenCode's model picker).
  const openaiModels = {
    'gpt-5.2-codex': {
      name: 'GPT-5.2 Codex',
      variants: {
        low: {},
        medium: {},
        high: {},
        xhigh: {}
      }
    }
  }
  const geminiModels = {
    'gemini-3-pro-high': { name: 'Gemini 3 Pro High' },
    'gemini-3-pro-low': { name: 'Gemini 3 Pro Low' },
    'gemini-3-pro-preview': { name: 'Gemini 3 Pro Preview' },
    'gemini-3-pro-image': { name: 'Gemini 3 Pro Image' },
    'gemini-3-flash': { name: 'Gemini 3 Flash' },
    'gemini-2.5-flash-thinking': { name: 'Gemini 2.5 Flash Thinking' },
    'gemini-2.5-flash': { name: 'Gemini 2.5 Flash' },
    'gemini-2.5-flash-lite': { name: 'Gemini 2.5 Flash Lite' }
  }
  const claudeModels = {
    'claude-opus-4-5-thinking': { name: 'Claude Opus 4.5 Thinking' },
    'claude-sonnet-4-5-thinking': { name: 'Claude Sonnet 4.5 Thinking' },
    'claude-sonnet-4-5': { name: 'Claude Sonnet 4.5' }
  }
  // Base provider entry. OpenAI additionally gets `store: false` in its options.
  // Property insertion order (options → npm → name → models) is preserved so the
  // serialized JSON matches the previous output exactly.
  const entry: Record<string, any> = {
    options: {
      baseURL: baseUrl,
      apiKey,
      ...(platform === 'openai' ? { store: false } : {})
    }
  }
  // Attach the platform-specific AI SDK package, display name, and model list.
  switch (platform) {
    case 'gemini':
      entry.npm = '@ai-sdk/google'
      entry.models = geminiModels
      break
    case 'anthropic':
      entry.npm = '@ai-sdk/anthropic'
      break
    case 'antigravity-claude':
      entry.npm = '@ai-sdk/anthropic'
      entry.name = 'Antigravity (Claude)'
      entry.models = claudeModels
      break
    case 'antigravity-gemini':
      entry.npm = '@ai-sdk/google'
      entry.name = 'Antigravity (Gemini)'
      entry.models = geminiModels
      break
    case 'openai':
      entry.models = openaiModels
      break
  }
  const content = JSON.stringify(
    {
      provider: { [platform]: entry },
      $schema: 'https://opencode.ai/config.json'
    },
    null,
    2
  )
  return {
    path: pathLabel ?? 'opencode.json',
    content,
    hint: t('keys.useKeyModal.opencode.hint')
  }
}
const copyContent = async (content: string, index: number) => { const copyContent = async (content: string, index: number) => {
const success = await clipboardCopy(content, t('keys.copied')) const success = await clipboardCopy(content, t('keys.copied'))
if (success) { if (success) {
......
...@@ -144,10 +144,10 @@ ...@@ -144,10 +144,10 @@
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { computed, h, ref } from 'vue' import { computed, h, onMounted, ref, watch } from 'vue'
import { useRoute } from 'vue-router' import { useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n' import { useI18n } from 'vue-i18n'
import { useAppStore, useAuthStore, useOnboardingStore } from '@/stores' import { useAdminSettingsStore, useAppStore, useAuthStore, useOnboardingStore } from '@/stores'
import VersionBadge from '@/components/common/VersionBadge.vue' import VersionBadge from '@/components/common/VersionBadge.vue'
const { t } = useI18n() const { t } = useI18n()
...@@ -156,6 +156,7 @@ const route = useRoute() ...@@ -156,6 +156,7 @@ const route = useRoute()
const appStore = useAppStore() const appStore = useAppStore()
const authStore = useAuthStore() const authStore = useAuthStore()
const onboardingStore = useOnboardingStore() const onboardingStore = useOnboardingStore()
const adminSettingsStore = useAdminSettingsStore()
const sidebarCollapsed = computed(() => appStore.sidebarCollapsed) const sidebarCollapsed = computed(() => appStore.sidebarCollapsed)
const mobileOpen = computed(() => appStore.mobileOpen) const mobileOpen = computed(() => appStore.mobileOpen)
...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => { ...@@ -442,6 +443,9 @@ const personalNavItems = computed(() => {
const adminNavItems = computed(() => { const adminNavItems = computed(() => {
const baseItems = [ const baseItems = [
{ path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon }, { path: '/admin/dashboard', label: t('nav.dashboard'), icon: DashboardIcon },
...(adminSettingsStore.opsMonitoringEnabled
? [{ path: '/admin/ops', label: t('nav.ops'), icon: ChartIcon }]
: []),
{ path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true }, { path: '/admin/users', label: t('nav.users'), icon: UsersIcon, hideInSimpleMode: true },
{ path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true }, { path: '/admin/groups', label: t('nav.groups'), icon: FolderIcon, hideInSimpleMode: true },
{ path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true }, { path: '/admin/subscriptions', label: t('nav.subscriptions'), icon: CreditCardIcon, hideInSimpleMode: true },
...@@ -511,6 +515,23 @@ if ( ...@@ -511,6 +515,23 @@ if (
isDark.value = true isDark.value = true
document.documentElement.classList.add('dark') document.documentElement.classList.add('dark')
} }
// Fetch admin settings (for feature-gated nav items like Ops).
// The immediate watcher runs once during setup and again whenever isAdmin
// flips to true, so it already covers the mounted case; the previous
// additional onMounted hook issued a duplicate fetch() on every mount.
watch(
  isAdmin,
  (v) => {
    if (v) {
      adminSettingsStore.fetch()
    }
  },
  { immediate: true }
)
</script> </script>
<style scoped> <style scoped>
......
...@@ -131,6 +131,7 @@ export default { ...@@ -131,6 +131,7 @@ export default {
noData: 'No data', noData: 'No data',
success: 'Success', success: 'Success',
error: 'Error', error: 'Error',
critical: 'Critical',
warning: 'Warning', warning: 'Warning',
info: 'Info', info: 'Info',
active: 'Active', active: 'Active',
...@@ -145,9 +146,11 @@ export default { ...@@ -145,9 +146,11 @@ export default {
copiedToClipboard: 'Copied to clipboard', copiedToClipboard: 'Copied to clipboard',
copyFailed: 'Failed to copy', copyFailed: 'Failed to copy',
contactSupport: 'Contact Support', contactSupport: 'Contact Support',
add: 'Add',
invalidEmail: 'Please enter a valid email address',
optional: 'optional', optional: 'optional',
selectOption: 'Select an option', selectOption: 'Select an option',
searchPlaceholder: 'Search...', searchPlaceholder: 'Search...',
noOptionsFound: 'No options found', noOptionsFound: 'No options found',
noGroupsAvailable: 'No groups available', noGroupsAvailable: 'No groups available',
unknownError: 'Unknown error occurred', unknownError: 'Unknown error occurred',
...@@ -178,6 +181,7 @@ export default { ...@@ -178,6 +181,7 @@ export default {
accounts: 'Accounts', accounts: 'Accounts',
proxies: 'Proxies', proxies: 'Proxies',
redeemCodes: 'Redeem Codes', redeemCodes: 'Redeem Codes',
ops: 'Ops',
promoCodes: 'Promo Codes', promoCodes: 'Promo Codes',
settings: 'Settings', settings: 'Settings',
myAccount: 'My Account', myAccount: 'My Account',
...@@ -364,6 +368,12 @@ export default { ...@@ -364,6 +368,12 @@ export default {
note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.', note: 'Make sure the config directory exists. macOS/Linux users can run mkdir -p ~/.codex to create it.',
noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.', noteWindows: 'Press Win+R and enter %userprofile%\\.codex to open the config directory. Create it manually if it does not exist.',
}, },
cliTabs: {
claudeCode: 'Claude Code',
geminiCli: 'Gemini CLI',
codexCli: 'Codex CLI',
opencode: 'OpenCode',
},
antigravity: { antigravity: {
description: 'Configure API access for Antigravity group. Select the configuration method based on your client.', description: 'Configure API access for Antigravity group. Select the configuration method based on your client.',
claudeCode: 'Claude Code', claudeCode: 'Claude Code',
...@@ -376,6 +386,11 @@ export default { ...@@ -376,6 +386,11 @@ export default {
modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview', modelComment: 'If you have Gemini 3 access, you can use: gemini-3-pro-preview',
note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.', note: 'These environment variables will be active in the current terminal session. For permanent configuration, add them to ~/.bashrc, ~/.zshrc, or the appropriate configuration file.',
}, },
opencode: {
title: 'OpenCode Example',
subtitle: 'opencode.json',
hint: 'This is a group configuration example. Adjust model and options as needed.',
},
}, },
customKeyLabel: 'Custom Key', customKeyLabel: 'Custom Key',
customKeyPlaceholder: 'Enter your custom key (min 16 chars)', customKeyPlaceholder: 'Enter your custom key (min 16 chars)',
...@@ -1826,6 +1841,524 @@ export default { ...@@ -1826,6 +1841,524 @@ export default {
ipAddress: 'IP' ipAddress: 'IP'
}, },
// Ops Monitoring
ops: {
title: 'Ops Monitoring',
description: 'Operational monitoring and troubleshooting',
// Dashboard
systemHealth: 'System Health',
overview: 'Overview',
noSystemMetrics: 'No system metrics collected yet.',
collectedAt: 'Collected at:',
window: 'window',
cpu: 'CPU',
memory: 'Memory',
db: 'DB',
redis: 'Redis',
goroutines: 'Goroutines',
jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
active: 'active',
idle: 'idle',
waiting: 'waiting',
conns: 'conns',
queue: 'queue',
ok: 'ok',
lastRun: 'last_run:',
lastSuccess: 'last_success:',
lastError: 'last_error:',
noData: 'No data.',
loadingText: 'loading',
ready: 'ready',
requestsTotal: 'Requests (total)',
slaScope: 'SLA scope:',
tokens: 'Tokens',
tps: 'TPS:',
current: 'current',
peak: 'peak',
average: 'average',
totalRequests: 'Total Requests',
avgQps: 'Avg QPS',
avgTps: 'Avg TPS',
avgLatency: 'Avg Latency',
avgTtft: 'Avg TTFT',
exceptions: 'Exceptions',
requestErrors: 'Request Errors',
errorCount: 'Error Count',
upstreamErrors: 'Upstream Errors',
errorCountExcl429529: 'Error Count (excl 429/529)',
sla: 'SLA (excl business limits)',
businessLimited: 'business_limited:',
errors: 'Errors',
errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)',
ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:',
p90: 'p90:',
p95: 'p95:',
p99: 'p99:',
avg: 'avg:',
max: 'max:',
qps: 'QPS',
requests: 'Requests',
upstream: 'Upstream',
client: 'Client',
system: 'System',
other: 'Other',
errorsSla: 'Errors (SLA scope)',
upstreamExcl429529: 'Upstream (excl 429/529)',
failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram',
failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail',
retryFailed: 'Retry failed',
tpsK: 'TPS (K)',
top: 'Top:',
throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram',
errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution',
// Health Score & Diagnosis
health: 'Health',
healthCondition: 'Health Condition',
healthHelp: 'Overall system health score based on SLA, error rate, and resource usage',
healthyStatus: 'Healthy',
riskyStatus: 'At Risk',
idleStatus: 'Idle',
timeRange: {
'5m': 'Last 5 minutes',
'30m': 'Last 30 minutes',
'1h': 'Last 1 hour',
'6h': 'Last 6 hours',
'24h': 'Last 24 hours'
},
diagnosis: {
title: 'Smart Diagnosis',
footer: 'Automated diagnostic suggestions based on current metrics',
idle: 'System is currently idle',
idleImpact: 'No active traffic',
// Resource diagnostics
dbDown: 'Database connection failed',
dbDownImpact: 'All database operations will fail',
dbDownAction: 'Check database service status, network connectivity, and connection configuration',
redisDown: 'Redis connection failed',
redisDownImpact: 'Cache functionality degraded, performance may decline',
redisDownAction: 'Check Redis service status and network connectivity',
cpuCritical: 'CPU usage critically high ({usage}%)',
cpuCriticalImpact: 'System response slowing, may affect all requests',
cpuCriticalAction: 'Check CPU-intensive tasks, consider scaling or code optimization',
cpuHigh: 'CPU usage elevated ({usage}%)',
cpuHighImpact: 'System load is high, needs attention',
cpuHighAction: 'Monitor CPU trends, prepare scaling plan',
memoryCritical: 'Memory usage critically high ({usage}%)',
memoryCriticalImpact: 'May trigger OOM, system stability threatened',
memoryCriticalAction: 'Check for memory leaks, consider increasing memory or optimizing usage',
memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics
upstreamCritical: 'Upstream error rate critically high ({rate}%)',
upstreamCriticalImpact: 'May affect many user requests',
upstreamCriticalAction: 'Check upstream service health, enable fallback strategies',
upstreamHigh: 'Upstream error rate elevated ({rate}%)',
upstreamHighImpact: 'Recommend checking upstream service status',
upstreamHighAction: 'Contact upstream service team, prepare fallback plan',
errorHigh: 'Error rate too high ({rate}%)',
errorHighImpact: 'Many requests failing',
errorHighAction: 'Check error logs, identify root cause, urgent fix required',
errorElevated: 'Error rate elevated ({rate}%)',
errorElevatedImpact: 'Recommend checking error logs',
errorElevatedAction: 'Analyze error types and distribution, create fix plan',
// SLA diagnostics
slaCritical: 'SLA critically below target ({sla}%)',
slaCriticalImpact: 'User experience severely degraded',
slaCriticalAction: 'Urgently investigate errors and latency, consider rate limiting',
slaLow: 'SLA below target ({sla}%)',
slaLowImpact: 'Service quality needs attention',
slaLowAction: 'Analyze SLA decline causes, optimize system performance',
// Health score diagnostics
healthCritical: 'Overall health score critically low ({score})',
healthCriticalImpact: 'Multiple metrics may be degraded; prioritize error rate and latency investigation',
healthCriticalAction: 'Comprehensive system check, prioritize critical-level issues',
healthLow: 'Overall health score low ({score})',
healthLowImpact: 'May indicate minor instability; monitor SLA and error rates',
healthLowAction: 'Monitor metric trends, prevent issue escalation',
healthy: 'All system metrics normal',
healthyImpact: 'Service running stable'
},
// Error Log
errorLog: {
timeId: 'Time / ID',
context: 'Context',
status: 'Status',
message: 'Message',
latency: 'Latency',
action: 'Action',
noErrors: 'No errors in this window.',
grp: 'GRP:',
acc: 'ACC:',
details: 'Details',
phase: 'Phase'
},
// Error Details Modal
errorDetails: {
upstreamErrors: 'Upstream Errors',
requestErrors: 'Request Errors',
total: 'Total:',
searchPlaceholder: 'Search request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
},
// Error Detail Modal
errorDetail: {
loading: 'Loading…',
requestId: 'Request ID',
time: 'Time',
phase: 'Phase',
status: 'Status',
message: 'Message',
basicInfo: 'Basic Info',
platform: 'Platform',
model: 'Model',
latency: 'Latency',
ttft: 'TTFT',
businessLimited: 'Business Limited',
requestPath: 'Request Path',
timings: 'Timings',
auth: 'Auth',
routing: 'Routing',
upstream: 'Upstream',
response: 'Response',
retry: 'Retry',
retryClient: 'Retry (Client)',
retryUpstream: 'Retry (Upstream pinned)',
pinnedAccountId: 'Pinned account_id',
retryNotes: 'Retry Notes',
requestBody: 'Request Body',
errorBody: 'Error Body',
trimmed: 'trimmed',
confirmRetry: 'Confirm Retry',
retrySuccess: 'Retry succeeded',
retryFailed: 'Retry failed',
na: 'N/A',
retryHint: 'Retry will resend the request with the same parameters',
retryClientHint: 'Use client retry (no account pinning)',
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
pinnedAccountIdHint: '(auto from error log)',
retryNote1: 'Retry will use the same request body and parameters',
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
retryNote3: 'Client retry will reselect an account',
confirmRetryMessage: 'Confirm retry this request?',
confirmRetryHint: 'Will resend with the same request parameters'
},
requestDetails: {
title: 'Request Details',
details: 'Details',
rangeLabel: 'Window: {range}',
rangeMinutes: '{n} minutes',
rangeHours: '{n} hours',
empty: 'No requests in this window.',
emptyHint: 'Try a different time range or remove filters.',
failedToLoad: 'Failed to load request details',
requestIdCopied: 'Request ID copied',
copyFailed: 'Copy failed',
copy: 'Copy',
viewError: 'View Error',
kind: {
success: 'SUCCESS',
error: 'ERROR'
},
table: {
time: 'Time',
kind: 'Kind',
platform: 'Platform',
model: 'Model',
duration: 'Duration',
status: 'Status',
requestId: 'Request ID',
actions: 'Actions'
}
},
alertEvents: {
title: 'Alert Events',
description: 'Recent alert firing/resolution records (email-only)',
loading: 'Loading...',
empty: 'No alert events',
loadFailed: 'Failed to load alert events',
table: {
time: 'Time',
status: 'Status',
severity: 'Severity',
title: 'Title',
metric: 'Metric / Threshold',
email: 'Email Sent'
}
},
alertRules: {
title: 'Alert Rules',
description: 'Create and manage threshold-based system alerts (email-only)',
loading: 'Loading...',
empty: 'No alert rules',
loadFailed: 'Failed to load alert rules',
saveFailed: 'Failed to save alert rule',
deleteFailed: 'Failed to delete alert rule',
create: 'Create Rule',
createTitle: 'Create Alert Rule',
editTitle: 'Edit Alert Rule',
deleteConfirmTitle: 'Delete this rule?',
deleteConfirmMessage: 'This will remove the rule and its related events. Continue?',
metricGroups: {
system: 'System Metrics',
group: 'Group-level Metrics (requires group_id)',
account: 'Account-level Metrics'
},
metrics: {
successRate: 'Success Rate (%)',
errorRate: 'Error Rate (%)',
upstreamErrorRate: 'Upstream Error Rate (%)',
p95: 'P95 Latency (ms)',
p99: 'P99 Latency (ms)',
cpu: 'CPU Usage (%)',
memory: 'Memory Usage (%)',
queueDepth: 'Concurrency Queue Depth',
groupAvailableAccounts: 'Group Available Accounts',
groupAvailableRatio: 'Group Available Ratio (%)',
groupRateLimitRatio: 'Group Rate Limit Ratio (%)',
accountRateLimitedCount: 'Rate-limited Accounts',
accountErrorCount: 'Error Accounts (excluding temporarily unschedulable)',
accountErrorRatio: 'Error Account Ratio (%)',
overloadAccountCount: 'Overloaded Accounts'
},
metricDescriptions: {
successRate: 'Percentage of successful requests in the window (0-100).',
errorRate: 'Percentage of failed requests in the window (0-100).',
upstreamErrorRate: 'Percentage of upstream failures in the window (0-100).',
p95: 'P95 request latency within the window (ms).',
p99: 'P99 request latency within the window (ms).',
cpu: 'Current instance CPU usage (0-100).',
memory: 'Current instance memory usage (0-100).',
queueDepth: 'Concurrency queue depth within the window (queued requests).',
groupAvailableAccounts: 'Number of available accounts in the selected group (requires group_id).',
groupAvailableRatio: 'Available account ratio in the selected group (0-100, requires group_id).',
groupRateLimitRatio: 'Rate-limited account ratio in the selected group (0-100, requires group_id).',
accountRateLimitedCount: 'Number of rate-limited accounts within the window.',
accountErrorCount: 'Number of error accounts within the window (excluding temporarily unschedulable).',
accountErrorRatio: 'Error account ratio within the window (0-100).',
overloadAccountCount: 'Number of overloaded accounts within the window.'
},
hints: {
recommended: 'Recommended: operator {operator}, threshold {threshold}{unit}',
groupRequired: 'This is a group-level metric; selecting a group (group_id) is required.',
groupOptional: 'Optional: limit the rule to a specific group via group_id.'
},
table: {
name: 'Name',
metric: 'Metric',
severity: 'Severity',
enabled: 'Enabled',
actions: 'Actions'
},
form: {
name: 'Name',
description: 'Description',
metric: 'Metric',
operator: 'Operator',
groupId: 'Group (group_id)',
groupPlaceholder: 'Select a group',
allGroups: 'All groups',
threshold: 'Threshold',
severity: 'Severity',
window: 'Window (minutes)',
sustained: 'Sustained (samples)',
cooldown: 'Cooldown (minutes)',
enabled: 'Enabled',
notifyEmail: 'Send email notifications'
},
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid rule',
nameRequired: 'Name is required',
metricRequired: 'Metric is required',
groupIdRequired: 'group_id is required for group-level metrics',
operatorRequired: 'Operator is required',
thresholdRequired: 'Threshold must be a number',
windowRange: 'Window must be one of: 1, 5, 60 minutes',
sustainedRange: 'Sustained must be between 1 and 1440 samples',
cooldownRange: 'Cooldown must be between 0 and 1440 minutes'
}
},
runtime: {
title: 'Ops Runtime Settings',
description: 'Stored in database; changes take effect without editing config files.',
loading: 'Loading...',
noData: 'No runtime settings available',
loadFailed: 'Failed to load runtime settings',
saveSuccess: 'Runtime settings saved',
saveFailed: 'Failed to save runtime settings',
alertTitle: 'Alert Evaluator',
groupAvailabilityTitle: 'Group Availability Monitor',
evalIntervalSeconds: 'Evaluation Interval (seconds)',
silencing: {
title: 'Alert Silencing (Maintenance Mode)',
enabled: 'Enable silencing',
globalUntil: 'Silence until (RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
reason: 'Reason',
reasonPlaceholder: 'e.g., planned maintenance',
entries: {
title: 'Advanced: targeted silencing',
hint: 'Optional: silence only certain rules or severities. Leave fields empty to match all.',
add: 'Add Entry',
empty: 'No targeted entries',
entryTitle: 'Entry #{n}',
ruleId: 'Rule ID (optional)',
ruleIdPlaceholder: 'e.g., 1',
severities: 'Severities (optional)',
severitiesPlaceholder: 'e.g., P0,P1 (empty = all)',
until: 'Until (RFC3339)',
reason: 'Reason',
validation: {
untilRequired: 'Entry until time is required',
untilFormat: 'Entry until time must be a valid RFC3339 timestamp',
ruleIdPositive: 'Entry rule_id must be a positive integer',
severitiesFormat: 'Entry severities must be a comma-separated list of P0..P3'
}
},
validation: {
timeFormat: 'Silence time must be a valid RFC3339 timestamp'
}
},
lockEnabled: 'Distributed Lock Enabled',
lockKey: 'Distributed Lock Key',
lockTTLSeconds: 'Distributed Lock TTL (seconds)',
showAdvancedDeveloperSettings: 'Show advanced developer settings (Distributed Lock)',
advancedSettingsSummary: 'Advanced settings (Distributed Lock)',
evalIntervalHint: 'How often the evaluator runs. Keeping the default is recommended.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid settings',
evalIntervalRange: 'Evaluation interval must be between 1 and 86400 seconds',
lockKeyRequired: 'Distributed lock key is required when lock is enabled',
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
}
},
email: {
title: 'Email Notification',
description: 'Configure alert/report email notifications (stored in database).',
loading: 'Loading...',
noData: 'No email notification config',
loadFailed: 'Failed to load email notification config',
saveSuccess: 'Email notification config saved',
saveFailed: 'Failed to save email notification config',
alertTitle: 'Alert Emails',
reportTitle: 'Report Emails',
recipients: 'Recipients',
      recipientsHint: 'If empty, the system may fall back to the first admin email.',
minSeverity: 'Min Severity',
minSeverityAll: 'All severities',
rateLimitPerHour: 'Rate limit per hour',
batchWindowSeconds: 'Batch window (seconds)',
includeResolved: 'Include resolved alerts',
dailySummary: 'Daily summary',
weeklySummary: 'Weekly summary',
errorDigest: 'Error digest',
errorDigestMinCount: 'Min errors for digest',
accountHealth: 'Account health',
accountHealthThreshold: 'Error rate threshold (%)',
cronPlaceholder: 'Cron expression',
reportHint: 'Schedules use cron syntax; leave empty to use defaults.',
validation: {
title: 'Please fix the following issues',
invalid: 'Invalid email notification config',
alertRecipientsRequired: 'Alert emails are enabled but no recipients are configured',
reportRecipientsRequired: 'Report emails are enabled but no recipients are configured',
invalidRecipients: 'One or more recipient emails are invalid',
rateLimitRange: 'Rate limit per hour must be a number ≥ 0',
batchWindowRange: 'Batch window must be between 0 and 86400 seconds',
cronRequired: 'A cron expression is required when schedule is enabled',
cronFormat: 'Cron expression format looks invalid (expected at least 5 parts)',
digestMinCountRange: 'Min errors for digest must be a number ≥ 0',
accountHealthThresholdRange: 'Account health threshold must be between 0 and 100'
}
},
concurrency: {
title: 'Concurrency / Queue',
byPlatform: 'By Platform',
byGroup: 'By Group',
byAccount: 'By Account',
totalRows: '{count} rows',
disabledHint: 'Realtime monitoring is disabled in settings.',
empty: 'No data',
queued: 'Queue {count}',
rateLimited: 'Rate-limited {count}',
errorAccounts: 'Errors {count}',
loadFailed: 'Failed to load concurrency data'
},
realtime: {
title: 'Realtime',
connected: 'Realtime connected',
connecting: 'Realtime connecting',
reconnecting: 'Realtime reconnecting',
offline: 'Realtime offline',
closed: 'Realtime closed',
reconnectIn: 'retry in {seconds}s'
},
queryMode: {
auto: 'Auto',
raw: 'Raw',
preagg: 'Preagg'
},
accountAvailability: {
available: 'Available',
unavailable: 'Unavailable',
accountError: 'Error'
},
tooltips: {
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.',
goroutines:
'Number of Go runtime goroutines (lightweight threads). There is no absolute “safe” number—use your historical baseline. Heuristic: <2k is common; 2k–8k watch; >8k plus rising queue/latency often suggests blocking/leaks.',
cpu: 'CPU usage percentage, showing system processor load.',
memory: 'Memory usage, including used and total available memory.',
db: 'Database connection pool status, including active, idle, and waiting connections.',
redis: 'Redis connection pool status, showing active and idle connections.',
jobs: 'Background job execution status, including last run time, success time, and error information.',
qps: 'Queries Per Second (QPS) and Tokens Per Second (TPS), real-time system throughput.',
tokens: 'Total number of tokens processed in the current time window.',
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
},
charts: {
emptyRequest: 'No requests in this window.',
emptyError: 'No errors in this window.',
resetZoom: 'Reset',
resetZoomHint: 'Reset zoom (if enabled)',
downloadChart: 'Download',
downloadChartHint: 'Download chart as image'
}
},
// Settings // Settings
settings: { settings: {
title: 'System Settings', title: 'System Settings',
...@@ -1940,6 +2473,22 @@ export default { ...@@ -1940,6 +2473,22 @@ export default {
sending: 'Sending...', sending: 'Sending...',
enterRecipientHint: 'Please enter a recipient email address' enterRecipientHint: 'Please enter a recipient email address'
}, },
opsMonitoring: {
title: 'Ops Monitoring',
description: 'Enable ops monitoring for troubleshooting and health visibility',
disabled: 'Ops monitoring is disabled',
enabled: 'Enable Ops Monitoring',
enabledHint: 'Enable the ops monitoring module (admin only)',
realtimeEnabled: 'Enable Realtime Monitoring',
realtimeEnabledHint: 'Enable realtime QPS/metrics push (WebSocket)',
queryMode: 'Default Query Mode',
queryModeHint: 'Default query mode for Ops Dashboard (auto/raw/preagg)',
queryModeAuto: 'Auto (recommended)',
queryModeRaw: 'Raw (most accurate, slower)',
queryModePreagg: 'Preagg (fastest, requires aggregation)',
metricsInterval: 'Metrics Collection Interval (seconds)',
metricsIntervalHint: 'How often to collect system/request metrics (60-3600 seconds)'
},
adminApiKey: { adminApiKey: {
title: 'Admin API Key', title: 'Admin API Key',
description: 'Global API key for external system integration with full admin access', description: 'Global API key for external system integration with full admin access',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment