Commit df1ef3de authored by ianshaw's avatar ianshaw
Browse files

refactor: 移除 Ops 监控模块

移除未完成的运维监控功能,简化系统架构:
- 删除 ops_handler, ops_service, ops_repo 等后端代码
- 删除 ops 相关数据库迁移文件
- 删除前端 OpsDashboard 页面和 API
parent 45bd9ac7
package admin
import (
"math"
"net/http"
"strconv"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// OpsHandler handles ops dashboard endpoints.
// All data access is delegated to the injected OpsService.
type OpsHandler struct {
	opsService *service.OpsService
}
// NewOpsHandler builds an OpsHandler backed by the given OpsService.
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
	handler := &OpsHandler{opsService: opsService}
	return handler
}
// GetMetrics returns the latest ops metrics snapshot.
// GET /api/v1/admin/ops/metrics
func (h *OpsHandler) GetMetrics(c *gin.Context) {
	metrics, err := h.opsService.GetLatestMetrics(c.Request.Context())
	if err == nil {
		response.Success(c, metrics)
		return
	}
	response.Error(c, http.StatusInternalServerError, "Failed to get ops metrics")
}
// ListMetricsHistory returns a time-range slice of metrics for charts.
// GET /api/v1/admin/ops/metrics/history
//
// Query params:
//   - window_minutes: int (default 1)
//   - minutes: int (lookback; optional; capped at 7 days)
//   - start_time/end_time: RFC3339 timestamps (optional; override minutes when provided)
//   - limit: int (optional; 1-5000; default 300, raised to 1440 when the
//     default 24h range is applied so one point per minute fits)
func (h *OpsHandler) ListMetricsHistory(c *gin.Context) {
	windowMinutes := 1
	if v := c.Query("window_minutes"); v != "" {
		if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
			windowMinutes = parsed
		} else {
			response.BadRequest(c, "Invalid window_minutes")
			return
		}
	}
	limit := 300
	limitProvided := false
	if v := c.Query("limit"); v != "" {
		parsed, err := strconv.Atoi(v)
		if err != nil || parsed <= 0 || parsed > 5000 {
			response.BadRequest(c, "Invalid limit (must be 1-5000)")
			return
		}
		limit = parsed
		limitProvided = true
	}
	endTime := time.Now()
	startTime := time.Time{}
	if startTimeStr := c.Query("start_time"); startTimeStr != "" {
		parsed, err := time.Parse(time.RFC3339, startTimeStr)
		if err != nil {
			response.BadRequest(c, "Invalid start_time format (RFC3339)")
			return
		}
		startTime = parsed
	}
	if endTimeStr := c.Query("end_time"); endTimeStr != "" {
		parsed, err := time.Parse(time.RFC3339, endTimeStr)
		if err != nil {
			response.BadRequest(c, "Invalid end_time format (RFC3339)")
			return
		}
		endTime = parsed
	}
	// If an explicit range was not provided, fall back to lookback minutes.
	if startTime.IsZero() {
		if v := c.Query("minutes"); v != "" {
			minutes, err := strconv.Atoi(v)
			if err != nil || minutes <= 0 {
				response.BadRequest(c, "Invalid minutes")
				return
			}
			if minutes > 60*24*7 {
				minutes = 60 * 24 * 7 // clamp lookback to one week
			}
			startTime = endTime.Add(-time.Duration(minutes) * time.Minute)
		}
	}
	// Default time range: last 24 hours.
	if startTime.IsZero() {
		startTime = endTime.Add(-24 * time.Hour)
		if !limitProvided {
			// Metrics are collected at 1-minute cadence; 24h requires ~1440 points.
			limit = 24 * 60
		}
	}
	if startTime.After(endTime) {
		response.BadRequest(c, "Invalid time range: start_time must be <= end_time")
		return
	}
	items, err := h.opsService.ListMetricsHistory(c.Request.Context(), windowMinutes, startTime, endTime, limit)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to list ops metrics history")
		return
	}
	response.Success(c, gin.H{"items": items})
}
// ListErrorLogs lists recent error logs with optional filters.
// GET /api/v1/admin/ops/error-logs
//
// Query params:
//   - start_time/end_time: RFC3339 timestamps (optional)
//   - platform, phase, severity: exact-match string filters (optional)
//   - q: fuzzy-match string (optional)
//   - limit: int (optional; default 100; max 500)
func (h *OpsHandler) ListErrorLogs(c *gin.Context) {
	var filters service.OpsErrorLogFilters
	if raw := c.Query("start_time"); raw != "" {
		ts, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			response.BadRequest(c, "Invalid start_time format (RFC3339)")
			return
		}
		filters.StartTime = &ts
	}
	if raw := c.Query("end_time"); raw != "" {
		ts, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			response.BadRequest(c, "Invalid end_time format (RFC3339)")
			return
		}
		filters.EndTime = &ts
	}
	if filters.StartTime != nil && filters.EndTime != nil && filters.StartTime.After(*filters.EndTime) {
		response.BadRequest(c, "Invalid time range: start_time must be <= end_time")
		return
	}
	filters.Platform = c.Query("platform")
	filters.Phase = c.Query("phase")
	filters.Severity = c.Query("severity")
	filters.Query = c.Query("q")
	filters.Limit = 100
	if raw := c.Query("limit"); raw != "" {
		parsed, err := strconv.Atoi(raw)
		if err != nil || parsed <= 0 || parsed > 500 {
			response.BadRequest(c, "Invalid limit (must be 1-500)")
			return
		}
		filters.Limit = parsed
	}
	items, total, err := h.opsService.ListErrorLogs(c.Request.Context(), filters)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to list error logs")
		return
	}
	response.Success(c, gin.H{
		"items": items,
		"total": total,
	})
}
// GetDashboardOverview returns realtime ops dashboard overview.
// GET /api/v1/admin/ops/dashboard/overview
//
// Query params:
//   - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h
func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
	timeRange := c.Query("time_range")
	if timeRange == "" {
		timeRange = "1h"
	}
	allowed := map[string]bool{"5m": true, "30m": true, "1h": true, "6h": true, "24h": true}
	if !allowed[timeRange] {
		response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)")
		return
	}
	data, err := h.opsService.GetDashboardOverview(c.Request.Context(), timeRange)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get dashboard overview")
		return
	}
	response.Success(c, data)
}
// GetProviderHealth returns upstream provider health comparison data.
// GET /api/v1/admin/ops/dashboard/providers
//
// Query params:
//   - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h
func (h *OpsHandler) GetProviderHealth(c *gin.Context) {
	timeRange := c.Query("time_range")
	if timeRange == "" {
		timeRange = "1h"
	}
	switch timeRange {
	case "5m", "30m", "1h", "6h", "24h":
	default:
		response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)")
		return
	}
	providers, err := h.opsService.GetProviderHealth(c.Request.Context(), timeRange)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get provider health")
		return
	}
	var (
		requestTotal int64   // all requests across providers
		successSum   float64 // success fraction weighted by request count
		best, worst  string
		bestRate     float64
		worstRate    float64
		seeded       bool // whether best/worst have been initialized
	)
	for _, provider := range providers {
		if provider == nil {
			continue
		}
		requestTotal += provider.RequestCount
		successSum += (provider.SuccessRate / 100) * float64(provider.RequestCount)
		// Providers with no traffic do not compete for best/worst.
		if provider.RequestCount <= 0 {
			continue
		}
		if !seeded {
			seeded = true
			best, worst = provider.Name, provider.Name
			bestRate, worstRate = provider.SuccessRate, provider.SuccessRate
			continue
		}
		if provider.SuccessRate > bestRate {
			best, bestRate = provider.Name, provider.SuccessRate
		}
		if provider.SuccessRate < worstRate {
			worst, worstRate = provider.Name, provider.SuccessRate
		}
	}
	avgSuccessRate := 0.0
	if requestTotal > 0 {
		avgSuccessRate = (successSum / float64(requestTotal)) * 100
		avgSuccessRate = math.Round(avgSuccessRate*100) / 100 // round to 2 decimals
	}
	response.Success(c, gin.H{
		"providers": providers,
		"summary": gin.H{
			"total_requests":   requestTotal,
			"avg_success_rate": avgSuccessRate,
			"best_provider":    best,
			"worst_provider":   worst,
		},
	})
}
// GetErrorLogs returns a paginated error log list with multi-dimensional filters.
// GET /api/v1/admin/ops/errors
//
// Query params: start_time/end_time (RFC3339), error_code, provider
// (alias: platform), account_id, plus standard pagination.
func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
	page, pageSize := response.ParsePagination(c)
	filter := &service.ErrorLogFilter{Page: page, PageSize: pageSize}
	if raw := c.Query("start_time"); raw != "" {
		ts, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			response.BadRequest(c, "Invalid start_time format (RFC3339)")
			return
		}
		filter.StartTime = &ts
	}
	if raw := c.Query("end_time"); raw != "" {
		ts, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			response.BadRequest(c, "Invalid end_time format (RFC3339)")
			return
		}
		filter.EndTime = &ts
	}
	if filter.StartTime != nil && filter.EndTime != nil && filter.StartTime.After(*filter.EndTime) {
		response.BadRequest(c, "Invalid time range: start_time must be <= end_time")
		return
	}
	if raw := c.Query("error_code"); raw != "" {
		code, err := strconv.Atoi(raw)
		if err != nil || code < 0 {
			response.BadRequest(c, "Invalid error_code")
			return
		}
		filter.ErrorCode = &code
	}
	// Keep both parameter names for compatibility: provider (docs) and platform (legacy).
	if filter.Provider = c.Query("provider"); filter.Provider == "" {
		filter.Provider = c.Query("platform")
	}
	if raw := c.Query("account_id"); raw != "" {
		accountID, err := strconv.ParseInt(raw, 10, 64)
		if err != nil || accountID <= 0 {
			response.BadRequest(c, "Invalid account_id")
			return
		}
		filter.AccountID = &accountID
	}
	out, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get error logs")
		return
	}
	response.Success(c, gin.H{
		"errors":    out.Errors,
		"total":     out.Total,
		"page":      out.Page,
		"page_size": out.PageSize,
	})
}
// GetLatencyHistogram returns the latency distribution histogram.
// GET /api/v1/admin/ops/dashboard/latency-histogram
//
// Query params:
//   - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h
func (h *OpsHandler) GetLatencyHistogram(c *gin.Context) {
	timeRange := c.Query("time_range")
	if timeRange == "" {
		timeRange = "1h"
	}
	// Validate like the sibling dashboard endpoints instead of passing
	// arbitrary strings through to the service layer.
	switch timeRange {
	case "5m", "30m", "1h", "6h", "24h":
	default:
		response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)")
		return
	}
	buckets, err := h.opsService.GetLatencyHistogram(c.Request.Context(), timeRange)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get latency histogram")
		return
	}
	totalRequests := int64(0)
	for _, b := range buckets {
		totalRequests += b.Count
	}
	response.Success(c, gin.H{
		"buckets":                buckets,
		"total_requests":         totalRequests,
		"slow_request_threshold": 1000,
	})
}
// GetErrorDistribution returns the error distribution.
// GET /api/v1/admin/ops/dashboard/errors/distribution
//
// Query params:
//   - time_range: string (optional; default "1h") one of: 5m, 30m, 1h, 6h, 24h
func (h *OpsHandler) GetErrorDistribution(c *gin.Context) {
	timeRange := c.Query("time_range")
	if timeRange == "" {
		timeRange = "1h"
	}
	// Validate like the sibling dashboard endpoints instead of passing
	// arbitrary strings through to the service layer.
	switch timeRange {
	case "5m", "30m", "1h", "6h", "24h":
	default:
		response.BadRequest(c, "Invalid time_range (supported: 5m, 30m, 1h, 6h, 24h)")
		return
	}
	items, err := h.opsService.GetErrorDistribution(c.Request.Context(), timeRange)
	if err != nil {
		response.Error(c, http.StatusInternalServerError, "Failed to get error distribution")
		return
	}
	response.Success(c, gin.H{
		"items": items,
	})
}
package admin
import (
"context"
"encoding/json"
"log"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"strconv"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/gorilla/websocket"
)
// OpsWSProxyConfig controls how the ops WebSocket endpoint treats
// reverse-proxy headers and the browser Origin header.
type OpsWSProxyConfig struct {
	TrustProxy     bool           // whether X-Forwarded-* headers may ever be trusted
	TrustedProxies []netip.Prefix // peer prefixes allowed to set forwarded headers
	OriginPolicy   string         // handling of a missing Origin: "strict" or "permissive"
}

// Environment variable names used to override the defaults at startup.
const (
	envOpsWSTrustProxy     = "OPS_WS_TRUST_PROXY"
	envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
	envOpsWSOriginPolicy   = "OPS_WS_ORIGIN_POLICY"
)

// Origin policies: strict rejects requests without an Origin header;
// permissive accepts them (non-browser clients typically omit Origin).
const (
	OriginPolicyStrict     = "strict"
	OriginPolicyPermissive = "permissive"
)

// opsWSProxyConfig is loaded once at package init from the environment.
var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()

// upgrader performs the HTTP->WebSocket upgrade; origin checking is
// delegated to isAllowedOpsWSOrigin.
var upgrader = websocket.Upgrader{
	CheckOrigin: func(r *http.Request) bool {
		return isAllowedOpsWSOrigin(r)
	},
}
// QPSWSHandler handles realtime QPS push via WebSocket.
// GET /api/v1/admin/ops/ws/qps
//
// Pushes a "qps_update" JSON message every 2 seconds and a ping every 30s,
// exiting when a write fails or the request context is cancelled.
//
// NOTE(review): no goroutine reads from the connection, so the pong handler
// can never fire and the 60s read deadline is never refreshed; peer
// disconnects are only detected when a write fails. Consider adding a reader
// pump that discards incoming frames — TODO confirm intended behavior.
func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		log.Printf("[OpsWS] upgrade failed: %v", err)
		return
	}
	defer func() { _ = conn.Close() }()
	// Arm an initial read deadline and refresh it on each pong.
	if err := conn.SetReadDeadline(time.Now().Add(60 * time.Second)); err != nil {
		log.Printf("[OpsWS] set read deadline failed: %v", err)
		return
	}
	conn.SetPongHandler(func(string) error {
		return conn.SetReadDeadline(time.Now().Add(60 * time.Second))
	})
	// Push QPS data every 2 seconds
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	// Heartbeat ping every 30 seconds
	pingTicker := time.NewTicker(30 * time.Second)
	defer pingTicker.Stop()
	ctx, cancel := context.WithCancel(c.Request.Context())
	defer cancel()
	for {
		select {
		case <-ticker.C:
			// Fetch the 5m-window overview to derive the current QPS/TPS.
			data, err := h.opsService.GetDashboardOverview(ctx, "5m")
			if err != nil {
				log.Printf("[OpsWS] get overview failed: %v", err)
				continue
			}
			payload := gin.H{
				"type":      "qps_update",
				"timestamp": time.Now().Format(time.RFC3339),
				"data": gin.H{
					"qps": data.QPS.Current,
					"tps": data.TPS.Current,
					// NOTE(review): request_count is an approximation built from
					// error totals plus the hourly QPS average, not a real counter.
					"request_count": data.Errors.TotalCount + int64(data.QPS.Avg1h*60), // Rough estimate
				},
			}
			// Marshal error intentionally ignored: payload is plain primitives.
			msg, _ := json.Marshal(payload)
			if err := conn.WriteMessage(websocket.TextMessage, msg); err != nil {
				log.Printf("[OpsWS] write failed: %v", err)
				return
			}
		case <-pingTicker.C:
			if err := conn.WriteMessage(websocket.PingMessage, nil); err != nil {
				log.Printf("[OpsWS] ping failed: %v", err)
				return
			}
		case <-ctx.Done():
			return
		}
	}
}
// isAllowedOpsWSOrigin reports whether the request's Origin header matches
// the effective request host. Requests without an Origin are accepted unless
// the configured policy is strict. X-Forwarded-Host is only honored when the
// peer is a trusted proxy.
func isAllowedOpsWSOrigin(r *http.Request) bool {
	if r == nil {
		return false
	}
	origin := strings.TrimSpace(r.Header.Get("Origin"))
	if origin == "" {
		// Non-browser clients typically omit Origin; the policy decides.
		policy := strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy))
		return policy != OriginPolicyStrict
	}
	parsed, err := url.Parse(origin)
	if err != nil || parsed.Hostname() == "" {
		return false
	}
	effectiveHost := hostWithoutPort(r.Host)
	if shouldTrustOpsWSProxyHeaders(r) {
		if forwarded := strings.TrimSpace(r.Header.Get("X-Forwarded-Host")); forwarded != "" {
			// Only the first entry of a comma-separated forwarded list counts.
			if first := strings.TrimSpace(strings.Split(forwarded, ",")[0]); first != "" {
				effectiveHost = hostWithoutPort(first)
			}
		}
	}
	effectiveHost = strings.ToLower(effectiveHost)
	if effectiveHost == "" {
		return false
	}
	return strings.ToLower(parsed.Hostname()) == effectiveHost
}
// shouldTrustOpsWSProxyHeaders reports whether forwarded headers on this
// request may be believed: proxy trust must be enabled and the peer address
// must fall inside one of the configured trusted prefixes.
func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
	if r == nil || !opsWSProxyConfig.TrustProxy {
		return false
	}
	peer, ok := requestPeerIP(r)
	return ok && isAddrInTrustedProxies(peer, opsWSProxyConfig.TrustedProxies)
}
func requestPeerIP(r *http.Request) (netip.Addr, bool) {
if r == nil {
return netip.Addr{}, false
}
host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
if err != nil {
host = strings.TrimSpace(r.RemoteAddr)
}
host = strings.TrimPrefix(host, "[")
host = strings.TrimSuffix(host, "]")
if host == "" {
return netip.Addr{}, false
}
addr, err := netip.ParseAddr(host)
if err != nil {
return netip.Addr{}, false
}
return addr.Unmap(), true
}
func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
if !addr.IsValid() {
return false
}
for _, p := range trusted {
if p.Contains(addr) {
return true
}
}
return false
}
// loadOpsWSProxyConfigFromEnv builds the WS proxy config, starting from safe
// defaults (trust loopback proxies, permissive origin policy) and applying
// OPS_WS_* environment overrides. Invalid values are logged and ignored
// rather than failing startup.
func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
	cfg := OpsWSProxyConfig{
		TrustProxy:     true,
		TrustedProxies: defaultTrustedProxies(),
		OriginPolicy:   OriginPolicyPermissive,
	}
	if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" {
		if parsed, err := strconv.ParseBool(v); err == nil {
			cfg.TrustProxy = parsed
		} else {
			log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy)
		}
	}
	if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
		prefixes, invalid := parseTrustedProxyList(raw)
		if len(invalid) > 0 {
			log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
		}
		// Note: a fully-invalid list yields an empty set (trust nothing),
		// not the loopback defaults.
		cfg.TrustedProxies = prefixes
	}
	if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" {
		// Policy comparison is case-insensitive.
		normalized := strings.ToLower(v)
		switch normalized {
		case OriginPolicyStrict, OriginPolicyPermissive:
			cfg.OriginPolicy = normalized
		default:
			log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
		}
	}
	return cfg
}
// defaultTrustedProxies returns the built-in trusted set: IPv4 and IPv6
// loopback ranges only.
func defaultTrustedProxies() []netip.Prefix {
	trusted, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
	return trusted
}
func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
for _, token := range strings.Split(raw, ",") {
item := strings.TrimSpace(token)
if item == "" {
continue
}
var (
p netip.Prefix
err error
)
if strings.Contains(item, "/") {
p, err = netip.ParsePrefix(item)
} else {
var addr netip.Addr
addr, err = netip.ParseAddr(item)
if err == nil {
addr = addr.Unmap()
bits := 128
if addr.Is4() {
bits = 32
}
p = netip.PrefixFrom(addr, bits)
}
}
if err != nil || !p.IsValid() {
invalid = append(invalid, item)
continue
}
prefixes = append(prefixes, p.Masked())
}
return prefixes, invalid
}
// hostWithoutPort strips an optional :port from a Host header value.
// It handles "host", "host:port", "[v6]", "[v6]:port", and bare IPv6
// literals. Returns "" only for empty input.
func hostWithoutPort(hostport string) string {
	hostport = strings.TrimSpace(hostport)
	if hostport == "" {
		return ""
	}
	if host, _, err := net.SplitHostPort(hostport); err == nil {
		return host
	}
	if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
		return strings.Trim(hostport, "[]")
	}
	// A bare IPv6 literal (no brackets, no port) contains multiple colons;
	// taking the first ":"-segment would yield "" and wrongly reject the host.
	if strings.Count(hostport, ":") > 1 {
		return hostport
	}
	host, _, _ := strings.Cut(hostport, ":")
	return host
}
package admin
import (
"net/http"
"net/netip"
"testing"
)
// TestIsAllowedOpsWSOrigin_AllowsEmptyOrigin verifies that a request without
// an Origin header is accepted under the permissive policy.
func TestIsAllowedOpsWSOrigin_AllowsEmptyOrigin(t *testing.T) {
	// Swap in a temporary package-level config; restore after the test.
	original := opsWSProxyConfig
	t.Cleanup(func() { opsWSProxyConfig = original })
	opsWSProxyConfig = OpsWSProxyConfig{OriginPolicy: OriginPolicyPermissive}
	req, err := http.NewRequest(http.MethodGet, "http://example.test", nil)
	if err != nil {
		t.Fatalf("NewRequest: %v", err)
	}
	if !isAllowedOpsWSOrigin(req) {
		t.Fatalf("expected empty Origin to be allowed")
	}
}
// TestIsAllowedOpsWSOrigin_RejectsEmptyOrigin_WhenStrict verifies that the
// strict policy rejects requests that carry no Origin header.
func TestIsAllowedOpsWSOrigin_RejectsEmptyOrigin_WhenStrict(t *testing.T) {
	// Swap in a temporary package-level config; restore after the test.
	original := opsWSProxyConfig
	t.Cleanup(func() { opsWSProxyConfig = original })
	opsWSProxyConfig = OpsWSProxyConfig{OriginPolicy: OriginPolicyStrict}
	req, err := http.NewRequest(http.MethodGet, "http://example.test", nil)
	if err != nil {
		t.Fatalf("NewRequest: %v", err)
	}
	if isAllowedOpsWSOrigin(req) {
		t.Fatalf("expected empty Origin to be rejected under strict policy")
	}
}
// TestIsAllowedOpsWSOrigin_UsesXForwardedHostOnlyFromTrustedProxy verifies
// that X-Forwarded-Host only influences Origin validation when the direct
// peer address falls inside a trusted proxy prefix.
func TestIsAllowedOpsWSOrigin_UsesXForwardedHostOnlyFromTrustedProxy(t *testing.T) {
	// Swap in a temporary package-level config; restore after the test.
	original := opsWSProxyConfig
	t.Cleanup(func() { opsWSProxyConfig = original })
	opsWSProxyConfig = OpsWSProxyConfig{
		TrustProxy: true,
		TrustedProxies: []netip.Prefix{
			netip.MustParsePrefix("127.0.0.0/8"),
		},
	}
	// Untrusted peer: ignore X-Forwarded-Host and compare against r.Host.
	{
		req, err := http.NewRequest(http.MethodGet, "http://internal.service.local", nil)
		if err != nil {
			t.Fatalf("NewRequest: %v", err)
		}
		req.RemoteAddr = "192.0.2.1:12345"
		req.Host = "internal.service.local"
		req.Header.Set("Origin", "https://public.example.com")
		req.Header.Set("X-Forwarded-Host", "public.example.com")
		if isAllowedOpsWSOrigin(req) {
			t.Fatalf("expected Origin to be rejected when peer is not a trusted proxy")
		}
	}
	// Trusted peer: allow X-Forwarded-Host to participate in Origin validation.
	{
		req, err := http.NewRequest(http.MethodGet, "http://internal.service.local", nil)
		if err != nil {
			t.Fatalf("NewRequest: %v", err)
		}
		req.RemoteAddr = "127.0.0.1:23456"
		req.Host = "internal.service.local"
		req.Header.Set("Origin", "https://public.example.com")
		req.Header.Set("X-Forwarded-Host", "public.example.com")
		if !isAllowedOpsWSOrigin(req) {
			t.Fatalf("expected Origin to be accepted when peer is a trusted proxy")
		}
	}
}
// TestLoadOpsWSProxyConfigFromEnv_OriginPolicy verifies that the origin
// policy env value is matched case-insensitively.
func TestLoadOpsWSProxyConfigFromEnv_OriginPolicy(t *testing.T) {
	t.Setenv(envOpsWSOriginPolicy, "STRICT")
	cfg := loadOpsWSProxyConfigFromEnv()
	if cfg.OriginPolicy != OriginPolicyStrict {
		t.Fatalf("OriginPolicy=%q, want %q", cfg.OriginPolicy, OriginPolicyStrict)
	}
}
// TestLoadOpsWSProxyConfigFromEnv_OriginPolicyInvalidUsesDefault verifies
// that an unrecognized policy value falls back to the permissive default.
func TestLoadOpsWSProxyConfigFromEnv_OriginPolicyInvalidUsesDefault(t *testing.T) {
	t.Setenv(envOpsWSOriginPolicy, "nope")
	cfg := loadOpsWSProxyConfigFromEnv()
	if cfg.OriginPolicy != OriginPolicyPermissive {
		t.Fatalf("OriginPolicy=%q, want %q", cfg.OriginPolicy, OriginPolicyPermissive)
	}
}
// TestParseTrustedProxyList verifies that bare addresses and CIDR entries
// parse while malformed entries are reported as invalid.
func TestParseTrustedProxyList(t *testing.T) {
	prefixes, invalid := parseTrustedProxyList("10.0.0.1, 10.0.0.0/8, bad, ::1/128")
	if len(prefixes) != 3 {
		t.Fatalf("prefixes=%d, want 3", len(prefixes))
	}
	if len(invalid) != 1 || invalid[0] != "bad" {
		t.Fatalf("invalid=%v, want [bad]", invalid)
	}
}
// TestRequestPeerIP_ParsesIPv6 verifies that a bracketed IPv6 RemoteAddr
// with a port parses to the bare address.
func TestRequestPeerIP_ParsesIPv6(t *testing.T) {
	req, err := http.NewRequest(http.MethodGet, "http://example.test", nil)
	if err != nil {
		t.Fatalf("NewRequest: %v", err)
	}
	req.RemoteAddr = "[::1]:1234"
	addr, ok := requestPeerIP(req)
	if !ok {
		t.Fatalf("expected IPv6 peer IP to parse")
	}
	if addr != netip.MustParseAddr("::1") {
		t.Fatalf("addr=%s, want ::1", addr)
	}
}
package handler
import (
"context"
"strings"
"sync"
"time"
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// Context keys used to stash per-request model/stream info for ops logging.
const (
	opsModelKey  = "ops_model"
	opsStreamKey = "ops_stream"
)

// Async error-log pipeline sizing: a fixed worker pool drains a bounded
// queue, and each DB write gets a short timeout so slow storage cannot
// back up request handling.
const (
	opsErrorLogWorkerCount = 10
	opsErrorLogQueueSize   = 256
	opsErrorLogTimeout     = 2 * time.Second
)

// opsErrorLogJob pairs an error entry with the service that persists it.
type opsErrorLogJob struct {
	ops   *service.OpsService
	entry *service.OpsErrorLog
}

// The worker pool is started lazily, exactly once, on first enqueue.
var (
	opsErrorLogOnce  sync.Once
	opsErrorLogQueue chan opsErrorLogJob
)
// startOpsErrorLogWorkers creates the job queue and launches the fixed-size
// worker pool. Workers persist jobs sequentially with a per-job timeout and
// run for the life of the process (the queue is never closed).
func startOpsErrorLogWorkers() {
	opsErrorLogQueue = make(chan opsErrorLogJob, opsErrorLogQueueSize)
	for i := 0; i < opsErrorLogWorkerCount; i++ {
		go func() {
			for job := range opsErrorLogQueue {
				if job.ops == nil || job.entry == nil {
					continue
				}
				ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
				// Best-effort persistence; a failed insert is intentionally dropped.
				_ = job.ops.RecordError(ctx, job.entry)
				cancel()
			}
		}()
	}
}
// enqueueOpsErrorLog hands an error entry to the background worker pool.
// The first call lazily starts the workers. When the queue is full the entry
// is dropped so request handling is never blocked by ops logging.
func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsErrorLog) {
	if ops == nil || entry == nil {
		return
	}
	opsErrorLogOnce.Do(startOpsErrorLogWorkers)
	select {
	case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry}:
	default:
		// Queue is full; drop to avoid blocking request handling.
	}
}
// setOpsRequestContext records the requested model and stream flag on the
// gin context so recordOpsError can attach them to error logs later.
func setOpsRequestContext(c *gin.Context, model string, stream bool) {
	c.Set(opsModelKey, model)
	c.Set(opsStreamKey, stream)
}
// recordOpsError builds an OpsErrorLog from the request context and queues it
// for asynchronous persistence. It is a no-op when ops or c is nil.
//
// Model/stream values come from setOpsRequestContext; identity fields
// (user, API key, group) are taken from the authenticated key when present.
func recordOpsError(c *gin.Context, ops *service.OpsService, status int, errType, message, fallbackPlatform string) {
	if ops == nil || c == nil {
		return
	}
	model, _ := c.Get(opsModelKey)
	stream, _ := c.Get(opsStreamKey)
	var modelName string
	if m, ok := model.(string); ok {
		modelName = m
	}
	streaming, _ := stream.(bool)
	apiKey, _ := middleware2.GetAPIKeyFromContext(c)
	logEntry := &service.OpsErrorLog{
		Phase:      classifyOpsPhase(errType, message),
		Type:       errType,
		Severity:   classifyOpsSeverity(errType, status),
		StatusCode: status,
		Platform:   resolveOpsPlatform(apiKey, fallbackPlatform),
		Model:      modelName,
		RequestID:  c.Writer.Header().Get("x-request-id"),
		Message:    message,
		ClientIP:   c.ClientIP(),
		// Guard against a nil Request/URL (possible in tests or synthetic contexts).
		RequestPath: func() string {
			if c.Request != nil && c.Request.URL != nil {
				return c.Request.URL.Path
			}
			return ""
		}(),
		Stream: streaming,
	}
	if apiKey != nil {
		logEntry.APIKeyID = &apiKey.ID
		if apiKey.User != nil {
			logEntry.UserID = &apiKey.User.ID
		}
		if apiKey.GroupID != nil {
			logEntry.GroupID = apiKey.GroupID
		}
	}
	enqueueOpsErrorLog(ops, logEntry)
}
// resolveOpsPlatform picks the platform from the key's group when available,
// otherwise returning the caller-supplied fallback.
func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
	if apiKey == nil || apiKey.Group == nil || apiKey.Group.Platform == "" {
		return fallback
	}
	return apiKey.Group.Platform
}
// classifyOpsPhase maps an error type (plus message hints) to the pipeline
// phase where the error originated: auth, billing, concurrency, scheduling,
// response, upstream, or internal.
func classifyOpsPhase(errType, message string) string {
	lowered := strings.ToLower(message)
	switch errType {
	case "authentication_error":
		return "auth"
	case "billing_error", "subscription_error":
		return "billing"
	case "invalid_request_error":
		return "response"
	case "upstream_error", "overloaded_error":
		return "upstream"
	case "rate_limit_error":
		// Concurrency-limit rejections also surface as rate-limit errors;
		// distinguish them by message content.
		if strings.Contains(lowered, "concurrency") || strings.Contains(lowered, "pending") {
			return "concurrency"
		}
		return "upstream"
	case "api_error":
		if strings.Contains(lowered, "no available accounts") {
			return "scheduling"
		}
	}
	return "internal"
}
// classifyOpsSeverity maps an error type and HTTP status to a severity tier:
// caller-side error types are always P3; otherwise 5xx/429 is P1, other 4xx
// is P2, and everything else is P3.
func classifyOpsSeverity(errType string, status int) string {
	switch errType {
	case "invalid_request_error", "authentication_error", "billing_error", "subscription_error":
		return "P3"
	}
	switch {
	case status >= 500 || status == 429:
		return "P1"
	case status >= 400:
		return "P2"
	default:
		return "P3"
	}
}
package repository
import (
"context"
"database/sql"
"fmt"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// ListErrorLogs queries ops_error_logs with optional filters and pagination.
// It returns the list items and the total count of matching rows.
//
// Pagination defaults to page 1 / size 20 and caps the page size at 100.
// When neither time bound is set, the filter is mutated to default to the
// last 24 hours (note: this writes back into the caller's filter).
func (r *OpsRepository) ListErrorLogs(ctx context.Context, filter *service.ErrorLogFilter) ([]*service.ErrorLog, int64, error) {
	page := 1
	pageSize := 20
	if filter != nil {
		if filter.Page > 0 {
			page = filter.Page
		}
		if filter.PageSize > 0 {
			pageSize = filter.PageSize
		}
	}
	if pageSize > 100 {
		pageSize = 100
	}
	offset := (page - 1) * pageSize
	// WHERE clauses and their positional args are built together so the
	// $N placeholders always line up with len(args).
	conditions := make([]string, 0)
	args := make([]any, 0)
	addCondition := func(condition string, values ...any) {
		conditions = append(conditions, condition)
		args = append(args, values...)
	}
	if filter != nil {
		// Default to the most recent 24 hours when no time range is given.
		if filter.StartTime == nil && filter.EndTime == nil {
			defaultStart := time.Now().Add(-24 * time.Hour)
			filter.StartTime = &defaultStart
		}
		if filter.StartTime != nil {
			addCondition(fmt.Sprintf("created_at >= $%d", len(args)+1), *filter.StartTime)
		}
		if filter.EndTime != nil {
			addCondition(fmt.Sprintf("created_at <= $%d", len(args)+1), *filter.EndTime)
		}
		if filter.ErrorCode != nil {
			addCondition(fmt.Sprintf("status_code = $%d", len(args)+1), *filter.ErrorCode)
		}
		if provider := strings.TrimSpace(filter.Provider); provider != "" {
			addCondition(fmt.Sprintf("platform = $%d", len(args)+1), provider)
		}
		if filter.AccountID != nil {
			addCondition(fmt.Sprintf("account_id = $%d", len(args)+1), *filter.AccountID)
		}
	}
	where := ""
	if len(conditions) > 0 {
		where = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Total count first (same WHERE clause, no pagination).
	countQuery := fmt.Sprintf(`SELECT COUNT(1) FROM ops_error_logs %s`, where)
	var total int64
	if err := scanSingleRow(ctx, r.sql, countQuery, args, &total); err != nil {
		if err == sql.ErrNoRows {
			total = 0
		} else {
			return nil, 0, err
		}
	}
	listQuery := fmt.Sprintf(`
SELECT
	id,
	created_at,
	severity,
	request_id,
	account_id,
	request_path,
	platform,
	model,
	status_code,
	error_message,
	duration_ms,
	retry_count,
	stream
FROM ops_error_logs
%s
ORDER BY created_at DESC
LIMIT $%d OFFSET $%d
`, where, len(args)+1, len(args)+2)
	// Copy args so pagination params don't mutate the shared slice.
	listArgs := append(append([]any{}, args...), pageSize, offset)
	rows, err := r.sql.QueryContext(ctx, listQuery, listArgs...)
	if err != nil {
		return nil, 0, err
	}
	defer func() { _ = rows.Close() }()
	results := make([]*service.ErrorLog, 0)
	for rows.Next() {
		// Nullable columns are scanned into sql.Null* and mapped below.
		var (
			id         int64
			createdAt  time.Time
			severity   sql.NullString
			requestID  sql.NullString
			accountID  sql.NullInt64
			requestURI sql.NullString
			platform   sql.NullString
			model      sql.NullString
			statusCode sql.NullInt64
			message    sql.NullString
			durationMs sql.NullInt64
			retryCount sql.NullInt64
			stream     sql.NullBool
		)
		if err := rows.Scan(
			&id,
			&createdAt,
			&severity,
			&requestID,
			&accountID,
			&requestURI,
			&platform,
			&model,
			&statusCode,
			&message,
			&durationMs,
			&retryCount,
			&stream,
		); err != nil {
			return nil, 0, err
		}
		entry := &service.ErrorLog{
			ID:        id,
			Timestamp: createdAt,
			Level:     levelFromSeverity(severity.String),
			RequestID: requestID.String,
			APIPath:   requestURI.String,
			Provider:  platform.String,
			Model:     model.String,
			HTTPCode:  int(statusCode.Int64),
			Stream:    stream.Bool,
		}
		if accountID.Valid {
			entry.AccountID = strconv.FormatInt(accountID.Int64, 10)
		}
		if message.Valid {
			entry.ErrorMessage = message.String
		}
		if durationMs.Valid {
			v := int(durationMs.Int64)
			entry.DurationMs = &v
		}
		if retryCount.Valid {
			v := int(retryCount.Int64)
			entry.RetryCount = &v
		}
		results = append(results, entry)
	}
	if err := rows.Err(); err != nil {
		return nil, 0, err
	}
	return results, total, nil
}
// levelFromSeverity maps a Px severity label (case-insensitive, whitespace
// tolerated) to a log level string. P0/P1 -> CRITICAL, P3 -> WARN, and P2 or
// anything unrecognized -> ERROR.
func levelFromSeverity(severity string) string {
	switch strings.ToUpper(strings.TrimSpace(severity)) {
	case "P0", "P1":
		return "CRITICAL"
	case "P3":
		return "WARN"
	default:
		return "ERROR"
	}
}
package repository
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/redis/go-redis/v9"
)
// Redis keys and TTL for the hot ops-dashboard cache.
const (
	opsLatestMetricsKey           = "ops:metrics:latest"
	opsDashboardOverviewKeyPrefix = "ops:dashboard:overview:" // suffixed with the time range
	opsLatestMetricsTTL           = 10 * time.Second
)
// GetCachedLatestSystemMetric returns the cached latest metrics snapshot, or
// (nil, nil) on a cache miss or when Redis is not configured — the cache is
// strictly optional.
func (r *OpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (*service.OpsMetrics, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if r == nil || r.rdb == nil {
		return nil, nil
	}
	data, err := r.rdb.Get(ctx, opsLatestMetricsKey).Bytes()
	if errors.Is(err, redis.Nil) {
		// Cache miss is not an error.
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("redis get cached latest system metric: %w", err)
	}
	var metric service.OpsMetrics
	if err := json.Unmarshal(data, &metric); err != nil {
		return nil, fmt.Errorf("unmarshal cached latest system metric: %w", err)
	}
	return &metric, nil
}
// SetCachedLatestSystemMetric stores the metrics snapshot in Redis with a
// short TTL. A nil metric or missing Redis client is a silent no-op.
func (r *OpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric *service.OpsMetrics) error {
	if metric == nil {
		return nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if r == nil || r.rdb == nil {
		return nil
	}
	data, err := json.Marshal(metric)
	if err != nil {
		return fmt.Errorf("marshal cached latest system metric: %w", err)
	}
	return r.rdb.Set(ctx, opsLatestMetricsKey, data, opsLatestMetricsTTL).Err()
}
// GetCachedDashboardOverview returns the cached overview for the given time
// range (empty range defaults to "1h"), or (nil, nil) on a cache miss or
// when Redis is not configured.
func (r *OpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (*service.DashboardOverviewData, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if r == nil || r.rdb == nil {
		return nil, nil
	}
	rangeKey := strings.TrimSpace(timeRange)
	if rangeKey == "" {
		rangeKey = "1h"
	}
	key := opsDashboardOverviewKeyPrefix + rangeKey
	data, err := r.rdb.Get(ctx, key).Bytes()
	if errors.Is(err, redis.Nil) {
		// Cache miss is not an error.
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("redis get cached dashboard overview: %w", err)
	}
	var overview service.DashboardOverviewData
	if err := json.Unmarshal(data, &overview); err != nil {
		return nil, fmt.Errorf("unmarshal cached dashboard overview: %w", err)
	}
	return &overview, nil
}
// SetCachedDashboardOverview stores the overview for the given time range
// (empty range defaults to "1h"; non-positive TTL defaults to 10s). A nil
// payload or missing Redis client is a silent no-op.
func (r *OpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *service.DashboardOverviewData, ttl time.Duration) error {
	if data == nil {
		return nil
	}
	if ttl <= 0 {
		ttl = 10 * time.Second
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if r == nil || r.rdb == nil {
		return nil
	}
	rangeKey := strings.TrimSpace(timeRange)
	if rangeKey == "" {
		rangeKey = "1h"
	}
	payload, err := json.Marshal(data)
	if err != nil {
		return fmt.Errorf("marshal cached dashboard overview: %w", err)
	}
	key := opsDashboardOverviewKeyPrefix + rangeKey
	return r.rdb.Set(ctx, key, payload, ttl).Err()
}
// PingRedis verifies Redis connectivity, returning an error when the client
// is absent or the PING round-trip fails.
func (r *OpsRepository) PingRedis(ctx context.Context) error {
	if r == nil || r.rdb == nil {
		return errors.New("redis client is nil")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	return r.rdb.Ping(ctx).Err()
}
package repository
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"math"
"strings"
"time"
dbent "github.com/Wei-Shaw/sub2api/ent"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/redis/go-redis/v9"
)
// Query guardrails shared across ops repository methods.
const (
	DefaultWindowMinutes            = 1
	MaxErrorLogsLimit               = 500
	DefaultErrorLogsLimit           = 200
	MaxRecentSystemMetricsLimit     = 500
	DefaultRecentSystemMetricsLimit = 60
	MaxMetricsLimit                 = 5000
	DefaultMetricsLimit             = 300
)

// OpsRepository persists and queries ops telemetry. Raw SQL goes through the
// sqlExecutor; the optional Redis client caches hot dashboard data.
type OpsRepository struct {
	sql sqlExecutor
	rdb *redis.Client
}

// NewOpsRepository constructs the repository. The ent client parameter is
// currently unused but kept for constructor-signature compatibility.
func NewOpsRepository(_ *dbent.Client, sqlDB *sql.DB, rdb *redis.Client) service.OpsRepository {
	return &OpsRepository{sql: sqlDB, rdb: rdb}
}
// CreateErrorLog inserts an ops error log row and writes the generated ID
// and timestamp back into log. A nil log is a silent no-op. A zero CreatedAt
// is replaced with the current time before insert.
func (r *OpsRepository) CreateErrorLog(ctx context.Context, log *service.OpsErrorLog) error {
	if log == nil {
		return nil
	}
	createdAt := log.CreatedAt
	if createdAt.IsZero() {
		createdAt = time.Now()
	}
	query := `
INSERT INTO ops_error_logs (
	request_id,
	user_id,
	api_key_id,
	account_id,
	group_id,
	client_ip,
	error_phase,
	error_type,
	severity,
	status_code,
	platform,
	model,
	request_path,
	stream,
	error_message,
	duration_ms,
	created_at
) VALUES (
	$1, $2, $3, $4, $5,
	$6, $7, $8, $9, $10,
	$11, $12, $13, $14, $15,
	$16, $17
)
RETURNING id, created_at
`
	// Empty strings / nil pointers are stored as SQL NULLs via the null* helpers.
	requestID := nullString(log.RequestID)
	clientIP := nullString(log.ClientIP)
	platform := nullString(log.Platform)
	model := nullString(log.Model)
	requestPath := nullString(log.RequestPath)
	message := nullString(log.Message)
	latency := nullInt(log.LatencyMs)
	args := []any{
		requestID,
		nullInt64(log.UserID),
		nullInt64(log.APIKeyID),
		nullInt64(log.AccountID),
		nullInt64(log.GroupID),
		clientIP,
		log.Phase,
		log.Type,
		log.Severity,
		log.StatusCode,
		platform,
		model,
		requestPath,
		log.Stream,
		message,
		latency,
		createdAt,
	}
	// RETURNING populates the caller's struct with the DB-assigned values.
	if err := scanSingleRow(ctx, r.sql, query, args, &log.ID, &log.CreatedAt); err != nil {
		return err
	}
	return nil
}
// ListErrorLogsLegacy returns error log rows matching the optional filters,
// newest first, capped at filters.Limit (clamped to DefaultErrorLogsLimit
// when out of the 1..MaxErrorLogsLimit range).
//
// Filters are ANDed together; the free-text Query matches request_id, model,
// error_message and error_type case-insensitively.
func (r *OpsRepository) ListErrorLogsLegacy(ctx context.Context, filters service.OpsErrorLogFilters) ([]service.OpsErrorLog, error) {
	conditions := make([]string, 0)
	args := make([]any, 0)
	// addCondition keeps the condition list and the args slice in lock-step so
	// that $N placeholder numbers (computed from len(args)+1) stay correct.
	addCondition := func(condition string, values ...any) {
		conditions = append(conditions, condition)
		args = append(args, values...)
	}
	if filters.StartTime != nil {
		addCondition(fmt.Sprintf("created_at >= $%d", len(args)+1), *filters.StartTime)
	}
	if filters.EndTime != nil {
		addCondition(fmt.Sprintf("created_at <= $%d", len(args)+1), *filters.EndTime)
	}
	if filters.Platform != "" {
		addCondition(fmt.Sprintf("platform = $%d", len(args)+1), filters.Platform)
	}
	if filters.Phase != "" {
		addCondition(fmt.Sprintf("error_phase = $%d", len(args)+1), filters.Phase)
	}
	if filters.Severity != "" {
		addCondition(fmt.Sprintf("severity = $%d", len(args)+1), filters.Severity)
	}
	if filters.Query != "" {
		like := "%" + strings.ToLower(filters.Query) + "%"
		startIdx := len(args) + 1
		addCondition(
			fmt.Sprintf("(LOWER(request_id) LIKE $%d OR LOWER(model) LIKE $%d OR LOWER(error_message) LIKE $%d OR LOWER(error_type) LIKE $%d)",
				startIdx, startIdx+1, startIdx+2, startIdx+3,
			),
			like, like, like, like,
		)
	}
	limit := filters.Limit
	if limit <= 0 || limit > MaxErrorLogsLimit {
		limit = DefaultErrorLogsLimit
	}
	where := ""
	if len(conditions) > 0 {
		where = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Column order below must match the scan order in scanOpsErrorLog.
	query := fmt.Sprintf(`
		SELECT
			id,
			created_at,
			user_id,
			api_key_id,
			account_id,
			group_id,
			client_ip,
			error_phase,
			error_type,
			severity,
			status_code,
			platform,
			model,
			request_path,
			stream,
			duration_ms,
			request_id,
			error_message
		FROM ops_error_logs
		%s
		ORDER BY created_at DESC
		LIMIT $%d
	`, where, len(args)+1)
	args = append(args, limit)
	rows, err := r.sql.QueryContext(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	results := make([]service.OpsErrorLog, 0)
	for rows.Next() {
		logEntry, err := scanOpsErrorLog(rows)
		if err != nil {
			return nil, err
		}
		results = append(results, *logEntry)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// GetLatestSystemMetric returns the single most recent system metrics row for
// the DefaultWindowMinutes aggregation window.
//
// NULL columns are left at the zero value in the returned struct. Errors from
// scanSingleRow (including sql.ErrNoRows) are passed through to the caller.
func (r *OpsRepository) GetLatestSystemMetric(ctx context.Context) (*service.OpsMetrics, error) {
	// NOTE: the scan destinations below must follow this column order exactly;
	// in particular memory_used_mb / memory_total_mb precede
	// memory_usage_percent.
	query := `
		SELECT
			window_minutes,
			request_count,
			success_count,
			error_count,
			success_rate,
			error_rate,
			p95_latency_ms,
			p99_latency_ms,
			http2_errors,
			active_alerts,
			cpu_usage_percent,
			memory_used_mb,
			memory_total_mb,
			memory_usage_percent,
			heap_alloc_mb,
			gc_pause_ms,
			concurrency_queue_depth,
			created_at AS updated_at
		FROM ops_system_metrics
		WHERE window_minutes = $1
		ORDER BY updated_at DESC, id DESC
		LIMIT 1
	`
	var windowMinutes sql.NullInt64
	var requestCount, successCount, errorCount sql.NullInt64
	var successRate, errorRate sql.NullFloat64
	var p95Latency, p99Latency, http2Errors, activeAlerts sql.NullInt64
	var cpuUsage, memoryUsage, gcPause sql.NullFloat64
	var memoryUsed, memoryTotal, heapAlloc, queueDepth sql.NullInt64
	var createdAt time.Time
	if err := scanSingleRow(
		ctx,
		r.sql,
		query,
		[]any{DefaultWindowMinutes},
		&windowMinutes,
		&requestCount,
		&successCount,
		&errorCount,
		&successRate,
		&errorRate,
		&p95Latency,
		&p99Latency,
		&http2Errors,
		&activeAlerts,
		&cpuUsage,
		&memoryUsed,
		&memoryTotal,
		&memoryUsage,
		&heapAlloc,
		&gcPause,
		&queueDepth,
		&createdAt,
	); err != nil {
		return nil, err
	}
	metric := &service.OpsMetrics{
		UpdatedAt: createdAt,
	}
	// Copy only the columns that were non-NULL.
	if windowMinutes.Valid {
		metric.WindowMinutes = int(windowMinutes.Int64)
	}
	if requestCount.Valid {
		metric.RequestCount = requestCount.Int64
	}
	if successCount.Valid {
		metric.SuccessCount = successCount.Int64
	}
	if errorCount.Valid {
		metric.ErrorCount = errorCount.Int64
	}
	if successRate.Valid {
		metric.SuccessRate = successRate.Float64
	}
	if errorRate.Valid {
		metric.ErrorRate = errorRate.Float64
	}
	if p95Latency.Valid {
		metric.P95LatencyMs = int(p95Latency.Int64)
	}
	if p99Latency.Valid {
		metric.P99LatencyMs = int(p99Latency.Int64)
	}
	if http2Errors.Valid {
		metric.HTTP2Errors = int(http2Errors.Int64)
	}
	if activeAlerts.Valid {
		metric.ActiveAlerts = int(activeAlerts.Int64)
	}
	if cpuUsage.Valid {
		metric.CPUUsagePercent = cpuUsage.Float64
	}
	if memoryUsed.Valid {
		metric.MemoryUsedMB = memoryUsed.Int64
	}
	if memoryTotal.Valid {
		metric.MemoryTotalMB = memoryTotal.Int64
	}
	if memoryUsage.Valid {
		metric.MemoryUsagePercent = memoryUsage.Float64
	}
	if heapAlloc.Valid {
		metric.HeapAllocMB = heapAlloc.Int64
	}
	if gcPause.Valid {
		metric.GCPauseMs = gcPause.Float64
	}
	if queueDepth.Valid {
		metric.ConcurrencyQueueDepth = int(queueDepth.Int64)
	}
	return metric, nil
}
// CreateSystemMetric inserts one aggregated system metrics row.
//
// A nil metric is a silent no-op. A zero metric.UpdatedAt defaults to
// time.Now(); a non-positive WindowMinutes defaults to DefaultWindowMinutes.
func (r *OpsRepository) CreateSystemMetric(ctx context.Context, metric *service.OpsMetrics) error {
	if metric == nil {
		return nil
	}
	createdAt := metric.UpdatedAt
	if createdAt.IsZero() {
		createdAt = time.Now()
	}
	windowMinutes := metric.WindowMinutes
	if windowMinutes <= 0 {
		windowMinutes = DefaultWindowMinutes
	}
	query := `
		INSERT INTO ops_system_metrics (
			window_minutes,
			request_count,
			success_count,
			error_count,
			success_rate,
			error_rate,
			p95_latency_ms,
			p99_latency_ms,
			http2_errors,
			active_alerts,
			cpu_usage_percent,
			memory_used_mb,
			memory_total_mb,
			memory_usage_percent,
			heap_alloc_mb,
			gc_pause_ms,
			concurrency_queue_depth,
			created_at
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18
		)
	`
	// NOTE: argument order must match the column list / $1..$18 above.
	_, err := r.sql.ExecContext(ctx, query,
		windowMinutes,
		metric.RequestCount,
		metric.SuccessCount,
		metric.ErrorCount,
		metric.SuccessRate,
		metric.ErrorRate,
		metric.P95LatencyMs,
		metric.P99LatencyMs,
		metric.HTTP2Errors,
		metric.ActiveAlerts,
		metric.CPUUsagePercent,
		metric.MemoryUsedMB,
		metric.MemoryTotalMB,
		metric.MemoryUsagePercent,
		metric.HeapAllocMB,
		metric.GCPauseMs,
		metric.ConcurrencyQueueDepth,
		createdAt,
	)
	return err
}
// ListRecentSystemMetrics returns up to limit of the most recent metric rows
// for the given aggregation window, newest first. A non-positive window falls
// back to DefaultWindowMinutes; a limit outside 1..MaxRecentSystemMetricsLimit
// falls back to DefaultRecentSystemMetricsLimit.
func (r *OpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]service.OpsMetrics, error) {
	if windowMinutes <= 0 {
		windowMinutes = DefaultWindowMinutes
	}
	if limit <= 0 || limit > MaxRecentSystemMetricsLimit {
		limit = DefaultRecentSystemMetricsLimit
	}
	// Column order must match scanOpsSystemMetric's scan order.
	const query = `
		SELECT
			window_minutes,
			request_count,
			success_count,
			error_count,
			success_rate,
			error_rate,
			p95_latency_ms,
			p99_latency_ms,
			http2_errors,
			active_alerts,
			cpu_usage_percent,
			memory_used_mb,
			memory_total_mb,
			memory_usage_percent,
			heap_alloc_mb,
			gc_pause_ms,
			concurrency_queue_depth,
			created_at AS updated_at
		FROM ops_system_metrics
		WHERE window_minutes = $1
		ORDER BY updated_at DESC, id DESC
		LIMIT $2
	`
	rows, err := r.sql.QueryContext(ctx, query, windowMinutes, limit)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	out := make([]service.OpsMetrics, 0)
	for rows.Next() {
		m, scanErr := scanOpsSystemMetric(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		out = append(out, *m)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// ListSystemMetricsRange returns metric rows for the given aggregation window
// whose created_at lies within [startTime, endTime], oldest first, capped at
// limit rows.
//
// Defaults: a non-positive window becomes DefaultWindowMinutes; a limit
// outside 1..MaxMetricsLimit becomes DefaultMetricsLimit; a zero endTime
// means "now"; a zero startTime means endTime minus limit minutes; reversed
// bounds are swapped.
func (r *OpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]service.OpsMetrics, error) {
	if windowMinutes <= 0 {
		windowMinutes = DefaultWindowMinutes
	}
	if limit <= 0 || limit > MaxMetricsLimit {
		limit = DefaultMetricsLimit
	}
	end := endTime
	if end.IsZero() {
		end = time.Now()
	}
	start := startTime
	if start.IsZero() {
		// Default lookback: one row per minute up to the (clamped) limit.
		start = end.Add(-time.Duration(limit) * time.Minute)
	}
	if start.After(end) {
		start, end = end, start
	}
	// Column order must match scanOpsSystemMetric's scan order.
	const query = `
		SELECT
			window_minutes,
			request_count,
			success_count,
			error_count,
			success_rate,
			error_rate,
			p95_latency_ms,
			p99_latency_ms,
			http2_errors,
			active_alerts,
			cpu_usage_percent,
			memory_used_mb,
			memory_total_mb,
			memory_usage_percent,
			heap_alloc_mb,
			gc_pause_ms,
			concurrency_queue_depth,
			created_at
		FROM ops_system_metrics
		WHERE window_minutes = $1
		  AND created_at >= $2
		  AND created_at <= $3
		ORDER BY created_at ASC
		LIMIT $4
	`
	rows, err := r.sql.QueryContext(ctx, query, windowMinutes, start, end, limit)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	out := make([]service.OpsMetrics, 0)
	for rows.Next() {
		m, scanErr := scanOpsSystemMetric(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		out = append(out, *m)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// ListAlertRules returns all alert rules ordered by id.
//
// Nullable text columns default to "" and the three JSON columns
// (dimension_filters, notify_channels, notify_config) are decoded
// best-effort: unmarshal errors are intentionally ignored so one malformed
// row cannot break the whole listing.
func (r *OpsRepository) ListAlertRules(ctx context.Context) ([]service.OpsAlertRule, error) {
	// Column order must match the rows.Scan destinations below.
	query := `
		SELECT
			id,
			name,
			description,
			enabled,
			metric_type,
			operator,
			threshold,
			window_minutes,
			sustained_minutes,
			severity,
			notify_email,
			notify_webhook,
			webhook_url,
			cooldown_minutes,
			dimension_filters,
			notify_channels,
			notify_config,
			created_at,
			updated_at
		FROM ops_alert_rules
		ORDER BY id ASC
	`
	rows, err := r.sql.QueryContext(ctx, query)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	rules := make([]service.OpsAlertRule, 0)
	for rows.Next() {
		var rule service.OpsAlertRule
		var description sql.NullString
		var webhookURL sql.NullString
		var dimensionFilters, notifyChannels, notifyConfig []byte
		if err := rows.Scan(
			&rule.ID,
			&rule.Name,
			&description,
			&rule.Enabled,
			&rule.MetricType,
			&rule.Operator,
			&rule.Threshold,
			&rule.WindowMinutes,
			&rule.SustainedMinutes,
			&rule.Severity,
			&rule.NotifyEmail,
			&rule.NotifyWebhook,
			&webhookURL,
			&rule.CooldownMinutes,
			&dimensionFilters,
			&notifyChannels,
			&notifyConfig,
			&rule.CreatedAt,
			&rule.UpdatedAt,
		); err != nil {
			return nil, err
		}
		if description.Valid {
			rule.Description = description.String
		}
		if webhookURL.Valid {
			rule.WebhookURL = webhookURL.String
		}
		// Best-effort JSON decoding; errors deliberately swallowed (see doc).
		if len(dimensionFilters) > 0 {
			_ = json.Unmarshal(dimensionFilters, &rule.DimensionFilters)
		}
		if len(notifyChannels) > 0 {
			_ = json.Unmarshal(notifyChannels, &rule.NotifyChannels)
		}
		if len(notifyConfig) > 0 {
			_ = json.Unmarshal(notifyConfig, &rule.NotifyConfig)
		}
		rules = append(rules, rule)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return rules, nil
}
// GetActiveAlertEvent returns the most recent still-firing event for the
// rule, or (nil, nil) when none exists.
func (r *OpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
	args := []any{ruleID, service.OpsAlertStatusFiring}
	return r.getAlertEvent(ctx, `WHERE rule_id = $1 AND status = $2`, args)
}
// GetLatestAlertEvent returns the most recent event for the rule regardless
// of status, or (nil, nil) when none exists.
func (r *OpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*service.OpsAlertEvent, error) {
	args := []any{ruleID}
	return r.getAlertEvent(ctx, `WHERE rule_id = $1`, args)
}
// CreateAlertEvent inserts an alert event row.
//
// A nil event is a silent no-op. The event is mutated in place: zero
// FiredAt/CreatedAt default to now (CreatedAt mirrors FiredAt), an empty
// Status defaults to firing, and on success the database-assigned id and
// created_at are written back via the RETURNING clause.
func (r *OpsRepository) CreateAlertEvent(ctx context.Context, event *service.OpsAlertEvent) error {
	if event == nil {
		return nil
	}
	if event.FiredAt.IsZero() {
		event.FiredAt = time.Now()
	}
	if event.CreatedAt.IsZero() {
		event.CreatedAt = event.FiredAt
	}
	if event.Status == "" {
		event.Status = service.OpsAlertStatusFiring
	}
	query := `
		INSERT INTO ops_alert_events (
			rule_id,
			severity,
			status,
			title,
			description,
			metric_value,
			threshold_value,
			fired_at,
			resolved_at,
			email_sent,
			webhook_sent,
			created_at
		) VALUES (
			$1, $2, $3, $4, $5, $6,
			$7, $8, $9, $10, $11, $12
		)
		RETURNING id, created_at
	`
	// resolved_at is NULL unless an explicit resolution time was provided.
	var resolvedAt sql.NullTime
	if event.ResolvedAt != nil {
		resolvedAt = sql.NullTime{Time: *event.ResolvedAt, Valid: true}
	}
	// NOTE: argument order must match the column list / $1..$12 above.
	if err := scanSingleRow(
		ctx,
		r.sql,
		query,
		[]any{
			event.RuleID,
			event.Severity,
			event.Status,
			event.Title,
			event.Description,
			event.MetricValue,
			event.ThresholdValue,
			event.FiredAt,
			resolvedAt,
			event.EmailSent,
			event.WebhookSent,
			event.CreatedAt,
		},
		&event.ID,
		&event.CreatedAt,
	); err != nil {
		return err
	}
	return nil
}
// UpdateAlertEventStatus sets the status of an alert event and, when
// resolvedAt is non-nil, records the resolution time (otherwise resolved_at
// is set to NULL).
func (r *OpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	var resolved sql.NullTime
	if resolvedAt != nil {
		resolved.Time = *resolvedAt
		resolved.Valid = true
	}
	const stmt = `
		UPDATE ops_alert_events
		SET status = $2, resolved_at = $3
		WHERE id = $1
	`
	_, err := r.sql.ExecContext(ctx, stmt, eventID, status, resolved)
	return err
}
// UpdateAlertEventNotifications records which notification channels (email,
// webhook) have been delivered for an alert event.
func (r *OpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error {
	const stmt = `
		UPDATE ops_alert_events
		SET email_sent = $2, webhook_sent = $3
		WHERE id = $1
	`
	_, err := r.sql.ExecContext(ctx, stmt, eventID, emailSent, webhookSent)
	return err
}
// CountActiveAlerts reports how many alert events are currently in the
// firing state. A missing row (sql.ErrNoRows) is treated as zero rather
// than an error.
func (r *OpsRepository) CountActiveAlerts(ctx context.Context) (int, error) {
	var total int64
	err := scanSingleRow(
		ctx,
		r.sql,
		`SELECT COUNT(*) FROM ops_alert_events WHERE status = $1`,
		[]any{service.OpsAlertStatusFiring},
		&total,
	)
	if err == nil {
		return int(total), nil
	}
	if errors.Is(err, sql.ErrNoRows) {
		return 0, nil
	}
	return 0, err
}
// GetWindowStats aggregates one metrics window over [startTime, endTime):
// successes and latency percentiles come from usage_logs, while error and
// HTTP/2-error counts come from ops_error_logs (an error is classed as HTTP/2
// when error_type is network_error or the message mentions http2/http/2).
//
// Percentiles are rounded to whole milliseconds; NULL percentiles (no rows
// with duration_ms) leave the fields at zero.
func (r *OpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*service.OpsWindowStats, error) {
	query := `
		WITH
		usage_agg AS (
			SELECT
				COUNT(*) AS success_count,
				percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms)
					FILTER (WHERE duration_ms IS NOT NULL) AS p95,
				percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms)
					FILTER (WHERE duration_ms IS NOT NULL) AS p99
			FROM usage_logs
			WHERE created_at >= $1 AND created_at < $2
		),
		error_agg AS (
			SELECT
				COUNT(*) AS error_count,
				COUNT(*) FILTER (
					WHERE
						error_type = 'network_error'
						OR error_message ILIKE '%http2%'
						OR error_message ILIKE '%http/2%'
				) AS http2_errors
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
		)
		SELECT
			usage_agg.success_count,
			error_agg.error_count,
			usage_agg.p95,
			usage_agg.p99,
			error_agg.http2_errors
		FROM usage_agg
		CROSS JOIN error_agg
	`
	var stats service.OpsWindowStats
	var p95Latency, p99Latency sql.NullFloat64
	var http2Errors int64
	if err := scanSingleRow(
		ctx,
		r.sql,
		query,
		[]any{startTime, endTime},
		&stats.SuccessCount,
		&stats.ErrorCount,
		&p95Latency,
		&p99Latency,
		&http2Errors,
	); err != nil {
		return nil, err
	}
	stats.HTTP2Errors = int(http2Errors)
	if p95Latency.Valid {
		stats.P95LatencyMs = int(math.Round(p95Latency.Float64))
	}
	if p99Latency.Valid {
		stats.P99LatencyMs = int(math.Round(p99Latency.Float64))
	}
	return &stats, nil
}
// GetOverviewStats builds the dashboard overview for [startTime, endTime):
//   - usage_stats: totals and latency percentiles from usage_logs
//     (request_count in the final SELECT is usage rows PLUS error rows);
//   - error_stats: 4xx/5xx/timeout breakdown from ops_error_logs;
//   - top_error: the single most frequent (status_code, error_message) pair;
//   - latest_metrics: newest ops_system_metrics row, any window.
//
// All NULLs are COALESCEd to zero/"" in SQL except the percentile columns,
// which are handled with sql.NullFloat64 below.
func (r *OpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*service.OverviewStats, error) {
	query := `
		WITH
		usage_stats AS (
			SELECT
				COUNT(*) AS request_count,
				COUNT(*) FILTER (WHERE duration_ms IS NOT NULL) AS success_count,
				percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p50,
				percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p95,
				percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p99,
				percentile_cont(0.999) WITHIN GROUP (ORDER BY duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS p999,
				AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS avg_latency,
				MAX(duration_ms) FILTER (WHERE duration_ms IS NOT NULL) AS max_latency
			FROM usage_logs
			WHERE created_at >= $1 AND created_at < $2
		),
		error_stats AS (
			SELECT
				COUNT(*) AS error_count,
				COUNT(*) FILTER (WHERE status_code >= 400 AND status_code < 500) AS error_4xx,
				COUNT(*) FILTER (WHERE status_code >= 500) AS error_5xx,
				COUNT(*) FILTER (
					WHERE
						error_type IN ('timeout', 'timeout_error')
						OR error_message ILIKE '%timeout%'
						OR error_message ILIKE '%deadline exceeded%'
				) AS timeout_count
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
		),
		top_error AS (
			SELECT
				COALESCE(status_code::text, 'unknown') AS error_code,
				error_message,
				COUNT(*) AS error_count
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
			GROUP BY status_code, error_message
			ORDER BY error_count DESC
			LIMIT 1
		),
		latest_metrics AS (
			SELECT
				cpu_usage_percent,
				memory_usage_percent,
				memory_used_mb,
				memory_total_mb,
				concurrency_queue_depth
			FROM ops_system_metrics
			ORDER BY created_at DESC
			LIMIT 1
		)
		SELECT
			COALESCE(usage_stats.request_count, 0) + COALESCE(error_stats.error_count, 0) AS request_count,
			COALESCE(usage_stats.success_count, 0),
			COALESCE(error_stats.error_count, 0),
			COALESCE(error_stats.error_4xx, 0),
			COALESCE(error_stats.error_5xx, 0),
			COALESCE(error_stats.timeout_count, 0),
			COALESCE(usage_stats.p50, 0),
			COALESCE(usage_stats.p95, 0),
			COALESCE(usage_stats.p99, 0),
			COALESCE(usage_stats.p999, 0),
			COALESCE(usage_stats.avg_latency, 0),
			COALESCE(usage_stats.max_latency, 0),
			COALESCE(top_error.error_code, ''),
			COALESCE(top_error.error_message, ''),
			COALESCE(top_error.error_count, 0),
			COALESCE(latest_metrics.cpu_usage_percent, 0),
			COALESCE(latest_metrics.memory_usage_percent, 0),
			COALESCE(latest_metrics.memory_used_mb, 0),
			COALESCE(latest_metrics.memory_total_mb, 0),
			COALESCE(latest_metrics.concurrency_queue_depth, 0)
		FROM usage_stats
		CROSS JOIN error_stats
		LEFT JOIN top_error ON true
		LEFT JOIN latest_metrics ON true
	`
	var stats service.OverviewStats
	var p50, p95, p99, p999, avgLatency, maxLatency sql.NullFloat64
	// NOTE: scan order must match the final SELECT column order exactly.
	err := scanSingleRow(
		ctx,
		r.sql,
		query,
		[]any{startTime, endTime},
		&stats.RequestCount,
		&stats.SuccessCount,
		&stats.ErrorCount,
		&stats.Error4xxCount,
		&stats.Error5xxCount,
		&stats.TimeoutCount,
		&p50,
		&p95,
		&p99,
		&p999,
		&avgLatency,
		&maxLatency,
		&stats.TopErrorCode,
		&stats.TopErrorMsg,
		&stats.TopErrorCount,
		&stats.CPUUsage,
		&stats.MemoryUsage,
		&stats.MemoryUsedMB,
		&stats.MemoryTotalMB,
		&stats.ConcurrencyQueueDepth,
	)
	if err != nil {
		return nil, err
	}
	// Percentiles/averages are truncated (not rounded) to whole milliseconds.
	if p50.Valid {
		stats.LatencyP50 = int(p50.Float64)
	}
	if p95.Valid {
		stats.LatencyP95 = int(p95.Float64)
	}
	if p99.Valid {
		stats.LatencyP99 = int(p99.Float64)
	}
	if p999.Valid {
		stats.LatencyP999 = int(p999.Float64)
	}
	if avgLatency.Valid {
		stats.LatencyAvg = int(avgLatency.Float64)
	}
	if maxLatency.Valid {
		stats.LatencyMax = int(maxLatency.Float64)
	}
	return &stats, nil
}
// GetProviderStats aggregates per-platform traffic over [startTime, endTime):
// usage_logs rows count as successes and ops_error_logs rows as errors, with
// the platform resolved from the row itself or its group/account. Rows whose
// platform resolves to "" are excluded. Zero bounds short-circuit to
// (nil, nil); reversed bounds are swapped.
func (r *OpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*service.ProviderStats, error) {
	if startTime.IsZero() || endTime.IsZero() {
		return nil, nil
	}
	if startTime.After(endTime) {
		startTime, endTime = endTime, startTime
	}
	query := `
		WITH combined AS (
			SELECT
				COALESCE(g.platform, a.platform, '') AS platform,
				u.duration_ms AS duration_ms,
				1 AS is_success,
				0 AS is_error,
				NULL::INT AS status_code,
				NULL::TEXT AS error_type,
				NULL::TEXT AS error_message
			FROM usage_logs u
			LEFT JOIN groups g ON g.id = u.group_id
			LEFT JOIN accounts a ON a.id = u.account_id
			WHERE u.created_at >= $1 AND u.created_at < $2
			UNION ALL
			SELECT
				COALESCE(NULLIF(o.platform, ''), g.platform, a.platform, '') AS platform,
				o.duration_ms AS duration_ms,
				0 AS is_success,
				1 AS is_error,
				o.status_code AS status_code,
				o.error_type AS error_type,
				o.error_message AS error_message
			FROM ops_error_logs o
			LEFT JOIN groups g ON g.id = o.group_id
			LEFT JOIN accounts a ON a.id = o.account_id
			WHERE o.created_at >= $1 AND o.created_at < $2
		)
		SELECT
			platform,
			COUNT(*) AS request_count,
			COALESCE(SUM(is_success), 0) AS success_count,
			COALESCE(SUM(is_error), 0) AS error_count,
			COALESCE(AVG(duration_ms) FILTER (WHERE duration_ms IS NOT NULL), 0) AS avg_latency_ms,
			percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms)
				FILTER (WHERE duration_ms IS NOT NULL) AS p99_latency_ms,
			COUNT(*) FILTER (WHERE is_error = 1 AND status_code >= 400 AND status_code < 500) AS error_4xx,
			COUNT(*) FILTER (WHERE is_error = 1 AND status_code >= 500 AND status_code < 600) AS error_5xx,
			COUNT(*) FILTER (
				WHERE
					is_error = 1
					AND (
						status_code = 504
						OR error_type ILIKE '%timeout%'
						OR error_message ILIKE '%timeout%'
					)
			) AS timeout_count
		FROM combined
		WHERE platform <> ''
		GROUP BY platform
		ORDER BY request_count DESC, platform ASC
	`
	rows, err := r.sql.QueryContext(ctx, query, startTime, endTime)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	results := make([]*service.ProviderStats, 0)
	for rows.Next() {
		var item service.ProviderStats
		var avgLatency sql.NullFloat64
		var p99Latency sql.NullFloat64
		if err := rows.Scan(
			&item.Platform,
			&item.RequestCount,
			&item.SuccessCount,
			&item.ErrorCount,
			&avgLatency,
			&p99Latency,
			&item.Error4xxCount,
			&item.Error5xxCount,
			&item.TimeoutCount,
		); err != nil {
			return nil, err
		}
		// Latency values are rounded to whole milliseconds.
		if avgLatency.Valid {
			item.AvgLatencyMs = int(math.Round(avgLatency.Float64))
		}
		if p99Latency.Valid {
			item.P99LatencyMs = int(math.Round(p99Latency.Float64))
		}
		results = append(results, &item)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// GetLatencyHistogram buckets usage_logs durations within [startTime,
// endTime) into five fixed latency ranges and reports each bucket's count
// plus its percentage of the total (rounded to 2 decimal places).
//
// Fix: the original discarded rows.Err(), so an error during result
// iteration (e.g. a connection dropped mid-stream) could silently return a
// partial histogram. Every other list method in this file checks it.
func (r *OpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*service.LatencyHistogramItem, error) {
	query := `
		WITH buckets AS (
			SELECT
				CASE
					WHEN duration_ms < 200 THEN '<200ms'
					WHEN duration_ms < 500 THEN '200-500ms'
					WHEN duration_ms < 1000 THEN '500-1000ms'
					WHEN duration_ms < 3000 THEN '1000-3000ms'
					ELSE '>3000ms'
				END AS range_name,
				CASE
					WHEN duration_ms < 200 THEN 1
					WHEN duration_ms < 500 THEN 2
					WHEN duration_ms < 1000 THEN 3
					WHEN duration_ms < 3000 THEN 4
					ELSE 5
				END AS range_order,
				COUNT(*) AS count
			FROM usage_logs
			WHERE created_at >= $1 AND created_at < $2 AND duration_ms IS NOT NULL
			GROUP BY 1, 2
		),
		total AS (
			SELECT SUM(count) AS total_count FROM buckets
		)
		SELECT
			b.range_name,
			b.count,
			ROUND((b.count::numeric / t.total_count) * 100, 2) AS percentage
		FROM buckets b
		CROSS JOIN total t
		ORDER BY b.range_order ASC
	`
	rows, err := r.sql.QueryContext(ctx, query, startTime, endTime)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	results := make([]*service.LatencyHistogramItem, 0)
	for rows.Next() {
		var item service.LatencyHistogramItem
		if err := rows.Scan(&item.Range, &item.Count, &item.Percentage); err != nil {
			return nil, err
		}
		results = append(results, &item)
	}
	// Surface any error encountered during iteration instead of returning
	// a silently truncated result set.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// GetErrorDistribution groups ops_error_logs within [startTime, endTime) by
// (status code, message) and returns the top 20 pairs with count and
// percentage of all errors in the window (rounded to 2 decimal places).
//
// Fix: the original discarded rows.Err(), so an error during result
// iteration could silently return a partial distribution. Every other list
// method in this file checks it.
func (r *OpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*service.ErrorDistributionItem, error) {
	query := `
		WITH errors AS (
			SELECT
				COALESCE(status_code::text, 'unknown') AS code,
				COALESCE(error_message, 'Unknown error') AS message,
				COUNT(*) AS count
			FROM ops_error_logs
			WHERE created_at >= $1 AND created_at < $2
			GROUP BY 1, 2
		),
		total AS (
			SELECT SUM(count) AS total_count FROM errors
		)
		SELECT
			e.code,
			e.message,
			e.count,
			ROUND((e.count::numeric / t.total_count) * 100, 2) AS percentage
		FROM errors e
		CROSS JOIN total t
		ORDER BY e.count DESC
		LIMIT 20
	`
	rows, err := r.sql.QueryContext(ctx, query, startTime, endTime)
	if err != nil {
		return nil, err
	}
	defer func() { _ = rows.Close() }()
	results := make([]*service.ErrorDistributionItem, 0)
	for rows.Next() {
		var item service.ErrorDistributionItem
		if err := rows.Scan(&item.Code, &item.Message, &item.Count, &item.Percentage); err != nil {
			return nil, err
		}
		results = append(results, &item)
	}
	// Surface any error encountered during iteration instead of returning
	// a silently truncated result set.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// getAlertEvent fetches the single most recent alert event matching
// whereClause (which must reference $1.. placeholders bound by args).
//
// whereClause is interpolated with fmt.Sprintf, so it must only ever come
// from trusted, constant strings within this package — never user input.
// Returns (nil, nil) when no matching row exists.
func (r *OpsRepository) getAlertEvent(ctx context.Context, whereClause string, args []any) (*service.OpsAlertEvent, error) {
	query := fmt.Sprintf(`
		SELECT
			id,
			rule_id,
			severity,
			status,
			title,
			description,
			metric_value,
			threshold_value,
			fired_at,
			resolved_at,
			email_sent,
			webhook_sent,
			created_at
		FROM ops_alert_events
		%s
		ORDER BY fired_at DESC
		LIMIT 1
	`, whereClause)
	var event service.OpsAlertEvent
	var resolvedAt sql.NullTime
	var metricValue sql.NullFloat64
	var thresholdValue sql.NullFloat64
	if err := scanSingleRow(
		ctx,
		r.sql,
		query,
		args,
		&event.ID,
		&event.RuleID,
		&event.Severity,
		&event.Status,
		&event.Title,
		&event.Description,
		&metricValue,
		&thresholdValue,
		&event.FiredAt,
		&resolvedAt,
		&event.EmailSent,
		&event.WebhookSent,
		&event.CreatedAt,
	); err != nil {
		// "No rows" is not an error for this lookup; callers check for nil.
		if errors.Is(err, sql.ErrNoRows) {
			return nil, nil
		}
		return nil, err
	}
	if metricValue.Valid {
		event.MetricValue = metricValue.Float64
	}
	if thresholdValue.Valid {
		event.ThresholdValue = thresholdValue.Float64
	}
	if resolvedAt.Valid {
		event.ResolvedAt = &resolvedAt.Time
	}
	return &event, nil
}
// scanOpsSystemMetric decodes one row of the 18-column system-metrics SELECT
// (used by ListRecentSystemMetrics and ListSystemMetricsRange) into an
// OpsMetrics value. NULL columns leave the corresponding field at zero.
//
// NOTE: the Scan destination order below must stay in sync with the column
// order of every query that feeds this helper.
func scanOpsSystemMetric(rows *sql.Rows) (*service.OpsMetrics, error) {
	var metric service.OpsMetrics
	var windowMinutes sql.NullInt64
	var requestCount, successCount, errorCount sql.NullInt64
	var successRate, errorRate sql.NullFloat64
	var p95Latency, p99Latency, http2Errors, activeAlerts sql.NullInt64
	var cpuUsage, memoryUsage, gcPause sql.NullFloat64
	var memoryUsed, memoryTotal, heapAlloc, queueDepth sql.NullInt64
	if err := rows.Scan(
		&windowMinutes,
		&requestCount,
		&successCount,
		&errorCount,
		&successRate,
		&errorRate,
		&p95Latency,
		&p99Latency,
		&http2Errors,
		&activeAlerts,
		&cpuUsage,
		&memoryUsed,
		&memoryTotal,
		&memoryUsage,
		&heapAlloc,
		&gcPause,
		&queueDepth,
		&metric.UpdatedAt,
	); err != nil {
		return nil, err
	}
	// Copy only the columns that were non-NULL.
	if windowMinutes.Valid {
		metric.WindowMinutes = int(windowMinutes.Int64)
	}
	if requestCount.Valid {
		metric.RequestCount = requestCount.Int64
	}
	if successCount.Valid {
		metric.SuccessCount = successCount.Int64
	}
	if errorCount.Valid {
		metric.ErrorCount = errorCount.Int64
	}
	if successRate.Valid {
		metric.SuccessRate = successRate.Float64
	}
	if errorRate.Valid {
		metric.ErrorRate = errorRate.Float64
	}
	if p95Latency.Valid {
		metric.P95LatencyMs = int(p95Latency.Int64)
	}
	if p99Latency.Valid {
		metric.P99LatencyMs = int(p99Latency.Int64)
	}
	if http2Errors.Valid {
		metric.HTTP2Errors = int(http2Errors.Int64)
	}
	if activeAlerts.Valid {
		metric.ActiveAlerts = int(activeAlerts.Int64)
	}
	if cpuUsage.Valid {
		metric.CPUUsagePercent = cpuUsage.Float64
	}
	if memoryUsed.Valid {
		metric.MemoryUsedMB = memoryUsed.Int64
	}
	if memoryTotal.Valid {
		metric.MemoryTotalMB = memoryTotal.Int64
	}
	if memoryUsage.Valid {
		metric.MemoryUsagePercent = memoryUsage.Float64
	}
	if heapAlloc.Valid {
		metric.HeapAllocMB = heapAlloc.Int64
	}
	if gcPause.Valid {
		metric.GCPauseMs = gcPause.Float64
	}
	if queueDepth.Valid {
		metric.ConcurrencyQueueDepth = int(queueDepth.Int64)
	}
	return &metric, nil
}
// scanOpsErrorLog decodes one row of the 18-column error-log SELECT (used by
// ListErrorLogsLegacy) into an OpsErrorLog. NULL columns leave the
// corresponding field at its zero value; nullable numeric ids and the
// latency are surfaced as pointers only when present.
//
// NOTE: the Scan destination order below must stay in sync with the column
// order of the query that feeds this helper.
func scanOpsErrorLog(rows *sql.Rows) (*service.OpsErrorLog, error) {
	var entry service.OpsErrorLog
	var userID, apiKeyID, accountID, groupID sql.NullInt64
	var clientIP sql.NullString
	var statusCode sql.NullInt64
	var platform sql.NullString
	var model sql.NullString
	var requestPath sql.NullString
	var stream sql.NullBool
	var latency sql.NullInt64
	var requestID sql.NullString
	var message sql.NullString
	if err := rows.Scan(
		&entry.ID,
		&entry.CreatedAt,
		&userID,
		&apiKeyID,
		&accountID,
		&groupID,
		&clientIP,
		&entry.Phase,
		&entry.Type,
		&entry.Severity,
		&statusCode,
		&platform,
		&model,
		&requestPath,
		&stream,
		&latency,
		&requestID,
		&message,
	); err != nil {
		return nil, err
	}
	// Pointer fields get a fresh local copy so they don't alias loop state.
	if userID.Valid {
		v := userID.Int64
		entry.UserID = &v
	}
	if apiKeyID.Valid {
		v := apiKeyID.Int64
		entry.APIKeyID = &v
	}
	if accountID.Valid {
		v := accountID.Int64
		entry.AccountID = &v
	}
	if groupID.Valid {
		v := groupID.Int64
		entry.GroupID = &v
	}
	if clientIP.Valid {
		entry.ClientIP = clientIP.String
	}
	if statusCode.Valid {
		entry.StatusCode = int(statusCode.Int64)
	}
	if platform.Valid {
		entry.Platform = platform.String
	}
	if model.Valid {
		entry.Model = model.String
	}
	if requestPath.Valid {
		entry.RequestPath = requestPath.String
	}
	if stream.Valid {
		entry.Stream = stream.Bool
	}
	if latency.Valid {
		value := int(latency.Int64)
		entry.LatencyMs = &value
	}
	if requestID.Valid {
		entry.RequestID = requestID.String
	}
	if message.Valid {
		entry.Message = message.String
	}
	return &entry, nil
}
// nullString converts a plain string into sql.NullString, mapping the empty
// string to SQL NULL (Valid=false).
func nullString(value string) sql.NullString {
	var ns sql.NullString
	if value != "" {
		ns.String = value
		ns.Valid = true
	}
	return ns
}
package middleware
import (
"context"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
)
// Sizing and timeout for the asynchronous auth-error logging pipeline.
const (
	opsAuthErrorLogWorkerCount = 10              // concurrent queue consumers
	opsAuthErrorLogQueueSize   = 256             // buffered jobs before new ones are dropped
	opsAuthErrorLogTimeout     = 2 * time.Second // per-entry deadline for RecordError
)
// opsAuthErrorLogJob pairs the OpsService to record against with the error
// entry to persist; workers skip jobs with either field nil.
type opsAuthErrorLogJob struct {
	ops   *service.OpsService
	entry *service.OpsErrorLog
}
var (
	opsAuthErrorLogOnce  sync.Once               // guards one-time worker pool startup
	opsAuthErrorLogQueue chan opsAuthErrorLogJob // created by startOpsAuthErrorLogWorkers
)
// startOpsAuthErrorLogWorkers creates the buffered job queue and launches the
// fixed-size worker pool that drains it. It is intended to run exactly once
// (callers invoke it through opsAuthErrorLogOnce).
func startOpsAuthErrorLogWorkers() {
	opsAuthErrorLogQueue = make(chan opsAuthErrorLogJob, opsAuthErrorLogQueueSize)
	worker := func() {
		for job := range opsAuthErrorLogQueue {
			if job.ops == nil || job.entry == nil {
				continue
			}
			// Best-effort persistence with a bounded deadline; the result is
			// intentionally discarded.
			ctx, cancel := context.WithTimeout(context.Background(), opsAuthErrorLogTimeout)
			_ = job.ops.RecordError(ctx, job.entry)
			cancel()
		}
	}
	for i := 0; i < opsAuthErrorLogWorkerCount; i++ {
		go worker()
	}
}
// enqueueOpsAuthErrorLog hands an auth error entry to the background logging
// pipeline, lazily starting the worker pool on first use. When the queue is
// full the entry is silently dropped so request handling never blocks.
func enqueueOpsAuthErrorLog(ops *service.OpsService, entry *service.OpsErrorLog) {
	if ops == nil || entry == nil {
		return
	}
	opsAuthErrorLogOnce.Do(startOpsAuthErrorLogWorkers)
	job := opsAuthErrorLogJob{ops: ops, entry: entry}
	select {
	case opsAuthErrorLogQueue <- job:
	default:
		// Queue is full; drop the entry rather than block the request path.
	}
}
package service
import (
"context"
"time"
)
// ErrorLog represents an ops error log item for list queries.
//
// Field naming matches docs/API-运维监控中心2.0.md (L3 root-cause tracing —
// error log list). Most fields use omitempty so absent values are elided
// from the JSON payload.
type ErrorLog struct {
	ID           int64     `json:"id"`
	Timestamp    time.Time `json:"timestamp"`
	Level        string    `json:"level,omitempty"`
	RequestID    string    `json:"request_id,omitempty"`
	AccountID    string    `json:"account_id,omitempty"`
	APIPath      string    `json:"api_path,omitempty"`
	Provider     string    `json:"provider,omitempty"`
	Model        string    `json:"model,omitempty"`
	HTTPCode     int       `json:"http_code,omitempty"`
	ErrorMessage string    `json:"error_message,omitempty"`
	DurationMs   *int      `json:"duration_ms,omitempty"`
	RetryCount   *int      `json:"retry_count,omitempty"`
	Stream       bool      `json:"stream,omitempty"`
}
// ErrorLogFilter describes optional filters and pagination for listing ops
// error logs. Nil pointer fields and empty strings mean "no filter"; Page
// and PageSize are normalized via normalize() before use.
type ErrorLogFilter struct {
	StartTime *time.Time
	EndTime   *time.Time
	ErrorCode *int
	Provider  string
	AccountID *int64
	Page      int
	PageSize  int
}
// normalize returns sane pagination values: page defaults to 1 and pageSize
// defaults to 20, with pageSize capped at 100. It is safe on a nil receiver
// (returning the defaults).
func (f *ErrorLogFilter) normalize() (page, pageSize int) {
	page, pageSize = 1, 20
	if f == nil {
		return page, pageSize
	}
	if f.Page > 0 {
		page = f.Page
	}
	switch {
	case f.PageSize > 100:
		pageSize = 100
	case f.PageSize > 0:
		pageSize = f.PageSize
	}
	return page, pageSize
}
// ErrorLogListResponse is the paginated payload returned by GetErrorLogs.
type ErrorLogListResponse struct {
	Errors   []*ErrorLog `json:"errors"` // never nil; empty slice when no rows
	Total    int64       `json:"total"`
	Page     int         `json:"page"`
	PageSize int         `json:"page_size"`
}
// GetErrorLogs returns a paginated slice of ops error logs matching filter.
//
// A nil service or repository yields an empty first page rather than an
// error, and a nil filter is treated as "no filters". Page defaults to 1 and
// PageSize to 20 (capped at 100); the normalized values are written back
// into the filter before it is handed to the repository.
//
// Fix: the original called filter.normalize() before its nil check and only
// worked because normalize tolerates a nil receiver — the guard now runs
// before any use of filter, so the method no longer depends on that quirk.
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *ErrorLogFilter) (*ErrorLogListResponse, error) {
	if s == nil || s.repo == nil {
		return &ErrorLogListResponse{
			Errors:   []*ErrorLog{},
			Total:    0,
			Page:     1,
			PageSize: 20,
		}, nil
	}
	if filter == nil {
		filter = &ErrorLogFilter{}
	}
	page, pageSize := filter.normalize()
	filter.Page = page
	filter.PageSize = pageSize
	items, total, err := s.repo.ListErrorLogs(ctx, filter)
	if err != nil {
		return nil, err
	}
	if items == nil {
		// Keep the JSON "errors" field as [] rather than null.
		items = []*ErrorLog{}
	}
	return &ErrorLogListResponse{
		Errors:   items,
		Total:    total,
		Page:     page,
		PageSize: pageSize,
	}, nil
}
package service
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"log"
"net"
"net/http"
"net/url"
"strconv"
"strings"
"sync"
"time"
)
// OpsAlertService periodically evaluates alert rules against recent system
// metrics and dispatches notifications. Start/Stop manage the lifetime of
// its single background goroutine.
type OpsAlertService struct {
	opsService   *OpsService
	userService  *UserService
	emailService *EmailService
	httpClient   *http.Client // used for webhook delivery; 10s timeout set in the constructor

	interval time.Duration // evaluation period; defaults to opsAlertEvalInterval

	// Lifecycle management for the background loop.
	startOnce sync.Once
	stopOnce  sync.Once
	stopCtx   context.Context
	stop      context.CancelFunc
	wg        sync.WaitGroup
}
// opsAlertEvalInterval defines how often OpsAlertService evaluates alert
// rules.
//
// Production uses opsMetricsInterval. Tests may override this variable to
// keep integration tests fast without changing production defaults; it is
// read both by NewOpsAlertService and as a fallback in StartWithContext.
var opsAlertEvalInterval = opsMetricsInterval
// NewOpsAlertService wires an alert evaluator around the given services.
// The HTTP client used for webhook delivery carries a 10-second timeout,
// and the evaluation interval defaults to opsAlertEvalInterval.
func NewOpsAlertService(opsService *OpsService, userService *UserService, emailService *EmailService) *OpsAlertService {
	svc := &OpsAlertService{
		opsService:   opsService,
		userService:  userService,
		emailService: emailService,
		interval:     opsAlertEvalInterval,
	}
	svc.httpClient = &http.Client{Timeout: 10 * time.Second}
	return svc
}
// Start launches the background alert evaluation loop with a background
// parent context. Repeated calls are no-ops (see StartWithContext).
//
// Stop must be called during shutdown to ensure the goroutine exits.
func (s *OpsAlertService) Start() {
	s.StartWithContext(context.Background())
}
// StartWithContext is like Start but allows the caller to provide a parent context.
// When the parent context is canceled, the service stops automatically.
// startOnce makes repeated calls (including via Start) no-ops.
func (s *OpsAlertService) StartWithContext(ctx context.Context) {
	if s == nil {
		return
	}
	if ctx == nil {
		ctx = context.Background()
	}
	s.startOnce.Do(func() {
		if s.interval <= 0 {
			s.interval = opsAlertEvalInterval
		}
		s.stopCtx, s.stop = context.WithCancel(ctx)
		// wg.Add must precede the goroutine launch so Stop's Wait cannot
		// return before run() has been accounted for.
		s.wg.Add(1)
		go s.run()
	})
}
// Stop gracefully stops the background goroutine started by Start/StartWithContext.
// It is safe to call Stop multiple times. Wait runs outside the Once so every
// caller blocks until the loop has fully exited.
func (s *OpsAlertService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.stop != nil {
			s.stop()
		}
	})
	s.wg.Wait()
}
// run is the background loop: it evaluates once immediately on start, then on
// every tick of s.interval, until the stop context is canceled.
func (s *OpsAlertService) run() {
	defer s.wg.Done()
	ticker := time.NewTicker(s.interval)
	defer ticker.Stop()
	// Initial evaluation so callers don't wait a full interval for results.
	s.evaluateOnce()
	for {
		select {
		case <-ticker.C:
			s.evaluateOnce()
		case <-s.stopCtx.Done():
			return
		}
	}
}
// evaluateOnce runs a single rule-evaluation pass, bounded by
// opsAlertEvaluateTimeout and derived from the stop context so that
// shutdown also cancels any in-flight evaluation.
func (s *OpsAlertService) evaluateOnce() {
	ctx, cancel := context.WithTimeout(s.stopCtx, opsAlertEvaluateTimeout)
	defer cancel()
	s.Evaluate(ctx, time.Now())
}
// Evaluate runs one evaluation pass over all alert rules at time now.
//
// Pass structure:
//  1. Load all rules; bail out on error or when none exist.
//  2. Per window size, compute the largest sustained-minutes any enabled rule
//     needs, then batch-load that many recent metrics once per window.
//  3. Per enabled rule: require a contiguous run of fresh samples, evaluate
//     the threshold over that run, then fire or resolve events accordingly.
//
// Firing respects an optional per-rule cooldown; an already-active event
// suppresses duplicates; a non-breaching pass resolves any active event.
func (s *OpsAlertService) Evaluate(ctx context.Context, now time.Time) {
	if s == nil || s.opsService == nil {
		return
	}
	rules, err := s.opsService.ListAlertRules(ctx)
	if err != nil {
		log.Printf("[OpsAlert] failed to list rules: %v", err)
		return
	}
	if len(rules) == 0 {
		return
	}
	// window -> max sustained minutes needed across enabled rules, so each
	// window's metrics are fetched once with a sufficient limit.
	maxSustainedByWindow := make(map[int]int)
	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}
		window := rule.WindowMinutes
		if window <= 0 {
			window = 1
		}
		sustained := rule.SustainedMinutes
		if sustained <= 0 {
			sustained = 1
		}
		if sustained > maxSustainedByWindow[window] {
			maxSustainedByWindow[window] = sustained
		}
	}
	metricsByWindow := make(map[int][]OpsMetrics)
	for window, limit := range maxSustainedByWindow {
		metrics, err := s.opsService.ListRecentSystemMetrics(ctx, window, limit)
		if err != nil {
			log.Printf("[OpsAlert] failed to load metrics window=%dm: %v", window, err)
			continue
		}
		metricsByWindow[window] = metrics
	}
	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}
		window := rule.WindowMinutes
		if window <= 0 {
			window = 1
		}
		sustained := rule.SustainedMinutes
		if sustained <= 0 {
			sustained = 1
		}
		metrics := metricsByWindow[window]
		// Require an unbroken run of recent samples; gaps or stale data skip
		// the rule rather than risking a false trigger.
		selected, ok := selectContiguousMetrics(metrics, sustained, now)
		if !ok {
			continue
		}
		breached, latestValue, ok := evaluateRule(rule, selected)
		if !ok {
			continue
		}
		activeEvent, err := s.opsService.GetActiveAlertEvent(ctx, rule.ID)
		if err != nil {
			log.Printf("[OpsAlert] failed to get active event (rule=%d): %v", rule.ID, err)
			continue
		}
		if breached {
			// Already firing: do not create a duplicate event.
			if activeEvent != nil {
				continue
			}
			lastEvent, err := s.opsService.GetLatestAlertEvent(ctx, rule.ID)
			if err != nil {
				log.Printf("[OpsAlert] failed to get latest event (rule=%d): %v", rule.ID, err)
				continue
			}
			// Respect the per-rule cooldown measured from the last fired event.
			if lastEvent != nil && rule.CooldownMinutes > 0 {
				cooldown := time.Duration(rule.CooldownMinutes) * time.Minute
				if now.Sub(lastEvent.FiredAt) < cooldown {
					continue
				}
			}
			event := &OpsAlertEvent{
				RuleID: rule.ID,
				Severity: rule.Severity,
				Status: OpsAlertStatusFiring,
				Title: fmt.Sprintf("%s: %s", rule.Severity, rule.Name),
				Description: buildAlertDescription(rule, latestValue),
				MetricValue: latestValue,
				ThresholdValue: rule.Threshold,
				FiredAt: now,
				CreatedAt: now,
			}
			if err := s.opsService.CreateAlertEvent(ctx, event); err != nil {
				log.Printf("[OpsAlert] failed to create event (rule=%d): %v", rule.ID, err)
				continue
			}
			emailSent, webhookSent := s.dispatchNotifications(ctx, rule, event)
			if emailSent || webhookSent {
				if err := s.opsService.UpdateAlertEventNotifications(ctx, event.ID, emailSent, webhookSent); err != nil {
					log.Printf("[OpsAlert] failed to update notification flags (event=%d): %v", event.ID, err)
				}
			}
		} else if activeEvent != nil {
			// Condition cleared while an event is active: mark it resolved.
			resolvedAt := now
			if err := s.opsService.UpdateAlertEventStatus(ctx, activeEvent.ID, OpsAlertStatusResolved, &resolvedAt); err != nil {
				log.Printf("[OpsAlert] failed to resolve event (event=%d): %v", activeEvent.ID, err)
			}
		}
	}
}
// opsMetricsContinuityTolerance is the allowed jitter around opsMetricsInterval
// when checking that consecutive samples are contiguous.
const opsMetricsContinuityTolerance = 20 * time.Second

// selectContiguousMetrics picks the newest N metrics and verifies they are continuous.
//
// This prevents a sustained rule from triggering when metrics sampling has gaps
// (e.g. collector downtime) and avoids evaluating "stale" data.
//
// Assumptions:
// - Metrics are ordered by UpdatedAt DESC (newest first).
// - Metrics are expected to be collected at opsMetricsInterval cadence.
func selectContiguousMetrics(metrics []OpsMetrics, needed int, now time.Time) ([]OpsMetrics, bool) {
	if needed <= 0 {
		return nil, false
	}
	if len(metrics) < needed {
		return nil, false
	}
	newest := metrics[0].UpdatedAt
	if newest.IsZero() {
		return nil, false
	}
	// Reject stale data: the newest sample must be within one interval
	// (plus tolerance) of now.
	if now.Sub(newest) > opsMetricsInterval+opsMetricsContinuityTolerance {
		return nil, false
	}
	selected := metrics[:needed]
	// Each adjacent pair must be roughly one interval apart.
	for i := 0; i < len(selected)-1; i++ {
		a := selected[i].UpdatedAt
		b := selected[i+1].UpdatedAt
		if a.IsZero() || b.IsZero() {
			return nil, false
		}
		gap := a.Sub(b)
		if gap < opsMetricsInterval-opsMetricsContinuityTolerance || gap > opsMetricsInterval+opsMetricsContinuityTolerance {
			return nil, false
		}
	}
	return selected, true
}
// evaluateRule reports whether all supplied metrics sustain the rule's
// condition.
//
// Returns (breached, latestValue, ok):
//   - ok=false when there is no usable data (empty slice, or the newest
//     sample yields no value for the rule's metric type);
//   - breached=true only if every sample satisfies the comparison;
//   - latestValue comes from metrics[0] (the newest sample).
func evaluateRule(rule OpsAlertRule, metrics []OpsMetrics) (bool, float64, bool) {
	if len(metrics) == 0 {
		return false, 0, false
	}
	latestValue, ok := metricValue(metrics[0], rule.MetricType)
	if !ok {
		return false, 0, false
	}
	for _, metric := range metrics {
		value, ok := metricValue(metric, rule.MetricType)
		// A single non-matching (or valueless) sample breaks the sustained run.
		if !ok || !compareMetric(value, rule.Operator, rule.Threshold) {
			return false, latestValue, true
		}
	}
	return true, latestValue, true
}
// metricValue extracts the value for metricType from a metrics sample.
//
// The boolean is false for unknown metric types, and for rate metrics when
// there was no traffic (RequestCount == 0) — callers treat that as "no data"
// rather than a literal zero rate.
func metricValue(metric OpsMetrics, metricType string) (float64, bool) {
	switch metricType {
	case OpsMetricSuccessRate:
		if metric.RequestCount == 0 {
			return 0, false
		}
		return metric.SuccessRate, true
	case OpsMetricErrorRate:
		if metric.RequestCount == 0 {
			return 0, false
		}
		return metric.ErrorRate, true
	case OpsMetricP95LatencyMs:
		return float64(metric.P95LatencyMs), true
	case OpsMetricP99LatencyMs:
		return float64(metric.P99LatencyMs), true
	case OpsMetricHTTP2Errors:
		return float64(metric.HTTP2Errors), true
	case OpsMetricCPUUsagePercent:
		return metric.CPUUsagePercent, true
	case OpsMetricMemoryUsagePercent:
		return metric.MemoryUsagePercent, true
	case OpsMetricQueueDepth:
		return float64(metric.ConcurrencyQueueDepth), true
	default:
		return 0, false
	}
}
// compareMetric applies a rule's comparison operator to a metric value.
// Unrecognized operators never match.
func compareMetric(value float64, operator string, threshold float64) bool {
	switch {
	case operator == ">":
		return value > threshold
	case operator == ">=":
		return value >= threshold
	case operator == "<":
		return value < threshold
	case operator == "<=":
		return value <= threshold
	case operator == "==":
		return value == threshold
	default:
		return false
	}
}
// buildAlertDescription renders the human-readable description stored on an
// alert event. A non-positive window is normalized to 1 minute, matching the
// evaluator's behavior.
func buildAlertDescription(rule OpsAlertRule, value float64) string {
	windowMinutes := rule.WindowMinutes
	if windowMinutes <= 0 {
		windowMinutes = 1
	}
	return fmt.Sprintf(
		"Rule %s triggered: %s %s %.2f (current %.2f) over last %dm",
		rule.Name, rule.MetricType, rule.Operator, rule.Threshold, value, windowMinutes,
	)
}
// dispatchNotifications delivers configured notifications for a newly fired
// event and reports (emailSent, webhookSent).
//
// Both channels share a single notification context bounded by
// opsAlertNotificationTimeout (see notificationContext).
func (s *OpsAlertService) dispatchNotifications(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) (bool, bool) {
	emailSent := false
	webhookSent := false
	notifyCtx, cancel := s.notificationContext(ctx)
	defer cancel()
	if rule.NotifyEmail {
		emailSent = s.sendEmailNotification(notifyCtx, rule, event)
	}
	if rule.NotifyWebhook && rule.WebhookURL != "" {
		webhookSent = s.sendWebhookNotification(notifyCtx, rule, event)
	}
	// Fallback channel: if email is enabled but ultimately fails, try webhook even if the
	// webhook toggle is off (as long as a webhook URL is configured).
	if rule.NotifyEmail && !emailSent && !rule.NotifyWebhook && rule.WebhookURL != "" {
		log.Printf("[OpsAlert] email failed; attempting webhook fallback (rule=%d)", rule.ID)
		webhookSent = s.sendWebhookNotification(notifyCtx, rule, event)
	}
	return emailSent, webhookSent
}
// Timeouts and retry policy for alert evaluation and notification delivery.
const (
	opsAlertEvaluateTimeout = 45 * time.Second
	opsAlertNotificationTimeout = 30 * time.Second
	opsAlertEmailMaxRetries = 3
)

// opsAlertEmailBackoff lists the delays between successive email attempts
// (exponential: 1s, 2s, 4s). Its length matches opsAlertEmailMaxRetries.
var opsAlertEmailBackoff = []time.Duration{
	1 * time.Second,
	2 * time.Second,
	4 * time.Second,
}
// notificationContext derives the context used for sending notifications,
// bounded by opsAlertNotificationTimeout.
//
// It prefers the service's stopCtx as parent so in-flight notifications are
// canceled on Stop. NOTE(review): when stopCtx is set, the caller-supplied
// ctx (and any deadline on it) is intentionally ignored — confirm this is
// the desired precedence.
func (s *OpsAlertService) notificationContext(ctx context.Context) (context.Context, context.CancelFunc) {
	parent := ctx
	if s != nil && s.stopCtx != nil {
		parent = s.stopCtx
	}
	if parent == nil {
		parent = context.Background()
	}
	return context.WithTimeout(parent, opsAlertNotificationTimeout)
}
var opsAlertSleep = sleepWithContext
func sleepWithContext(ctx context.Context, d time.Duration) error {
if d <= 0 {
return nil
}
if ctx == nil {
time.Sleep(d)
return nil
}
timer := time.NewTimer(d)
defer timer.Stop()
select {
case <-ctx.Done():
return ctx.Err()
case <-timer.C:
return nil
}
}
// retryWithBackoff runs fn up to maxRetries+1 times, sleeping between
// attempts according to backoff.
//
// The delay before attempt k (k >= 2) is backoff[k-2]; attempts beyond the
// end of the backoff slice retry without delay. onError (optional) is called
// after each failed attempt with the delay that will precede the next attempt
// (0 when this was the final attempt). Context cancellation aborts
// immediately and returns the context error; on exhaustion the last error
// from fn is returned.
func retryWithBackoff(
	ctx context.Context,
	maxRetries int,
	backoff []time.Duration,
	fn func() error,
	onError func(attempt int, total int, nextDelay time.Duration, err error),
) error {
	if ctx == nil {
		ctx = context.Background()
	}
	if maxRetries < 0 {
		maxRetries = 0
	}
	totalAttempts := maxRetries + 1
	var lastErr error
	for attempt := 1; attempt <= totalAttempts; attempt++ {
		if attempt > 1 {
			// Index of the delay that precedes this attempt.
			backoffIdx := attempt - 2
			if backoffIdx < len(backoff) {
				if err := opsAlertSleep(ctx, backoff[backoffIdx]); err != nil {
					return err
				}
			}
		}
		// Re-check cancellation after (potentially) sleeping.
		if err := ctx.Err(); err != nil {
			return err
		}
		if err := fn(); err != nil {
			lastErr = err
			// Compute the delay that will precede the NEXT attempt, for logging.
			nextDelay := time.Duration(0)
			if attempt < totalAttempts {
				nextIdx := attempt - 1
				if nextIdx < len(backoff) {
					nextDelay = backoff[nextIdx]
				}
			}
			if onError != nil {
				onError(attempt, totalAttempts, nextDelay, err)
			}
			continue
		}
		return nil
	}
	return lastErr
}
// sendEmailNotification emails the first admin about a fired alert, retrying
// per opsAlertEmailMaxRetries/opsAlertEmailBackoff. Returns true only when a
// send attempt succeeded.
//
// It quietly returns false (no email) when dependencies are missing, no admin
// with an email address exists, or the SMTP config cannot be loaded.
func (s *OpsAlertService) sendEmailNotification(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) bool {
	if s.emailService == nil || s.userService == nil {
		return false
	}
	if ctx == nil {
		ctx = context.Background()
	}
	admin, err := s.userService.GetFirstAdmin(ctx)
	if err != nil || admin == nil || admin.Email == "" {
		return false
	}
	subject := fmt.Sprintf("[Ops Alert][%s] %s", rule.Severity, rule.Name)
	body := fmt.Sprintf(
		"Alert triggered: %s\n\nMetric: %s\nThreshold: %.2f\nCurrent: %.2f\nWindow: %dm\nStatus: %s\nTime: %s",
		rule.Name,
		rule.MetricType,
		rule.Threshold,
		event.MetricValue,
		rule.WindowMinutes,
		event.Status,
		event.FiredAt.Format(time.RFC3339),
	)
	// Load SMTP config once and reuse it for every retry attempt.
	config, err := s.emailService.GetSMTPConfig(ctx)
	if err != nil {
		log.Printf("[OpsAlert] email config load failed: %v", err)
		return false
	}
	if err := retryWithBackoff(
		ctx,
		opsAlertEmailMaxRetries,
		opsAlertEmailBackoff,
		func() error {
			return s.emailService.SendEmailWithConfig(config, admin.Email, subject, body)
		},
		func(attempt int, total int, nextDelay time.Duration, err error) {
			if attempt < total {
				log.Printf("[OpsAlert] email send failed (attempt=%d/%d), retrying in %s: %v", attempt, total, nextDelay, err)
				return
			}
			log.Printf("[OpsAlert] email send failed (attempt=%d/%d), giving up: %v", attempt, total, err)
		},
	); err != nil {
		// Cancellation is logged separately; other errors were already logged
		// per-attempt by the onError callback above.
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			log.Printf("[OpsAlert] email send canceled: %v", err)
		}
		return false
	}
	return true
}
// sendWebhookNotification POSTs a JSON alert payload to the rule's webhook.
//
// The URL is (re)validated immediately before sending — HTTPS only, no
// userinfo, and DNS resolution pinned against SSRF targets — and the request
// is bounded by an additional 10s timeout on top of the caller's context.
// Returns true only for a 2xx response.
func (s *OpsAlertService) sendWebhookNotification(ctx context.Context, rule OpsAlertRule, event *OpsAlertEvent) bool {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	webhookTarget, err := validateWebhookURL(ctx, rule.WebhookURL)
	if err != nil {
		log.Printf("[OpsAlert] invalid webhook url (rule=%d): %v", rule.ID, err)
		return false
	}
	payload := map[string]any{
		"rule_id": rule.ID,
		"rule_name": rule.Name,
		"severity": rule.Severity,
		"status": event.Status,
		"metric_type": rule.MetricType,
		"metric_value": event.MetricValue,
		"threshold_value": rule.Threshold,
		"window_minutes": rule.WindowMinutes,
		"fired_at": event.FiredAt.Format(time.RFC3339),
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return false
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, webhookTarget.URL.String(), bytes.NewReader(body))
	if err != nil {
		return false
	}
	req.Header.Set("Content-Type", "application/json")
	// The client's transport dials only the IPs pinned during validation.
	resp, err := buildWebhookHTTPClient(s.httpClient, webhookTarget).Do(req)
	if err != nil {
		log.Printf("[OpsAlert] webhook send failed: %v", err)
		return false
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		log.Printf("[OpsAlert] webhook returned status %d", resp.StatusCode)
		return false
	}
	return true
}
// webhookHTTPClientTimeout is the default timeout applied when the base
// client does not define one.
const webhookHTTPClientTimeout = 10 * time.Second

// buildWebhookHTTPClient shallow-copies base (or creates a fresh client) and
// hardens it for webhook delivery: a default timeout, redirects disabled
// (a redirect could escape the validated target), and — when a validated
// target is supplied — a transport pinned to that target's resolved IPs.
func buildWebhookHTTPClient(base *http.Client, webhookTarget *validatedWebhookTarget) *http.Client {
	var client http.Client
	if base != nil {
		client = *base
	}
	if client.Timeout <= 0 {
		client.Timeout = webhookHTTPClientTimeout
	}
	// Never follow redirects; return the first response as-is.
	client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	}
	if webhookTarget != nil {
		client.Transport = buildWebhookTransport(client.Transport, webhookTarget)
	}
	return &client
}
// disallowedWebhookIPNets lists CIDR ranges webhook targets must never
// resolve to (SSRF defense). isDisallowedWebhookIP layers category-based
// checks (loopback, link-local, private, …) on top of this list.
var disallowedWebhookIPNets = []net.IPNet{
	// "this host on this network" / unspecified.
	mustParseCIDR("0.0.0.0/8"),
	mustParseCIDR("127.0.0.0/8"), // loopback (includes 127.0.0.1)
	mustParseCIDR("10.0.0.0/8"), // RFC1918
	mustParseCIDR("192.168.0.0/16"), // RFC1918
	mustParseCIDR("172.16.0.0/12"), // RFC1918 (172.16.0.0 - 172.31.255.255)
	mustParseCIDR("100.64.0.0/10"), // RFC6598 (carrier-grade NAT)
	mustParseCIDR("169.254.0.0/16"), // IPv4 link-local (includes 169.254.169.254 metadata IP on many clouds)
	mustParseCIDR("198.18.0.0/15"), // RFC2544 benchmark testing
	mustParseCIDR("224.0.0.0/4"), // IPv4 multicast
	mustParseCIDR("240.0.0.0/4"), // IPv4 reserved
	mustParseCIDR("::/128"), // IPv6 unspecified
	mustParseCIDR("::1/128"), // IPv6 loopback
	mustParseCIDR("fc00::/7"), // IPv6 unique local
	mustParseCIDR("fe80::/10"), // IPv6 link-local
	mustParseCIDR("ff00::/8"), // IPv6 multicast
}
func mustParseCIDR(cidr string) net.IPNet {
_, block, err := net.ParseCIDR(cidr)
if err != nil {
panic(err)
}
return *block
}
// lookupIPAddrs resolves a hostname to IP addresses. It is a variable so
// tests can stub DNS resolution deterministically.
var lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
	return net.DefaultResolver.LookupIPAddr(ctx, host)
}
// validatedWebhookTarget is the result of validateWebhookURL: the parsed URL
// plus the canonical host/port and the DNS answers captured at validation
// time. The pinned IPs are the only addresses the webhook transport will dial
// (preventing DNS rebinding between validation and send).
type validatedWebhookTarget struct {
	URL *url.URL
	host string // lowercased hostname with any trailing dot removed
	port string // explicit port, or "443" (HTTPS default)
	pinnedIPs []net.IP
}
// webhookBaseDialContext performs the raw TCP dial for webhook delivery.
// It is a variable so tests can intercept dials without touching the network.
var webhookBaseDialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
	dialer := net.Dialer{
		Timeout: 5 * time.Second,
		KeepAlive: 30 * time.Second,
	}
	return dialer.DialContext(ctx, network, addr)
}
// buildWebhookTransport clones base (or http.DefaultTransport) and replaces
// its dialer so connections may only reach the validated target's pinned IPs.
//
// This defeats DNS rebinding: the dial path never re-resolves DNS, and each
// pinned IP is re-checked against the SSRF disallow list at dial time.
func buildWebhookTransport(base http.RoundTripper, webhookTarget *validatedWebhookTarget) http.RoundTripper {
	if webhookTarget == nil || webhookTarget.URL == nil {
		return base
	}
	var transport *http.Transport
	switch typed := base.(type) {
	case *http.Transport:
		if typed != nil {
			transport = typed.Clone()
		}
	}
	// Non-*http.Transport round-trippers cannot be customized; start from the
	// default transport instead.
	if transport == nil {
		if defaultTransport, ok := http.DefaultTransport.(*http.Transport); ok && defaultTransport != nil {
			transport = defaultTransport.Clone()
		} else {
			transport = (&http.Transport{}).Clone()
		}
	}
	webhookHost := webhookTarget.host
	webhookPort := webhookTarget.port
	// Copy the pinned IPs so later mutation of the target cannot affect dials.
	pinnedIPs := append([]net.IP(nil), webhookTarget.pinnedIPs...)
	// Disable proxying and custom TLS dialing so the pinned dialer below is
	// the only path to the network.
	transport.Proxy = nil
	transport.DialTLSContext = nil
	transport.DialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
		host, port, err := net.SplitHostPort(addr)
		if err != nil || host == "" || port == "" {
			return nil, fmt.Errorf("webhook dial target is invalid: %q", addr)
		}
		canonicalHost := strings.TrimSuffix(strings.ToLower(host), ".")
		// Refuse to dial anything other than the validated host:port.
		if canonicalHost != webhookHost || port != webhookPort {
			return nil, fmt.Errorf("webhook dial target mismatch: %q", addr)
		}
		var lastErr error
		for _, ip := range pinnedIPs {
			if isDisallowedWebhookIP(ip) {
				lastErr = fmt.Errorf("webhook target resolves to a disallowed ip")
				continue
			}
			dialAddr := net.JoinHostPort(ip.String(), port)
			conn, err := webhookBaseDialContext(ctx, network, dialAddr)
			if err == nil {
				return conn, nil
			}
			lastErr = err
		}
		if lastErr == nil {
			lastErr = errors.New("webhook target has no resolved addresses")
		}
		return nil, lastErr
	}
	return transport
}
// validateWebhookURL parses and vets a webhook URL for safe outbound use.
//
// It enforces: HTTPS scheme, a host without userinfo, a sane explicit port
// (if any), no "localhost", and — whether the host is an IP literal or a DNS
// name — that every address falls outside the SSRF disallow list. DNS answers
// are captured ("pinned") so the transport dials exactly what was validated.
func validateWebhookURL(ctx context.Context, raw string) (*validatedWebhookTarget, error) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil, errors.New("webhook url is empty")
	}
	// Avoid request smuggling / header injection vectors.
	if strings.ContainsAny(raw, "\r\n") {
		return nil, errors.New("webhook url contains invalid characters")
	}
	parsed, err := url.Parse(raw)
	if err != nil {
		return nil, errors.New("webhook url format is invalid")
	}
	if !strings.EqualFold(parsed.Scheme, "https") {
		return nil, errors.New("webhook url scheme must be https")
	}
	// Normalize scheme casing (EqualFold accepted e.g. "HTTPS").
	parsed.Scheme = "https"
	if parsed.Host == "" || parsed.Hostname() == "" {
		return nil, errors.New("webhook url must include host")
	}
	if parsed.User != nil {
		return nil, errors.New("webhook url must not include userinfo")
	}
	if parsed.Port() != "" {
		port, err := strconv.Atoi(parsed.Port())
		if err != nil || port < 1 || port > 65535 {
			return nil, errors.New("webhook url port is invalid")
		}
	}
	// Canonical host: lowercase, trailing dot stripped.
	host := strings.TrimSuffix(strings.ToLower(parsed.Hostname()), ".")
	if host == "localhost" {
		return nil, errors.New("webhook url host must not be localhost")
	}
	// IP-literal host: validate directly, no DNS needed.
	if ip := net.ParseIP(host); ip != nil {
		if isDisallowedWebhookIP(ip) {
			return nil, errors.New("webhook url host resolves to a disallowed ip")
		}
		return &validatedWebhookTarget{
			URL: parsed,
			host: host,
			port: portForScheme(parsed),
			pinnedIPs: []net.IP{ip},
		}, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}
	ips, err := lookupIPAddrs(ctx, host)
	if err != nil || len(ips) == 0 {
		return nil, errors.New("webhook url host cannot be resolved")
	}
	// Reject the whole URL if ANY resolved address is disallowed.
	pinned := make([]net.IP, 0, len(ips))
	for _, addr := range ips {
		if isDisallowedWebhookIP(addr.IP) {
			return nil, errors.New("webhook url host resolves to a disallowed ip")
		}
		if addr.IP != nil {
			pinned = append(pinned, addr.IP)
		}
	}
	if len(pinned) == 0 {
		return nil, errors.New("webhook url host cannot be resolved")
	}
	return &validatedWebhookTarget{
		URL: parsed,
		host: host,
		port: portForScheme(parsed),
		pinnedIPs: uniqueResolvedIPs(pinned),
	}, nil
}
// isDisallowedWebhookIP reports whether an IP is forbidden as a webhook
// destination. A nil or non-IP value returns false (callers handle that case
// separately).
func isDisallowedWebhookIP(ip net.IP) bool {
	if ip == nil {
		return false
	}
	// Normalize to the 4-byte form for IPv4 so CIDR matching works uniformly.
	if ip4 := ip.To4(); ip4 != nil {
		ip = ip4
	} else if ip16 := ip.To16(); ip16 != nil {
		ip = ip16
	} else {
		return false
	}
	// Disallow non-public addresses even if they're not explicitly covered by the CIDR list.
	// This provides defense-in-depth against SSRF targets such as link-local, multicast, and
	// unspecified addresses, and ensures any "pinned" IP is still blocked at dial time.
	if ip.IsUnspecified() ||
		ip.IsLoopback() ||
		ip.IsMulticast() ||
		ip.IsLinkLocalUnicast() ||
		ip.IsLinkLocalMulticast() ||
		ip.IsPrivate() {
		return true
	}
	for _, block := range disallowedWebhookIPNets {
		if block.Contains(ip) {
			return true
		}
	}
	return false
}
func portForScheme(u *url.URL) string {
if u != nil && u.Port() != "" {
return u.Port()
}
return "443"
}
func uniqueResolvedIPs(ips []net.IP) []net.IP {
seen := make(map[string]struct{}, len(ips))
out := make([]net.IP, 0, len(ips))
for _, ip := range ips {
if ip == nil {
continue
}
key := ip.String()
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
out = append(out, ip)
}
return out
}
//go:build integration
package service
import (
"context"
"database/sql"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// This integration test protects the DI startup contract for OpsAlertService.
//
// Background:
// - OpsMetricsCollector previously called alertService.Start()/Evaluate() directly.
// - Those direct calls were removed, so OpsAlertService must now start via DI
// (ProvideOpsAlertService in wire.go) and run its own evaluation ticker.
//
// What we validate here:
// 1. When we construct via the Wire provider functions (ProvideOpsAlertService +
// ProvideOpsMetricsCollector), OpsAlertService starts automatically.
// 2. Its evaluation loop continues to tick even if OpsMetricsCollector is stopped,
// proving the alert evaluator is independent.
// 3. The evaluation path can trigger alert logic (CreateAlertEvent called).
func TestOpsAlertService_StartedViaWireProviders_RunsIndependentTicker(t *testing.T) {
	// Shrink the eval interval so several ticks happen within the test window.
	oldInterval := opsAlertEvalInterval
	opsAlertEvalInterval = 25 * time.Millisecond
	t.Cleanup(func() { opsAlertEvalInterval = oldInterval })
	repo := newFakeOpsRepository()
	opsService := NewOpsService(repo, nil)
	// Start via the Wire provider function (the production DI path).
	alertService := ProvideOpsAlertService(opsService, nil, nil)
	t.Cleanup(alertService.Stop)
	// Construct via ProvideOpsMetricsCollector (wire.go). Stop immediately to ensure
	// the alert ticker keeps running without the metrics collector.
	collector := ProvideOpsMetricsCollector(opsService, NewConcurrencyService(nil))
	collector.Stop()
	// Wait for at least one evaluation (run() calls evaluateOnce immediately).
	require.Eventually(t, func() bool {
		return repo.listRulesCalls.Load() >= 1
	}, 1*time.Second, 5*time.Millisecond)
	// Confirm the evaluation loop keeps ticking after the metrics collector is stopped.
	callsAfterCollectorStop := repo.listRulesCalls.Load()
	require.Eventually(t, func() bool {
		return repo.listRulesCalls.Load() >= callsAfterCollectorStop+2
	}, 1*time.Second, 5*time.Millisecond)
	// Confirm the evaluation logic actually fires an alert event at least once.
	// (The fake enables its rule only from the 5th ListAlertRules call.)
	select {
	case <-repo.eventCreatedCh:
		// ok
	case <-time.After(2 * time.Second):
		t.Fatalf("expected OpsAlertService to create an alert event, but none was created (ListAlertRules calls=%d)", repo.listRulesCalls.Load())
	}
}
// newFakeOpsRepository builds the fake with its "event created" signal
// channel ready for use.
func newFakeOpsRepository() *fakeOpsRepository {
	return &fakeOpsRepository{
		eventCreatedCh: make(chan struct{}),
	}
}
// fakeOpsRepository is a lightweight in-memory stub of OpsRepository for integration tests.
// It avoids real DB/Redis usage and provides deterministic responses fast.
type fakeOpsRepository struct {
	// listRulesCalls counts ListAlertRules invocations (one per evaluation tick).
	listRulesCalls atomic.Int64
	// mu guards the event fields below.
	mu sync.Mutex
	activeEvent *OpsAlertEvent
	latestEvent *OpsAlertEvent
	nextEventID int64
	// eventCreatedCh is closed (once, via eventOnce) on the first CreateAlertEvent.
	eventCreatedCh chan struct{}
	eventOnce sync.Once
}
// The OpsRepository methods below are not exercised by the alert-ticker test
// and return benign zero values.

func (r *fakeOpsRepository) CreateErrorLog(ctx context.Context, log *OpsErrorLog) error {
	return nil
}
func (r *fakeOpsRepository) ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) {
	return nil, nil
}
func (r *fakeOpsRepository) ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error) {
	return nil, 0, nil
}
// GetLatestSystemMetric reports sql.ErrNoRows together with a non-nil metric.
// NOTE(review): returning a value alongside a non-nil error is unconventional
// in Go — confirm the caller ignores the metric when err != nil.
func (r *fakeOpsRepository) GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) {
	return &OpsMetrics{WindowMinutes: 1}, sql.ErrNoRows
}
func (r *fakeOpsRepository) CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error {
	return nil
}
func (r *fakeOpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	return &OpsWindowStats{}, nil
}
func (r *fakeOpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error) {
	return nil, nil
}
func (r *fakeOpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error) {
	return nil, nil
}
func (r *fakeOpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error) {
	return nil, nil
}
// ListRecentSystemMetrics fabricates `limit` samples at 99% CPU, spaced
// exactly opsMetricsInterval apart (newest first), so selectContiguousMetrics
// accepts them and the fake's CPU rule breaches on every evaluation.
func (r *fakeOpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) {
	if limit <= 0 {
		limit = 1
	}
	now := time.Now()
	metrics := make([]OpsMetrics, 0, limit)
	for i := 0; i < limit; i++ {
		metrics = append(metrics, OpsMetrics{
			WindowMinutes: windowMinutes,
			CPUUsagePercent: 99,
			UpdatedAt: now.Add(-time.Duration(i) * opsMetricsInterval),
		})
	}
	return metrics, nil
}
// ListSystemMetricsRange is unused by the test; returns no data.
func (r *fakeOpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) {
	return nil, nil
}
// ListAlertRules returns no rules for the first four calls, then a single
// always-breaching CPU rule (threshold 0 vs. the fake's 99% samples).
func (r *fakeOpsRepository) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) {
	call := r.listRulesCalls.Add(1)
	// Delay enabling rules slightly so the test can stop OpsMetricsCollector first,
	// then observe the alert evaluator ticking independently.
	if call < 5 {
		return nil, nil
	}
	return []OpsAlertRule{
		{
			ID: 1,
			Name: "cpu too high (test)",
			Enabled: true,
			MetricType: OpsMetricCPUUsagePercent,
			Operator: ">",
			Threshold: 0,
			WindowMinutes: 1,
			SustainedMinutes: 1,
			Severity: "P1",
			NotifyEmail: false,
			NotifyWebhook: false,
			CooldownMinutes: 0,
		},
	}, nil
}
// GetActiveAlertEvent returns a copy of the stored active event only when it
// matches the rule and is still firing; otherwise nil.
func (r *fakeOpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.activeEvent == nil {
		return nil, nil
	}
	if r.activeEvent.RuleID != ruleID {
		return nil, nil
	}
	if r.activeEvent.Status != OpsAlertStatusFiring {
		return nil, nil
	}
	// Return a copy so callers cannot mutate internal state.
	clone := *r.activeEvent
	return &clone, nil
}
// GetLatestAlertEvent returns a copy of the most recently created event for
// the rule, or nil when none matches.
func (r *fakeOpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.latestEvent == nil || r.latestEvent.RuleID != ruleID {
		return nil, nil
	}
	clone := *r.latestEvent
	return &clone, nil
}
// CreateAlertEvent assigns a sequential ID, records the event as latest (and
// as active when firing), and closes eventCreatedCh exactly once to signal
// the test.
func (r *fakeOpsRepository) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error {
	if event == nil {
		return nil
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	r.nextEventID++
	event.ID = r.nextEventID
	clone := *event
	r.latestEvent = &clone
	if clone.Status == OpsAlertStatusFiring {
		r.activeEvent = &clone
	}
	r.eventOnce.Do(func() { close(r.eventCreatedCh) })
	return nil
}
// UpdateAlertEventStatus applies status/resolvedAt to whichever stored copies
// (active and/or latest) carry the given event ID.
func (r *fakeOpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.activeEvent != nil && r.activeEvent.ID == eventID {
		r.activeEvent.Status = status
		r.activeEvent.ResolvedAt = resolvedAt
	}
	if r.latestEvent != nil && r.latestEvent.ID == eventID {
		r.latestEvent.Status = status
		r.latestEvent.ResolvedAt = resolvedAt
	}
	return nil
}
// UpdateAlertEventNotifications applies the notification flags to whichever
// stored copies (active and/or latest) carry the given event ID.
func (r *fakeOpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.activeEvent != nil && r.activeEvent.ID == eventID {
		r.activeEvent.EmailSent = emailSent
		r.activeEvent.WebhookSent = webhookSent
	}
	if r.latestEvent != nil && r.latestEvent.ID == eventID {
		r.latestEvent.EmailSent = emailSent
		r.latestEvent.WebhookSent = webhookSent
	}
	return nil
}
// CountActiveAlerts reports 1 when any active event is stored, else 0.
func (r *fakeOpsRepository) CountActiveAlerts(ctx context.Context) (int, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.activeEvent == nil {
		return 0, nil
	}
	return 1, nil
}

// The remaining methods are cache/overview stubs unused by the test.

func (r *fakeOpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error) {
	return &OverviewStats{}, nil
}
func (r *fakeOpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) {
	return nil, nil
}
func (r *fakeOpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error {
	return nil
}
func (r *fakeOpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) {
	return nil, nil
}
func (r *fakeOpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error {
	return nil
}
func (r *fakeOpsRepository) PingRedis(ctx context.Context) error {
	return nil
}
//go:build unit || opsalert_unit
package service
import (
"context"
"errors"
"net"
"net/http"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// Samples exactly one interval apart must be accepted as contiguous.
func TestSelectContiguousMetrics_Contiguous(t *testing.T) {
	now := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
	metrics := []OpsMetrics{
		{UpdatedAt: now},
		{UpdatedAt: now.Add(-1 * time.Minute)},
		{UpdatedAt: now.Add(-2 * time.Minute)},
	}
	selected, ok := selectContiguousMetrics(metrics, 3, now)
	require.True(t, ok)
	require.Len(t, selected, 3)
}
// A sampling gap larger than the tolerance must reject the whole run.
func TestSelectContiguousMetrics_GapFails(t *testing.T) {
	now := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
	metrics := []OpsMetrics{
		{UpdatedAt: now},
		// Missing the -1m sample (gap ~=2m).
		{UpdatedAt: now.Add(-2 * time.Minute)},
		{UpdatedAt: now.Add(-3 * time.Minute)},
	}
	_, ok := selectContiguousMetrics(metrics, 3, now)
	require.False(t, ok)
}
// Data whose newest sample is far older than now must be rejected as stale.
func TestSelectContiguousMetrics_StaleNewestFails(t *testing.T) {
	now := time.Date(2026, 1, 1, 0, 10, 0, 0, time.UTC)
	metrics := []OpsMetrics{
		{UpdatedAt: now.Add(-10 * time.Minute)},
		{UpdatedAt: now.Add(-11 * time.Minute)},
	}
	_, ok := selectContiguousMetrics(metrics, 2, now)
	require.False(t, ok)
}
// With zero requests, a rate metric must read as "no data" (ok=false), not 0%.
func TestMetricValue_SuccessRate_NoTrafficIsNoData(t *testing.T) {
	metric := OpsMetrics{
		RequestCount: 0,
		SuccessRate: 0,
	}
	value, ok := metricValue(metric, OpsMetricSuccessRate)
	require.False(t, ok)
	require.Equal(t, 0.0, value)
}
// Stop must be safe on a service that was never started.
func TestOpsAlertService_StopWithoutStart_NoPanic(t *testing.T) {
	s := NewOpsAlertService(nil, nil, nil)
	require.NotPanics(t, func() { s.Stop() })
}
// Stop must join the background goroutine promptly and be idempotent.
func TestOpsAlertService_StartStop_Graceful(t *testing.T) {
	s := NewOpsAlertService(nil, nil, nil)
	s.interval = 5 * time.Millisecond
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	s.StartWithContext(ctx)
	done := make(chan struct{})
	go func() {
		s.Stop()
		close(done)
	}()
	select {
	case <-done:
		// ok
	case <-time.After(1 * time.Second):
		t.Fatal("Stop did not return; background goroutine likely stuck")
	}
	// Second Stop must be a no-op, not a panic.
	require.NotPanics(t, func() { s.Stop() })
}
// The client must gain a default timeout when the base has none, keep an
// explicit base timeout, and always refuse to follow redirects.
func TestBuildWebhookHTTPClient_DefaultTimeout(t *testing.T) {
	client := buildWebhookHTTPClient(nil, nil)
	require.Equal(t, webhookHTTPClientTimeout, client.Timeout)
	require.NotNil(t, client.CheckRedirect)
	require.ErrorIs(t, client.CheckRedirect(nil, nil), http.ErrUseLastResponse)
	base := &http.Client{}
	client = buildWebhookHTTPClient(base, nil)
	require.Equal(t, webhookHTTPClientTimeout, client.Timeout)
	require.NotNil(t, client.CheckRedirect)
	base = &http.Client{Timeout: 2 * time.Second}
	client = buildWebhookHTTPClient(base, nil)
	require.Equal(t, 2*time.Second, client.Timeout)
	require.NotNil(t, client.CheckRedirect)
}
// Plain-HTTP URLs are rejected even when the host resolves publicly.
func TestValidateWebhookURL_RequiresHTTPS(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil
	}
	_, err := validateWebhookURL(context.Background(), "http://example.com/webhook")
	require.Error(t, err)
}
// Unparseable URLs are rejected.
func TestValidateWebhookURL_InvalidFormatRejected(t *testing.T) {
	_, err := validateWebhookURL(context.Background(), "https://[::1")
	require.Error(t, err)
}
// Embedded credentials (userinfo) are rejected.
func TestValidateWebhookURL_RejectsUserinfo(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil
	}
	_, err := validateWebhookURL(context.Background(), "https://user:pass@example.com/webhook")
	require.Error(t, err)
}
// "localhost" is rejected by name, before any DNS resolution.
func TestValidateWebhookURL_RejectsLocalhost(t *testing.T) {
	_, err := validateWebhookURL(context.Background(), "https://localhost/webhook")
	require.Error(t, err)
}
// Every disallowed IP-literal category must be rejected directly.
func TestValidateWebhookURL_RejectsPrivateIPLiteral(t *testing.T) {
	cases := []string{
		"https://0.0.0.0/webhook",
		"https://127.0.0.1/webhook",
		"https://10.0.0.1/webhook",
		"https://192.168.1.2/webhook",
		"https://172.16.0.1/webhook",
		"https://172.31.255.255/webhook",
		"https://100.64.0.1/webhook",
		"https://169.254.169.254/webhook",
		"https://198.18.0.1/webhook",
		"https://224.0.0.1/webhook",
		"https://240.0.0.1/webhook",
		"https://[::]/webhook",
		"https://[::1]/webhook",
		"https://[ff02::1]/webhook",
	}
	for _, tc := range cases {
		t.Run(tc, func(t *testing.T) {
			_, err := validateWebhookURL(context.Background(), tc)
			require.Error(t, err)
		})
	}
}
// A public-looking hostname resolving to a private address must be rejected.
func TestValidateWebhookURL_RejectsPrivateIPViaDNS(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		require.Equal(t, "internal.example", host)
		return []net.IPAddr{{IP: net.ParseIP("10.0.0.2")}}, nil
	}
	_, err := validateWebhookURL(context.Background(), "https://internal.example/webhook")
	require.Error(t, err)
}
// A hostname resolving to the cloud metadata link-local IP must be rejected.
func TestValidateWebhookURL_RejectsLinkLocalIPViaDNS(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		require.Equal(t, "metadata.example", host)
		return []net.IPAddr{{IP: net.ParseIP("169.254.169.254")}}, nil
	}
	_, err := validateWebhookURL(context.Background(), "https://metadata.example/webhook")
	require.Error(t, err)
}
// A hostname resolving only to public addresses must validate, preserving
// scheme, host and explicit port.
func TestValidateWebhookURL_AllowsPublicHostViaDNS(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		require.Equal(t, "example.com", host)
		return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil
	}
	target, err := validateWebhookURL(context.Background(), "https://example.com:443/webhook")
	require.NoError(t, err)
	require.Equal(t, "https", target.URL.Scheme)
	require.Equal(t, "example.com", target.URL.Hostname())
	require.Equal(t, "443", target.URL.Port())
}
// Ports outside 1-65535 must be rejected.
func TestValidateWebhookURL_RejectsInvalidPort(t *testing.T) {
	oldLookup := lookupIPAddrs
	t.Cleanup(func() { lookupIPAddrs = oldLookup })
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil
	}
	_, err := validateWebhookURL(context.Background(), "https://example.com:99999/webhook")
	require.Error(t, err)
}
// TestWebhookTransport_UsesPinnedIP_NoDNSRebinding verifies the DNS-rebinding
// defence: the IP resolved during validation is pinned, and the HTTP
// transport dials that exact IP without re-resolving — so an attacker who
// flips the DNS record to a private address after validation gains nothing.
func TestWebhookTransport_UsesPinnedIP_NoDNSRebinding(t *testing.T) {
	// Save both hooks (resolver and base dialer) and restore them afterwards.
	oldLookup := lookupIPAddrs
	oldDial := webhookBaseDialContext
	t.Cleanup(func() {
		lookupIPAddrs = oldLookup
		webhookBaseDialContext = oldDial
	})
	lookupCalls := 0
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		lookupCalls++
		require.Equal(t, "example.com", host)
		return []net.IPAddr{{IP: net.ParseIP("93.184.216.34")}}, nil
	}
	// Validation performs exactly one DNS lookup and pins the result.
	target, err := validateWebhookURL(context.Background(), "https://example.com/webhook")
	require.NoError(t, err)
	require.Equal(t, 1, lookupCalls)
	// Simulate a rebinding attack: subsequent resolutions return a private IP.
	lookupIPAddrs = func(ctx context.Context, host string) ([]net.IPAddr, error) {
		lookupCalls++
		return []net.IPAddr{{IP: net.ParseIP("10.0.0.1")}}, nil
	}
	// Capture the address actually dialed; fail the dial so no real I/O happens.
	var dialAddrs []string
	webhookBaseDialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
		dialAddrs = append(dialAddrs, addr)
		return nil, errors.New("dial blocked in test")
	}
	client := buildWebhookHTTPClient(nil, target)
	transport, ok := client.Transport.(*http.Transport)
	require.True(t, ok)
	_, err = transport.DialContext(context.Background(), "tcp", "example.com:443")
	require.Error(t, err)
	// The dial went to the originally pinned public IP, not the rebound one.
	require.Equal(t, []string{"93.184.216.34:443"}, dialAddrs)
	require.Equal(t, 1, lookupCalls, "dial path must not re-resolve DNS")
}
// TestRetryWithBackoff_SucceedsAfterRetries checks that retryWithBackoff
// retries a failing operation up to the retry budget, sleeping the
// configured backoff before each retry, and reports success once the
// operation finally succeeds (here on the 4th attempt: 1 initial + 3 retries).
func TestRetryWithBackoff_SucceedsAfterRetries(t *testing.T) {
	// Stub the sleep hook to record requested delays without actually waiting.
	oldSleep := opsAlertSleep
	t.Cleanup(func() { opsAlertSleep = oldSleep })
	var slept []time.Duration
	opsAlertSleep = func(ctx context.Context, d time.Duration) error {
		slept = append(slept, d)
		return nil
	}
	attempts := 0
	err := retryWithBackoff(
		context.Background(),
		3, // max retries (in addition to the initial attempt)
		[]time.Duration{time.Second, 2 * time.Second, 4 * time.Second},
		func() error {
			attempts++
			if attempts <= 3 {
				return errors.New("send failed")
			}
			return nil
		},
		nil, // no per-retry callback
	)
	require.NoError(t, err)
	require.Equal(t, 4, attempts)
	// One sleep per retry, using the backoff schedule in order.
	require.Equal(t, []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}, slept)
}
// TestRetryWithBackoff_ContextCanceledStopsRetries verifies that cancelling
// the context during the first backoff sleep aborts the retry loop: only one
// attempt and one sleep happen, and the context error is propagated.
func TestRetryWithBackoff_ContextCanceledStopsRetries(t *testing.T) {
	// Stub the sleep hook; it honours cancellation by returning ctx.Err().
	oldSleep := opsAlertSleep
	t.Cleanup(func() { opsAlertSleep = oldSleep })
	var slept []time.Duration
	opsAlertSleep = func(ctx context.Context, d time.Duration) error {
		slept = append(slept, d)
		return ctx.Err()
	}
	ctx, cancel := context.WithCancel(context.Background())
	attempts := 0
	err := retryWithBackoff(
		ctx,
		3,
		[]time.Duration{time.Second, 2 * time.Second, 4 * time.Second},
		func() error {
			attempts++
			return errors.New("send failed")
		},
		// The retry callback cancels the context after the first failed attempt,
		// so the first sleep observes cancellation.
		func(attempt int, total int, nextDelay time.Duration, err error) {
			if attempt == 1 {
				cancel()
			}
		},
	)
	require.ErrorIs(t, err, context.Canceled)
	require.Equal(t, 1, attempts)
	require.Equal(t, []time.Duration{time.Second}, slept)
}
package service
import (
"context"
"time"
)
// Alert event lifecycle states (OpsAlertEvent.Status).
const (
	OpsAlertStatusFiring   = "firing"
	OpsAlertStatusResolved = "resolved"
)

// Metric identifiers an alert rule can target (OpsAlertRule.MetricType).
// They mirror the corresponding fields on OpsMetrics.
const (
	OpsMetricSuccessRate        = "success_rate"
	OpsMetricErrorRate          = "error_rate"
	OpsMetricP95LatencyMs       = "p95_latency_ms"
	OpsMetricP99LatencyMs       = "p99_latency_ms"
	OpsMetricHTTP2Errors        = "http2_errors"
	OpsMetricCPUUsagePercent    = "cpu_usage_percent"
	OpsMetricMemoryUsagePercent = "memory_usage_percent"
	OpsMetricQueueDepth         = "concurrency_queue_depth"
)
// OpsAlertRule describes a threshold-based alerting rule evaluated against
// ops metrics: when MetricType compared via Operator against Threshold holds
// over the configured window, an alert event fires.
type OpsAlertRule struct {
	ID          int64  `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
	Enabled     bool   `json:"enabled"`
	// MetricType is one of the OpsMetric* constants.
	MetricType string  `json:"metric_type"`
	Operator   string  `json:"operator"`
	Threshold  float64 `json:"threshold"`
	// WindowMinutes is the aggregation window; SustainedMinutes is how long
	// the condition must hold before firing.
	WindowMinutes    int    `json:"window_minutes"`
	SustainedMinutes int    `json:"sustained_minutes"`
	Severity         string `json:"severity"`
	// Notification targets.
	NotifyEmail   bool   `json:"notify_email"`
	NotifyWebhook bool   `json:"notify_webhook"`
	WebhookURL    string `json:"webhook_url"`
	// CooldownMinutes suppresses re-firing for a period after an event.
	CooldownMinutes  int            `json:"cooldown_minutes"`
	DimensionFilters map[string]any `json:"dimension_filters,omitempty"`
	NotifyChannels   []string       `json:"notify_channels,omitempty"`
	NotifyConfig     map[string]any `json:"notify_config,omitempty"`
	CreatedAt        time.Time      `json:"created_at"`
	UpdatedAt        time.Time      `json:"updated_at"`
}
// OpsAlertEvent is one firing (and eventually resolved) occurrence of an
// alert rule, recording the observed metric value versus the threshold and
// which notification channels were delivered.
type OpsAlertEvent struct {
	ID     int64 `json:"id"`
	RuleID int64 `json:"rule_id"`
	// Severity is copied from the rule at fire time.
	Severity string `json:"severity"`
	// Status is OpsAlertStatusFiring or OpsAlertStatusResolved.
	Status         string  `json:"status"`
	Title          string  `json:"title"`
	Description    string  `json:"description"`
	MetricValue    float64 `json:"metric_value"`
	ThresholdValue float64 `json:"threshold_value"`
	FiredAt        time.Time `json:"fired_at"`
	// ResolvedAt is nil while the event is still firing.
	ResolvedAt  *time.Time `json:"resolved_at"`
	EmailSent   bool       `json:"email_sent"`
	WebhookSent bool       `json:"webhook_sent"`
	CreatedAt   time.Time  `json:"created_at"`
}
// The delegations below bound every repository call with opsDBQueryTimeout,
// consistent with the rest of OpsService (RecordError, GetWindowStats, ...),
// so a slow database cannot stall alert evaluation indefinitely.

// ListAlertRules returns every configured alert rule.
func (s *OpsService) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.ListAlertRules(ctxDB)
}

// GetActiveAlertEvent returns the currently-firing event for a rule, if any.
func (s *OpsService) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.GetActiveAlertEvent(ctxDB, ruleID)
}

// GetLatestAlertEvent returns the most recent event for a rule regardless of status.
func (s *OpsService) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.GetLatestAlertEvent(ctxDB, ruleID)
}

// CreateAlertEvent persists a new alert event.
func (s *OpsService) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.CreateAlertEvent(ctxDB, event)
}

// UpdateAlertEventStatus transitions an event (e.g. firing -> resolved).
func (s *OpsService) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.UpdateAlertEventStatus(ctxDB, eventID, status, resolvedAt)
}

// UpdateAlertEventNotifications records which notification channels succeeded.
func (s *OpsService) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.UpdateAlertEventNotifications(ctxDB, eventID, emailSent, webhookSent)
}

// ListRecentSystemMetrics returns the newest metric rows for a window size.
func (s *OpsService) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.ListRecentSystemMetrics(ctxDB, windowMinutes, limit)
}

// CountActiveAlerts returns the number of currently-firing alert events.
func (s *OpsService) CountActiveAlerts(ctx context.Context) (int, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.CountActiveAlerts(ctxDB)
}
package service
import (
"context"
"log"
"runtime"
"sync"
"time"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/mem"
)
const (
	// opsMetricsInterval is how often the collector samples and persists metrics.
	opsMetricsInterval       = 1 * time.Minute
	// opsMetricsCollectTimeout bounds a single collection cycle.
	opsMetricsCollectTimeout = 10 * time.Second
	// Both request-aggregation windows recorded on every cycle.
	opsMetricsWindowShortMinutes = 1
	opsMetricsWindowLongMinutes  = 5
	bytesPerMB                   = 1024 * 1024
	// A 0 interval asks gopsutil for a non-blocking CPU delta since the
	// previous call (see cpu.Percent docs) instead of sleeping to sample.
	cpuUsageSampleInterval = 0 * time.Second
	percentScale           = 100
)
// OpsMetricsCollector periodically samples system/request metrics and stores
// them through OpsService. Start/Stop are idempotent via sync.Once.
type OpsMetricsCollector struct {
	opsService         *OpsService
	concurrencyService *ConcurrencyService
	interval           time.Duration
	// lastGCPauseTotal holds the previous cumulative GC pause reading so each
	// sample can report the delta; guarded by lastGCPauseMu.
	lastGCPauseTotal uint64
	lastGCPauseMu    sync.Mutex
	// stopCh signals the background loop to exit.
	stopCh    chan struct{}
	startOnce sync.Once
	stopOnce  sync.Once
}
// NewOpsMetricsCollector builds a collector that samples metrics every
// opsMetricsInterval. The stop channel is created eagerly: previously it was
// only allocated inside Start, so a Stop that raced ahead of Start consumed
// stopOnce on a nil channel and a later-started goroutine could never be
// stopped. Start's lazy-init branch remains a harmless no-op.
func NewOpsMetricsCollector(opsService *OpsService, concurrencyService *ConcurrencyService) *OpsMetricsCollector {
	return &OpsMetricsCollector{
		opsService:         opsService,
		concurrencyService: concurrencyService,
		interval:           opsMetricsInterval,
		stopCh:             make(chan struct{}),
	}
}
// Start launches the background collection loop. Safe on a nil receiver and
// idempotent: startOnce guarantees at most one goroutine is spawned.
func (c *OpsMetricsCollector) Start() {
	if c == nil {
		return
	}
	c.startOnce.Do(func() {
		// Lazily create the stop channel in case the constructor did not.
		if c.stopCh == nil {
			c.stopCh = make(chan struct{})
		}
		go c.run()
	})
}
// Stop signals the collection loop to exit. Safe on a nil receiver and
// idempotent: stopOnce ensures the channel is closed at most once.
func (c *OpsMetricsCollector) Stop() {
	if c == nil {
		return
	}
	c.stopOnce.Do(func() {
		if c.stopCh != nil {
			close(c.stopCh)
		}
	})
}
// run is the collection loop: one immediate sample at startup, then one per
// tick until stopCh is closed.
func (c *OpsMetricsCollector) run() {
	ticker := time.NewTicker(c.interval)
	defer ticker.Stop()
	// Collect immediately so the dashboard has data without waiting a full interval.
	c.collectOnce()
	for {
		select {
		case <-ticker.C:
			c.collectOnce()
		case <-c.stopCh:
			return
		}
	}
}
// collectOnce gathers one metrics snapshot: host/runtime stats, concurrency
// queue depth and active-alert count, then records request-window aggregates
// for both the 1-minute and 5-minute windows.
func (c *OpsMetricsCollector) collectOnce() {
	if c.opsService == nil {
		return
	}
	// Bound the whole cycle so a slow dependency cannot stall the loop.
	ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectTimeout)
	defer cancel()
	now := time.Now()
	// Shared (window-independent) samples, taken once per cycle.
	systemStats := c.collectSystemStats(ctx)
	queueDepth := c.collectQueueDepth(ctx)
	activeAlerts := c.collectActiveAlerts(ctx)
	for _, window := range []int{opsMetricsWindowShortMinutes, opsMetricsWindowLongMinutes} {
		startTime := now.Add(-time.Duration(window) * time.Minute)
		windowStats, err := c.opsService.GetWindowStats(ctx, startTime, now)
		if err != nil {
			// Skip this window but still attempt the other one.
			log.Printf("[OpsMetrics] failed to get window stats (%dm): %v", window, err)
			continue
		}
		successRate, errorRate := computeRates(windowStats.SuccessCount, windowStats.ErrorCount)
		requestCount := windowStats.SuccessCount + windowStats.ErrorCount
		metric := &OpsMetrics{
			WindowMinutes:         window,
			RequestCount:          requestCount,
			SuccessCount:          windowStats.SuccessCount,
			ErrorCount:            windowStats.ErrorCount,
			SuccessRate:           successRate,
			ErrorRate:             errorRate,
			P95LatencyMs:          windowStats.P95LatencyMs,
			P99LatencyMs:          windowStats.P99LatencyMs,
			HTTP2Errors:           windowStats.HTTP2Errors,
			ActiveAlerts:          activeAlerts,
			CPUUsagePercent:       systemStats.cpuUsage,
			MemoryUsedMB:          systemStats.memoryUsedMB,
			MemoryTotalMB:         systemStats.memoryTotalMB,
			MemoryUsagePercent:    systemStats.memoryUsagePercent,
			HeapAllocMB:           systemStats.heapAllocMB,
			GCPauseMs:             systemStats.gcPauseMs,
			ConcurrencyQueueDepth: queueDepth,
			UpdatedAt:             now,
		}
		if err := c.opsService.RecordMetrics(ctx, metric); err != nil {
			log.Printf("[OpsMetrics] failed to record metrics (%dm): %v", window, err)
		}
	}
}
// computeRates converts raw success/error counts into percentage rates.
// With zero traffic it returns (0, 0) so the UI can render N/A rather than
// a misleading "100% success".
func computeRates(successCount, errorCount int64) (float64, float64) {
	total := successCount + errorCount
	if total == 0 {
		return 0, 0
	}
	denom := float64(total)
	successRate := float64(successCount) / denom * percentScale
	errorRate := float64(errorCount) / denom * percentScale
	return successRate, errorRate
}
// opsSystemStats is one sample of host and Go-runtime resource usage,
// produced by collectSystemStats.
type opsSystemStats struct {
	cpuUsage           float64 // host CPU usage, percent
	memoryUsedMB       int64
	memoryTotalMB      int64
	memoryUsagePercent float64
	heapAllocMB        int64   // Go heap in use (runtime.MemStats.HeapAlloc)
	gcPauseMs          float64 // GC pause time accumulated since the previous sample
}
// collectSystemStats samples host CPU/memory via gopsutil and Go runtime
// heap/GC stats. All host probes are best-effort: on error the corresponding
// fields simply stay zero.
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) opsSystemStats {
	stats := opsSystemStats{}
	if percents, err := cpu.PercentWithContext(ctx, cpuUsageSampleInterval, false); err == nil && len(percents) > 0 {
		stats.cpuUsage = percents[0]
	}
	if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil {
		stats.memoryUsedMB = int64(vm.Used / bytesPerMB)
		stats.memoryTotalMB = int64(vm.Total / bytesPerMB)
		stats.memoryUsagePercent = vm.UsedPercent
	}
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)
	stats.heapAllocMB = int64(memStats.HeapAlloc / bytesPerMB)
	// Report GC pause as the delta of the cumulative counter since the last
	// sample; the first sample (lastGCPauseTotal == 0) reports 0 because
	// there is no baseline yet.
	c.lastGCPauseMu.Lock()
	if c.lastGCPauseTotal != 0 && memStats.PauseTotalNs >= c.lastGCPauseTotal {
		stats.gcPauseMs = float64(memStats.PauseTotalNs-c.lastGCPauseTotal) / float64(time.Millisecond)
	}
	c.lastGCPauseTotal = memStats.PauseTotalNs
	c.lastGCPauseMu.Unlock()
	return stats
}
// collectQueueDepth reports how many requests are currently waiting on
// concurrency slots; 0 when the service is absent or the lookup fails.
func (c *OpsMetricsCollector) collectQueueDepth(ctx context.Context) int {
	svc := c.concurrencyService
	if svc == nil {
		return 0
	}
	depth, err := svc.GetTotalWaitCount(ctx)
	if err == nil {
		return depth
	}
	log.Printf("[OpsMetrics] failed to get queue depth: %v", err)
	return 0
}
// collectActiveAlerts returns the number of currently-firing alert events.
// Failures are logged (previously swallowed silently, inconsistent with
// collectQueueDepth) and reported as 0 so a transient DB error does not
// block the metrics snapshot.
func (c *OpsMetricsCollector) collectActiveAlerts(ctx context.Context) int {
	if c.opsService == nil {
		return 0
	}
	count, err := c.opsService.CountActiveAlerts(ctx)
	if err != nil {
		log.Printf("[OpsMetrics] failed to count active alerts: %v", err)
		return 0
	}
	return count
}
package service
import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"log"
	"math"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/shirou/gopsutil/v4/disk"
)
// OpsMetrics is one persisted snapshot of request and system health for a
// given aggregation window (1 or 5 minutes). It is both the collector's
// write model and the dashboard's read model.
type OpsMetrics struct {
	WindowMinutes int   `json:"window_minutes"`
	RequestCount  int64 `json:"request_count"`
	SuccessCount  int64 `json:"success_count"`
	ErrorCount    int64 `json:"error_count"`
	// Rates are percentages; both are 0 when RequestCount is 0 (render as N/A).
	SuccessRate  float64 `json:"success_rate"`
	ErrorRate    float64 `json:"error_rate"`
	P95LatencyMs int     `json:"p95_latency_ms"`
	P99LatencyMs int     `json:"p99_latency_ms"`
	HTTP2Errors  int     `json:"http2_errors"`
	ActiveAlerts int     `json:"active_alerts"`
	// Host/runtime resource usage at sample time.
	CPUUsagePercent       float64 `json:"cpu_usage_percent"`
	MemoryUsedMB          int64   `json:"memory_used_mb"`
	MemoryTotalMB         int64   `json:"memory_total_mb"`
	MemoryUsagePercent    float64 `json:"memory_usage_percent"`
	HeapAllocMB           int64   `json:"heap_alloc_mb"`
	GCPauseMs             float64 `json:"gc_pause_ms"`
	ConcurrencyQueueDepth int     `json:"concurrency_queue_depth"`
	UpdatedAt             time.Time `json:"updated_at,omitempty"`
}
// OpsErrorLog is a single recorded request/system error with enough context
// (platform, model, request id, caller ids) to diagnose it from the ops UI.
type OpsErrorLog struct {
	ID        int64     `json:"id"`
	CreatedAt time.Time `json:"created_at"`
	// Phase/Type/Severity classify the error; defaults are filled by RecordError.
	Phase      string `json:"phase"`
	Type       string `json:"type"`
	Severity   string `json:"severity"`
	StatusCode int    `json:"status_code"`
	Platform   string `json:"platform"`
	Model      string `json:"model"`
	// LatencyMs is nil when no latency was measured for the failed request.
	LatencyMs *int   `json:"latency_ms"`
	RequestID string `json:"request_id"`
	Message   string `json:"message"`
	// Optional caller attribution.
	UserID      *int64 `json:"user_id,omitempty"`
	APIKeyID    *int64 `json:"api_key_id,omitempty"`
	AccountID   *int64 `json:"account_id,omitempty"`
	GroupID     *int64 `json:"group_id,omitempty"`
	ClientIP    string `json:"client_ip,omitempty"`
	RequestPath string `json:"request_path,omitempty"`
	Stream      bool   `json:"stream"`
}
// OpsErrorLogFilters narrows the legacy (non-paginated) error-log query.
// Zero-valued fields mean "no filter"; Limit caps the returned rows.
type OpsErrorLogFilters struct {
	StartTime *time.Time
	EndTime   *time.Time
	Platform  string
	Phase     string
	Severity  string
	// Query is a free-text search term; exact matching semantics live in the
	// repository implementation.
	Query string
	Limit int
}
// OpsWindowStats aggregates request outcomes and latency percentiles over a
// time window, as computed by the repository.
type OpsWindowStats struct {
	SuccessCount int64
	ErrorCount   int64
	P95LatencyMs int
	P99LatencyMs int
	HTTP2Errors  int
}
// ProviderStats is the per-platform aggregate row returned by the repository
// for provider-health reporting.
type ProviderStats struct {
	Platform     string
	RequestCount int64
	SuccessCount int64
	ErrorCount   int64
	AvgLatencyMs int
	P99LatencyMs int
	// Error breakdown by class.
	Error4xxCount int64
	Error5xxCount int64
	TimeoutCount  int64
}
// ProviderHealthErrorsByType breaks a provider's errors down by class for
// the dashboard.
type ProviderHealthErrorsByType struct {
	HTTP4xx int64 `json:"4xx"`
	HTTP5xx int64 `json:"5xx"`
	Timeout int64 `json:"timeout"`
}

// ProviderHealthData is the API-facing health summary for one upstream
// platform, derived from ProviderStats by GetProviderHealth.
type ProviderHealthData struct {
	Name         string  `json:"name"`
	RequestCount int64   `json:"request_count"`
	SuccessRate  float64 `json:"success_rate"`
	ErrorRate    float64 `json:"error_rate"`
	LatencyAvg   int     `json:"latency_avg"`
	LatencyP99   int     `json:"latency_p99"`
	// Status is computed by classifyProviderStatus from rate/latency/timeouts.
	Status       string                     `json:"status"`
	ErrorsByType ProviderHealthErrorsByType `json:"errors_by_type"`
}
// LatencyHistogramItem is one latency bucket (e.g. a range label with its
// request count and share of the total).
type LatencyHistogramItem struct {
	Range      string  `json:"range"`
	Count      int64   `json:"count"`
	Percentage float64 `json:"percentage"`
}

// ErrorDistributionItem is one row of the error-by-code breakdown.
type ErrorDistributionItem struct {
	Code       string  `json:"code"`
	Message    string  `json:"message"`
	Count      int64   `json:"count"`
	Percentage float64 `json:"percentage"`
}
// OpsRepository is the persistence boundary for the ops module: error logs,
// metric snapshots, alert rules/events, dashboard aggregates and the
// Redis-backed caches. All methods are implemented in the repository layer.
type OpsRepository interface {
	// CreateErrorLog persists one error entry.
	CreateErrorLog(ctx context.Context, log *OpsErrorLog) error
	// ListErrorLogsLegacy keeps the original non-paginated query API used by the
	// existing /api/v1/admin/ops/error-logs endpoint (limit is capped at 500; for
	// stable pagination use /api/v1/admin/ops/errors).
	ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error)
	// ListErrorLogs provides a paginated error-log query API (with total count).
	ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error)
	// Metric snapshots (read/write) and window aggregates.
	GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error)
	CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error
	GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error)
	GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error)
	GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error)
	GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error)
	ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error)
	ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error)
	// Alert rules and events.
	ListAlertRules(ctx context.Context) ([]OpsAlertRule, error)
	GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
	CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error
	UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
	UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error
	CountActiveAlerts(ctx context.Context) (int, error)
	GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error)
	// Redis-backed cache/health (best-effort; implementation lives in repository layer).
	GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error)
	SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error
	GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error)
	SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error
	PingRedis(ctx context.Context) error
}
// OpsService implements the ops dashboard's business logic on top of
// OpsRepository, with a raw *sql.DB handle kept only for health checks and
// connection-pool statistics.
type OpsService struct {
	repo  OpsRepository
	sqlDB *sql.DB
	// Once-guards so nil-dependency warnings are logged a single time.
	redisNilWarnOnce sync.Once
	dbNilWarnOnce    sync.Once
}

// opsDBQueryTimeout bounds every individual DB query issued by OpsService.
const opsDBQueryTimeout = 5 * time.Second
// NewOpsService wires the ops repository and raw DB handle together and runs
// best-effort startup health checks. Degraded dependencies are only logged —
// startup never fails (graceful degradation).
func NewOpsService(repo OpsRepository, sqlDB *sql.DB) *OpsService {
	svc := &OpsService{repo: repo, sqlDB: sqlDB}
	log.Printf("[OpsService] Performing startup health checks...")
	checkCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	redisState := svc.checkRedisHealth(checkCtx)
	dbState := svc.checkDatabaseHealth(checkCtx)
	log.Printf("[OpsService] Startup health check complete: Redis=%s, Database=%s", redisState, dbState)
	degraded := redisState == "critical" || dbState == "critical"
	if degraded {
		log.Printf("[OpsService][WARN] Service starting with degraded dependencies - some features may be unavailable")
	}
	return svc
}
// RecordError persists a single error-log entry, filling in defaults for any
// missing classification fields. A nil entry is a no-op.
func (s *OpsService) RecordError(ctx context.Context, entry *OpsErrorLog) error {
	if entry == nil {
		return nil
	}
	if entry.CreatedAt.IsZero() {
		entry.CreatedAt = time.Now()
	}
	// Default classification for callers that only supply a message.
	for _, d := range []struct {
		field *string
		value string
	}{
		{&entry.Severity, "P2"},
		{&entry.Phase, "internal"},
		{&entry.Type, "unknown_error"},
		{&entry.Message, "Unknown error"},
	} {
		if *d.field == "" {
			*d.field = d.value
		}
	}
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.CreateErrorLog(ctxDB, entry)
}
// RecordMetrics stores a metrics snapshot and, for the default 1-minute
// window, refreshes the short-lived "latest metrics" cache that the
// dashboard polls frequently. A nil metric is a no-op.
func (s *OpsService) RecordMetrics(ctx context.Context, metric *OpsMetrics) error {
	if metric == nil {
		return nil
	}
	if metric.UpdatedAt.IsZero() {
		metric.UpdatedAt = time.Now()
	}
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	if err := s.repo.CreateSystemMetric(ctxDB, metric); err != nil {
		return err
	}
	// Only the default window feeds the latest-snapshot endpoint; a zero
	// WindowMinutes is treated as that default. Cache writes are best-effort
	// and must never fail the write path.
	if w := metric.WindowMinutes; w == 0 || w == 1 {
		if repo := s.repo; repo != nil {
			_ = repo.SetCachedLatestSystemMetric(ctx, metric)
		}
	}
	return nil
}
// ListErrorLogs runs the legacy (non-paginated) error-log query and reports
// the returned slice length as the total.
func (s *OpsService) ListErrorLogs(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, int, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	entries, err := s.repo.ListErrorLogsLegacy(ctxDB, filters)
	if err != nil {
		return nil, 0, err
	}
	return entries, len(entries), nil
}
// GetWindowStats returns aggregate success/error counts, latency percentiles
// and HTTP/2 error counts for requests between startTime and endTime,
// bounded by the standard DB query timeout.
func (s *OpsService) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.GetWindowStats(ctxDB, startTime, endTime)
}
// GetLatestMetrics returns the most recent metrics snapshot, preferring the
// short-lived cache and falling back to the database. When no snapshot
// exists (or the service is not wired up), an empty 1-minute-window metric
// is returned instead of an error so the dashboard can render a zero state.
//
// Fix: the original checked `s != nil` around the cache read and the cache
// backfill but dereferenced s.repo unconditionally in between, so a nil
// service or repository still panicked. Guard once up front, consistent
// with ListMetricsHistory.
func (s *OpsService) GetLatestMetrics(ctx context.Context) (*OpsMetrics, error) {
	if s == nil || s.repo == nil {
		return &OpsMetrics{WindowMinutes: 1}, nil
	}
	// Cache first (best-effort): cache errors should not break the dashboard.
	if cached, err := s.repo.GetCachedLatestSystemMetric(ctx); err == nil && cached != nil {
		if cached.WindowMinutes == 0 {
			cached.WindowMinutes = 1
		}
		return cached, nil
	}
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	metric, err := s.repo.GetLatestSystemMetric(ctxDB)
	if err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return &OpsMetrics{WindowMinutes: 1}, nil
		}
		return nil, err
	}
	if metric == nil {
		return &OpsMetrics{WindowMinutes: 1}, nil
	}
	if metric.WindowMinutes == 0 {
		metric.WindowMinutes = 1
	}
	// Backfill the cache (best-effort).
	_ = s.repo.SetCachedLatestSystemMetric(ctx, metric)
	return metric, nil
}
// ListMetricsHistory returns metric snapshots for charting, normalizing the
// window, limit and time range to sane defaults before querying. A zero
// start time defaults to `limit` collection intervals before the end time;
// an inverted range is swapped rather than rejected.
func (s *OpsService) ListMetricsHistory(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) {
	if s == nil || s.repo == nil {
		return nil, nil
	}
	if windowMinutes < 1 {
		windowMinutes = 1
	}
	if limit < 1 || limit > 5000 {
		limit = 300
	}
	if endTime.IsZero() {
		endTime = time.Now()
	}
	if startTime.IsZero() {
		startTime = endTime.Add(-time.Duration(limit) * opsMetricsInterval)
	}
	if endTime.Before(startTime) {
		startTime, endTime = endTime, startTime
	}
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.ListSystemMetricsRange(ctxDB, windowMinutes, startTime, endTime, limit)
}
// DashboardOverviewData represents aggregated metrics for the ops dashboard overview.
// It is assembled by GetDashboardOverview and cached briefly per time range.
type DashboardOverviewData struct {
	Timestamp time.Time `json:"timestamp"`
	// HealthScore is the composite 0-100 score from calculateHealthScore.
	HealthScore  int              `json:"health_score"`
	SLA          SLAData          `json:"sla"`
	QPS          QPSData          `json:"qps"`
	TPS          TPSData          `json:"tps"`
	Latency      LatencyData      `json:"latency"`
	Errors       ErrorData        `json:"errors"`
	Resources    ResourceData     `json:"resources"`
	SystemStatus SystemStatusData `json:"system_status"`
}
// SLAData reports the current success rate against the SLA threshold, with a
// 24h-ago comparison (Trend/Change24h).
type SLAData struct {
	Current   float64 `json:"current"`
	Threshold float64 `json:"threshold"`
	Status    string  `json:"status"`
	Trend     string  `json:"trend"`
	Change24h float64 `json:"change_24h"`
}

// QPSData reports realtime, peak and average requests-per-second.
type QPSData struct {
	Current           float64 `json:"current"`
	Peak1h            float64 `json:"peak_1h"`
	Avg1h             float64 `json:"avg_1h"`
	ChangeVsYesterday float64 `json:"change_vs_yesterday"`
}

// TPSData reports token throughput (see getTokenTPS).
type TPSData struct {
	Current float64 `json:"current"`
	Peak1h  float64 `json:"peak_1h"`
	Avg1h   float64 `json:"avg_1h"`
}

// LatencyData carries latency percentiles in milliseconds plus the p99
// threshold used for status classification.
type LatencyData struct {
	P50          int    `json:"p50"`
	P95          int    `json:"p95"`
	P99          int    `json:"p99"`
	P999         int    `json:"p999"`
	Avg          int    `json:"avg"`
	Max          int    `json:"max"`
	ThresholdP99 int    `json:"threshold_p99"`
	Status       string `json:"status"`
}

// ErrorData summarizes error volume by class plus the most frequent error.
type ErrorData struct {
	TotalCount   int64     `json:"total_count"`
	ErrorRate    float64   `json:"error_rate"`
	Count4xx     int64     `json:"4xx_count"`
	Count5xx     int64     `json:"5xx_count"`
	TimeoutCount int64     `json:"timeout_count"`
	TopError     *TopError `json:"top_error,omitempty"`
}

// TopError is the single most frequent error in the window.
type TopError struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	Count   int64  `json:"count"`
}

// ResourceData reports host and process resource usage.
type ResourceData struct {
	CPUUsage      float64           `json:"cpu_usage"`
	MemoryUsage   float64           `json:"memory_usage"`
	DiskUsage     float64           `json:"disk_usage"`
	Goroutines    int               `json:"goroutines"`
	DBConnections DBConnectionsData `json:"db_connections"`
}

// DBConnectionsData mirrors the sql.DB connection-pool counters.
type DBConnectionsData struct {
	Active  int `json:"active"`
	Idle    int `json:"idle"`
	Waiting int `json:"waiting"`
	Max     int `json:"max"`
}

// SystemStatusData reports dependency health as "healthy"/"critical" strings.
type SystemStatusData struct {
	Redis          string `json:"redis"`
	Database       string `json:"database"`
	BackgroundJobs string `json:"background_jobs"`
}
// OverviewStats is the raw aggregate row the repository computes for a time
// window; GetDashboardOverview shapes it into DashboardOverviewData.
type OverviewStats struct {
	RequestCount int64
	SuccessCount int64
	ErrorCount   int64
	// Error breakdown by class.
	Error4xxCount int64
	Error5xxCount int64
	TimeoutCount  int64
	// Latency percentiles/aggregates in milliseconds.
	LatencyP50  int
	LatencyP95  int
	LatencyP99  int
	LatencyP999 int
	LatencyAvg  int
	LatencyMax  int
	// Most frequent error in the window.
	TopErrorCode  string
	TopErrorMsg   string
	TopErrorCount int64
	// Resource usage sampled alongside the window.
	CPUUsage              float64
	MemoryUsage           float64
	MemoryUsedMB          int64
	MemoryTotalMB         int64
	ConcurrencyQueueDepth int
}
// GetDashboardOverview assembles the full dashboard overview for a time
// range ("1h" by default): SLA vs yesterday, QPS/TPS, latency percentiles,
// error breakdown, resource usage and dependency health, condensed into a
// single health score. Results are cached for 10 seconds per time range.
// Every secondary lookup (yesterday's stats, realtime QPS, peak QPS, TPS,
// disk usage) is best-effort: failures are logged and the overview is still
// returned with zero values for that section.
func (s *OpsService) GetDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) {
	if s == nil {
		return nil, errors.New("ops service not initialized")
	}
	repo := s.repo
	if repo == nil {
		return nil, errors.New("ops repository not initialized")
	}
	if s.sqlDB == nil {
		return nil, errors.New("ops service not initialized")
	}
	if strings.TrimSpace(timeRange) == "" {
		timeRange = "1h"
	}
	duration, err := parseTimeRange(timeRange)
	if err != nil {
		return nil, err
	}
	// Serve from the short-lived cache when possible (best-effort read).
	if cached, err := repo.GetCachedDashboardOverview(ctx, timeRange); err == nil && cached != nil {
		return cached, nil
	}
	now := time.Now().UTC()
	startTime := now.Add(-duration)
	// Primary aggregate for the requested window; this one is mandatory.
	ctxStats, cancelStats := context.WithTimeout(ctx, opsDBQueryTimeout)
	stats, err := repo.GetOverviewStats(ctxStats, startTime, now)
	cancelStats()
	if err != nil {
		return nil, fmt.Errorf("get overview stats: %w", err)
	}
	if stats == nil {
		return nil, errors.New("get overview stats returned nil")
	}
	// Same-length window ending 24h ago, for the SLA/QPS comparison.
	var statsYesterday *OverviewStats
	{
		yesterdayEnd := now.Add(-24 * time.Hour)
		yesterdayStart := yesterdayEnd.Add(-duration)
		ctxYesterday, cancelYesterday := context.WithTimeout(ctx, opsDBQueryTimeout)
		ys, err := repo.GetOverviewStats(ctxYesterday, yesterdayStart, yesterdayEnd)
		cancelYesterday()
		if err != nil {
			// Best-effort: overview should still work when historical comparison fails.
			log.Printf("[OpsOverview] get yesterday overview stats failed: %v", err)
		} else {
			statsYesterday = ys
		}
	}
	totalReqs := stats.SuccessCount + stats.ErrorCount
	successRate, errorRate := calculateRates(stats.SuccessCount, stats.ErrorCount, totalReqs)
	successRateYesterday := 0.0
	totalReqsYesterday := int64(0)
	if statsYesterday != nil {
		totalReqsYesterday = statsYesterday.SuccessCount + statsYesterday.ErrorCount
		successRateYesterday, _ = calculateRates(statsYesterday.SuccessCount, statsYesterday.ErrorCount, totalReqsYesterday)
	}
	slaThreshold := 99.9
	slaChange24h := roundTo2DP(successRate - successRateYesterday)
	slaTrend := classifyTrend(slaChange24h, 0.05)
	slaStatus := classifySLAStatus(successRate, slaThreshold)
	latencyThresholdP99 := 1000
	latencyStatus := classifyLatencyStatus(stats.LatencyP99, latencyThresholdP99)
	// Realtime QPS: requests in the last minute divided by 60 seconds.
	qpsCurrent := 0.0
	{
		ctxWindow, cancelWindow := context.WithTimeout(ctx, opsDBQueryTimeout)
		windowStats, err := repo.GetWindowStats(ctxWindow, now.Add(-1*time.Minute), now)
		cancelWindow()
		if err == nil && windowStats != nil {
			qpsCurrent = roundTo1DP(float64(windowStats.SuccessCount+windowStats.ErrorCount) / 60)
		} else if err != nil {
			log.Printf("[OpsOverview] get realtime qps failed: %v", err)
		}
	}
	qpsAvg := roundTo1DP(safeDivide(float64(totalReqs), duration.Seconds()))
	// Peak QPS: max per-minute request count across the window's 1-minute
	// metric snapshots; falls back to the average if no snapshot exceeds 0.
	qpsPeak := qpsAvg
	{
		limit := int(duration.Minutes()) + 5
		if limit < 10 {
			limit = 10
		}
		if limit > 5000 {
			limit = 5000
		}
		ctxMetrics, cancelMetrics := context.WithTimeout(ctx, opsDBQueryTimeout)
		items, err := repo.ListSystemMetricsRange(ctxMetrics, 1, startTime, now, limit)
		cancelMetrics()
		if err != nil {
			log.Printf("[OpsOverview] get metrics range for peak qps failed: %v", err)
		} else {
			maxQPS := 0.0
			for _, item := range items {
				v := float64(item.RequestCount) / 60
				if v > maxQPS {
					maxQPS = v
				}
			}
			if maxQPS > 0 {
				qpsPeak = roundTo1DP(maxQPS)
			}
		}
	}
	qpsAvgYesterday := 0.0
	if duration.Seconds() > 0 && totalReqsYesterday > 0 {
		qpsAvgYesterday = float64(totalReqsYesterday) / duration.Seconds()
	}
	qpsChangeVsYesterday := roundTo1DP(percentChange(qpsAvgYesterday, float64(totalReqs)/duration.Seconds()))
	tpsCurrent, tpsPeak, tpsAvg := 0.0, 0.0, 0.0
	if current, peak, avg, err := s.getTokenTPS(ctx, now, startTime, duration); err != nil {
		log.Printf("[OpsOverview] get token tps failed: %v", err)
	} else {
		tpsCurrent, tpsPeak, tpsAvg = roundTo1DP(current), roundTo1DP(peak), roundTo1DP(avg)
	}
	diskUsage := 0.0
	if v, err := getDiskUsagePercent(ctx, "/"); err != nil {
		log.Printf("[OpsOverview] get disk usage failed: %v", err)
	} else {
		diskUsage = roundTo1DP(v)
	}
	redisStatus := s.checkRedisHealth(ctx)
	dbStatus := s.checkDatabaseHealth(ctx)
	healthScore := calculateHealthScore(successRate, stats.LatencyP99, errorRate, redisStatus, dbStatus)
	data := &DashboardOverviewData{
		Timestamp:   now,
		HealthScore: healthScore,
		SLA: SLAData{
			Current:   successRate,
			Threshold: slaThreshold,
			Status:    slaStatus,
			Trend:     slaTrend,
			Change24h: slaChange24h,
		},
		QPS: QPSData{
			Current:           qpsCurrent,
			Peak1h:            qpsPeak,
			Avg1h:             qpsAvg,
			ChangeVsYesterday: qpsChangeVsYesterday,
		},
		TPS: TPSData{
			Current: tpsCurrent,
			Peak1h:  tpsPeak,
			Avg1h:   tpsAvg,
		},
		Latency: LatencyData{
			P50:          stats.LatencyP50,
			P95:          stats.LatencyP95,
			P99:          stats.LatencyP99,
			P999:         stats.LatencyP999,
			Avg:          stats.LatencyAvg,
			Max:          stats.LatencyMax,
			ThresholdP99: latencyThresholdP99,
			Status:       latencyStatus,
		},
		Errors: ErrorData{
			TotalCount:   stats.ErrorCount,
			ErrorRate:    errorRate,
			Count4xx:     stats.Error4xxCount,
			Count5xx:     stats.Error5xxCount,
			TimeoutCount: stats.TimeoutCount,
		},
		Resources: ResourceData{
			CPUUsage:      roundTo1DP(stats.CPUUsage),
			MemoryUsage:   roundTo1DP(stats.MemoryUsage),
			DiskUsage:     diskUsage,
			Goroutines:    runtime.NumGoroutine(),
			DBConnections: s.getDBConnections(),
		},
		SystemStatus: SystemStatusData{
			Redis:          redisStatus,
			Database:       dbStatus,
			BackgroundJobs: "healthy",
		},
	}
	if stats.TopErrorCount > 0 {
		data.Errors.TopError = &TopError{
			Code:    stats.TopErrorCode,
			Message: stats.TopErrorMsg,
			Count:   stats.TopErrorCount,
		}
	}
	// Cache write is best-effort; failures are ignored.
	_ = repo.SetCachedDashboardOverview(ctx, timeRange, data, 10*time.Second)
	return data, nil
}
// GetProviderHealth aggregates per-platform request stats over the given
// time range ("1h" when blank) and classifies each provider's health status.
func (s *OpsService) GetProviderHealth(ctx context.Context, timeRange string) ([]*ProviderHealthData, error) {
	if s == nil || s.repo == nil {
		return nil, nil
	}
	if strings.TrimSpace(timeRange) == "" {
		timeRange = "1h"
	}
	window, err := parseTimeRange(timeRange)
	if err != nil {
		return nil, err
	}
	end := time.Now()
	start := end.Add(-window)
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	stats, err := s.repo.GetProviderStats(ctxDB, start, end)
	cancel()
	if err != nil {
		return nil, err
	}
	out := make([]*ProviderHealthData, 0, len(stats))
	for _, ps := range stats {
		if ps == nil {
			continue
		}
		okRate, badRate := calculateRates(ps.SuccessCount, ps.ErrorCount, ps.RequestCount)
		health := &ProviderHealthData{
			Name:         formatPlatformName(ps.Platform),
			RequestCount: ps.RequestCount,
			SuccessRate:  okRate,
			ErrorRate:    badRate,
			LatencyAvg:   ps.AvgLatencyMs,
			LatencyP99:   ps.P99LatencyMs,
			Status:       classifyProviderStatus(okRate, ps.P99LatencyMs, ps.TimeoutCount, ps.RequestCount),
			ErrorsByType: ProviderHealthErrorsByType{
				HTTP4xx: ps.Error4xxCount,
				HTTP5xx: ps.Error5xxCount,
				Timeout: ps.TimeoutCount,
			},
		}
		out = append(out, health)
	}
	return out, nil
}
// GetLatencyHistogram returns latency bucket counts for the given time range.
func (s *OpsService) GetLatencyHistogram(ctx context.Context, timeRange string) ([]*LatencyHistogramItem, error) {
	if s == nil || s.repo == nil {
		return nil, nil
	}
	window, err := parseTimeRange(timeRange)
	if err != nil {
		return nil, err
	}
	end := time.Now()
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.GetLatencyHistogram(ctxDB, end.Add(-window), end)
}
// GetErrorDistribution returns the error-by-code breakdown for the range.
func (s *OpsService) GetErrorDistribution(ctx context.Context, timeRange string) ([]*ErrorDistributionItem, error) {
	if s == nil || s.repo == nil {
		return nil, nil
	}
	window, err := parseTimeRange(timeRange)
	if err != nil {
		return nil, err
	}
	end := time.Now()
	ctxDB, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
	defer cancel()
	return s.repo.GetErrorDistribution(ctxDB, end.Add(-window), end)
}
// parseTimeRange converts a human time-range string ("30m", "1h", "7d") into
// a time.Duration. Day suffixes are handled explicitly because
// time.ParseDuration does not accept "d". Results are capped at 30 days to
// avoid unbounded queries; empty, malformed or non-positive input is an error.
//
// Fixes: the 30-day cap previously applied only to the ParseDuration path,
// so "365d" bypassed the limit that "720h" was subject to; the hand-rolled
// digit loop is replaced with strconv.Atoi.
func parseTimeRange(timeRange string) (time.Duration, error) {
	const maxWindow = 30 * 24 * time.Hour
	value := strings.TrimSpace(timeRange)
	if value == "" {
		return 0, errors.New("invalid time range")
	}
	var dur time.Duration
	if strings.HasSuffix(value, "d") {
		// Support "7d" style day ranges for convenience. Atoi rejects the
		// empty string and non-numeric input; negatives fail the > 0 check.
		days, err := strconv.Atoi(strings.TrimSuffix(value, "d"))
		if err != nil || days <= 0 {
			return 0, errors.New("invalid time range")
		}
		dur = time.Duration(days) * 24 * time.Hour
	} else {
		parsed, err := time.ParseDuration(value)
		if err != nil || parsed <= 0 {
			return 0, errors.New("invalid time range")
		}
		dur = parsed
	}
	// Cap both paths to avoid unbounded queries.
	if dur > maxWindow {
		dur = maxWindow
	}
	return dur, nil
}
// calculateHealthScore condenses SLA, latency, error rate and infrastructure
// health into a single 0-100 score by subtracting capped penalties from 100.
func calculateHealthScore(successRate float64, p99Latency int, errorRate float64, redisStatus, dbStatus string) int {
	score := 100.0
	// SLA shortfall: 12 points per 0.1% below 99.9%, capped at 45.
	if successRate < 99.9 {
		score -= math.Min(45, (99.9-successRate)*12)
	}
	// Latency: 1 point per 80ms of p99 above 1s, capped at 35.
	if p99Latency > 1000 {
		score -= math.Min(35, float64(p99Latency-1000)/80)
	}
	// Error rate above 0.1%: 60 points per percent over, capped at 20.
	if errorRate > 0.1 {
		score -= math.Min(20, (errorRate-0.1)*60)
	}
	// Unhealthy infrastructure penalties.
	if redisStatus != "healthy" {
		score -= 15
	}
	if dbStatus != "healthy" {
		score -= 20
	}
	// Clamp to [0, 100] before rounding to an integer.
	score = math.Max(0, math.Min(100, score))
	return int(math.Round(score))
}
// calculateRates converts success/error counts into percentage rates rounded
// to two decimal places; zero or negative request totals yield (0, 0).
func calculateRates(successCount, errorCount, requestCount int64) (successRate float64, errorRate float64) {
	if requestCount <= 0 {
		return 0, 0
	}
	denom := float64(requestCount)
	successRate = roundTo2DP(float64(successCount) / denom * 100)
	errorRate = roundTo2DP(float64(errorCount) / denom * 100)
	return successRate, errorRate
}
// roundTo2DP rounds v to two decimal places (half away from zero).
func roundTo2DP(v float64) float64 {
	const scale = 100
	return math.Round(v*scale) / scale
}
// roundTo1DP rounds v to one decimal place (half away from zero).
func roundTo1DP(v float64) float64 {
	const scale = 10
	return math.Round(v*scale) / scale
}
// safeDivide returns numerator/denominator, treating a zero or negative
// denominator as an empty sample and returning 0 instead of Inf/NaN.
func safeDivide(numerator float64, denominator float64) float64 {
	if denominator > 0 {
		return numerator / denominator
	}
	return 0
}
// percentChange computes the relative change from previous to current as a
// percentage. A zero baseline maps to +100% when current is positive and 0
// otherwise, sidestepping division by zero.
func percentChange(previous float64, current float64) float64 {
	if previous != 0 {
		return (current - previous) / previous * 100
	}
	if current > 0 {
		return 100.0
	}
	return 0
}
// classifyTrend buckets a delta into "up", "down" or "stable" using a
// symmetric deadband around zero to suppress noise.
func classifyTrend(delta float64, deadband float64) string {
	switch {
	case delta > deadband:
		return "up"
	case delta < -deadband:
		return "down"
	default:
		return "stable"
	}
}
// classifySLAStatus maps a success rate against an SLA threshold:
// at/above threshold is "healthy", within 0.5 points below is "warning",
// anything further below is "critical".
func classifySLAStatus(successRate float64, threshold float64) string {
	switch {
	case successRate >= threshold:
		return "healthy"
	case successRate >= threshold-0.5:
		return "warning"
	default:
		return "critical"
	}
}
// classifyLatencyStatus grades a P99 latency sample against a threshold:
// "healthy" at or under the threshold (or when no threshold is configured),
// "warning" up to 2x the threshold, and "critical" beyond that.
func classifyLatencyStatus(p99LatencyMs int, thresholdP99 int) string {
	if thresholdP99 <= 0 || p99LatencyMs <= thresholdP99 {
		return "healthy"
	}
	if p99LatencyMs <= thresholdP99*2 {
		return "warning"
	}
	return "critical"
}
// getDiskUsagePercent returns the used-space percentage of the filesystem
// containing path, via disk.UsageWithContext (gopsutil-style API — TODO
// confirm the exact package). Errors from the probe are propagated.
func getDiskUsagePercent(ctx context.Context, path string) (float64, error) {
	usage, err := disk.UsageWithContext(ctx, path)
	if err != nil {
		return 0, err
	}
	// Defensive: treat a nil stats object without error as 0% rather than
	// dereferencing it.
	if usage == nil {
		return 0, nil
	}
	return usage.UsedPercent, nil
}
// checkRedisHealth pings Redis through the ops repository and maps the
// outcome to a status string: "healthy" on success, "critical" when the
// service/repository is unavailable or the ping fails. The ping is bounded
// by an 800ms timeout so a slow Redis cannot stall the overview endpoint.
func (s *OpsService) checkRedisHealth(ctx context.Context) string {
	if s == nil {
		log.Printf("[OpsOverview][WARN] ops service is nil; redis health check skipped")
		return "critical"
	}
	if s.repo == nil {
		// Warn only once so repeated scrapes do not flood the log.
		s.redisNilWarnOnce.Do(func() {
			log.Printf("[OpsOverview][WARN] ops repository is nil; redis health check skipped")
		})
		return "critical"
	}
	pingCtx, cancel := context.WithTimeout(ctx, 800*time.Millisecond)
	defer cancel()
	if err := s.repo.PingRedis(pingCtx); err != nil {
		log.Printf("[OpsOverview][WARN] redis ping failed: %v", err)
		return "critical"
	}
	return "healthy"
}
// checkDatabaseHealth verifies database connectivity with a bounded ping.
// It returns "healthy" on success and "critical" when the service or its
// database handle is missing, or the ping does not succeed within 800ms.
func (s *OpsService) checkDatabaseHealth(ctx context.Context) string {
	if s == nil {
		log.Printf("[OpsOverview][WARN] ops service is nil; db health check skipped")
		return "critical"
	}
	if s.sqlDB == nil {
		// Warn only once so repeated scrapes do not flood the log.
		s.dbNilWarnOnce.Do(func() {
			log.Printf("[OpsOverview][WARN] database is nil; db health check skipped")
		})
		return "critical"
	}
	pingCtx, cancel := context.WithTimeout(ctx, 800*time.Millisecond)
	defer cancel()
	if err := s.sqlDB.PingContext(pingCtx); err != nil {
		log.Printf("[OpsOverview][WARN] db ping failed: %v", err)
		return "critical"
	}
	return "healthy"
}
// getDBConnections snapshots connection-pool statistics from the database
// handle. Waiting is not tracked by this probe and is always reported as 0.
// A nil service or handle yields the zero value so callers render an empty
// panel instead of panicking.
func (s *OpsService) getDBConnections() DBConnectionsData {
	if s == nil || s.sqlDB == nil {
		return DBConnectionsData{}
	}
	stats := s.sqlDB.Stats()
	data := DBConnectionsData{
		Active:  stats.InUse,
		Idle:    stats.Idle,
		Waiting: 0,
		Max:     stats.MaxOpenConnections,
	}
	// MaxOpenConnections can be unlimited/unset; never report a negative max.
	if data.Max < 0 {
		data.Max = 0
	}
	return data
}
// getTokenTPS computes token throughput (tokens per second) from usage_logs
// (input_tokens + output_tokens):
//   - current: tokens in the final minute before endTime, divided by 60
//   - peak:    the busiest one-minute bucket in [startTime, endTime), divided by 60
//   - avg:     total tokens over the window divided by its length in seconds
//
// A nil service/DB handle or non-positive duration yields zeros without an
// error so callers can render an empty panel. Each query is bounded by
// opsDBQueryTimeout.
func (s *OpsService) getTokenTPS(ctx context.Context, endTime time.Time, startTime time.Time, duration time.Duration) (current float64, peak float64, avg float64, err error) {
	if s == nil || s.sqlDB == nil {
		return 0, 0, 0, nil
	}
	if duration <= 0 {
		return 0, 0, 0, nil
	}
	// Current TPS: last 1 minute.
	var tokensLastMinute int64
	{
		lastMinuteStart := endTime.Add(-1 * time.Minute)
		ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
		row := s.sqlDB.QueryRowContext(ctxQuery, `
			SELECT COALESCE(SUM(input_tokens + output_tokens), 0)
			FROM usage_logs
			WHERE created_at >= $1 AND created_at < $2
		`, lastMinuteStart, endTime)
		// cancel is called explicitly (not deferred) so the second query
		// gets its own full timeout budget.
		scanErr := row.Scan(&tokensLastMinute)
		cancel()
		if scanErr != nil {
			return 0, 0, 0, scanErr
		}
	}
	// Peak and average: bucket the whole window into minutes in SQL, then
	// take the max bucket (peak) and the overall sum (average input).
	var totalTokens int64
	var maxTokensPerMinute int64
	{
		ctxQuery, cancel := context.WithTimeout(ctx, opsDBQueryTimeout)
		row := s.sqlDB.QueryRowContext(ctxQuery, `
			WITH buckets AS (
				SELECT
					date_trunc('minute', created_at) AS bucket,
					SUM(input_tokens + output_tokens) AS tokens
				FROM usage_logs
				WHERE created_at >= $1 AND created_at < $2
				GROUP BY 1
			)
			SELECT
				COALESCE(SUM(tokens), 0) AS total_tokens,
				COALESCE(MAX(tokens), 0) AS max_tokens_per_minute
			FROM buckets
		`, startTime, endTime)
		scanErr := row.Scan(&totalTokens, &maxTokensPerMinute)
		cancel()
		if scanErr != nil {
			return 0, 0, 0, scanErr
		}
	}
	// Per-minute counts divided by 60 give per-second rates; safeDivide
	// guards the degenerate cases.
	current = safeDivide(float64(tokensLastMinute), 60)
	peak = safeDivide(float64(maxTokensPerMinute), 60)
	avg = safeDivide(float64(totalTokens), duration.Seconds())
	return current, peak, avg, nil
}
// formatPlatformName returns a human-readable display name for a platform
// identifier. Known platforms (matched case-insensitively) map to their
// canonical branding; anything else gets its first byte upper-cased, with
// the empty string rendered as "Unknown". Note: the fallback assumes an
// ASCII first character.
func formatPlatformName(platform string) string {
	normalized := strings.ToLower(strings.TrimSpace(platform))
	switch normalized {
	case PlatformOpenAI:
		return "OpenAI"
	case PlatformAnthropic:
		return "Anthropic"
	case PlatformGemini:
		return "Gemini"
	case PlatformAntigravity:
		return "Antigravity"
	}
	switch len(platform) {
	case 0:
		return "Unknown"
	case 1:
		return strings.ToUpper(platform)
	default:
		return strings.ToUpper(platform[:1]) + platform[1:]
	}
}
// classifyProviderStatus maps per-provider request metrics onto a status
// string. Providers with no traffic read as healthy; a low success rate
// takes precedence, then heavy timeout volume or very high P99 latency
// escalates an otherwise-healthy provider to "warning".
func classifyProviderStatus(successRate float64, p99LatencyMs int, timeoutCount int64, requestCount int64) string {
	switch {
	case requestCount <= 0:
		return "healthy"
	case successRate < 98:
		return "critical"
	case successRate < 99.5:
		return "warning"
	// Heavy timeout volume should be highlighted even if the overall success rate is okay.
	case timeoutCount >= 10 && requestCount >= 100:
		return "warning"
	case p99LatencyMs > 0 && p99LatencyMs >= 5000:
		return "warning"
	default:
		return "healthy"
	}
}
-- Ops error logs and system metrics
-- ops_error_logs records one row per failed request. Columns capture the
-- requesting principal (user/api key/account/group, client IP), where the
-- failure happened (error_phase/error_type/status_code), the provider
-- context (platform/model/request_path/stream), the raw provider error
-- details, and triage metadata (severity, retryability, retry_count,
-- completion_status, duration).
CREATE TABLE IF NOT EXISTS ops_error_logs (
    id BIGSERIAL PRIMARY KEY,
    request_id VARCHAR(64),
    user_id BIGINT,
    api_key_id BIGINT,
    account_id BIGINT,
    group_id BIGINT,
    client_ip INET,
    error_phase VARCHAR(32) NOT NULL,
    error_type VARCHAR(64) NOT NULL,
    severity VARCHAR(4) NOT NULL,
    status_code INT,
    platform VARCHAR(32),
    model VARCHAR(100),
    request_path VARCHAR(256),
    stream BOOLEAN NOT NULL DEFAULT FALSE,
    error_message TEXT,
    error_body TEXT,
    provider_error_code VARCHAR(64),
    provider_error_type VARCHAR(64),
    is_retryable BOOLEAN NOT NULL DEFAULT FALSE,
    is_user_actionable BOOLEAN NOT NULL DEFAULT FALSE,
    retry_count INT NOT NULL DEFAULT 0,
    completion_status VARCHAR(16),
    duration_ms INT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Recency listing plus single-column filters by phase/platform/severity;
-- the composite index serves phase+platform drill-downs ordered by time.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_created_at ON ops_error_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase ON ops_error_logs (error_phase);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_platform ON ops_error_logs (platform);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_severity ON ops_error_logs (severity);
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_phase_platform_time ON ops_error_logs (error_phase, platform, created_at DESC);
-- ops_system_metrics stores periodic snapshots of aggregate service health
-- (success/error rates, latency percentiles, HTTP/2 error and active alert
-- counts) that back the ops dashboard trend charts.
CREATE TABLE IF NOT EXISTS ops_system_metrics (
    id BIGSERIAL PRIMARY KEY,
    success_rate DOUBLE PRECISION,
    error_rate DOUBLE PRECISION,
    p95_latency_ms INT,
    p99_latency_ms INT,
    http2_errors INT,
    active_alerts INT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Charts and "latest snapshot" queries read the newest rows first.
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_created_at ON ops_system_metrics (created_at DESC);
-- Extend ops_system_metrics with windowed/system stats
-- window_minutes distinguishes aggregation windows (1-minute vs longer
-- snapshots); the remaining columns add host/runtime stats (CPU, memory,
-- heap, GC pause) and the concurrency queue depth. IF NOT EXISTS keeps the
-- migration idempotent.
ALTER TABLE ops_system_metrics
    ADD COLUMN IF NOT EXISTS window_minutes INT NOT NULL DEFAULT 1,
    ADD COLUMN IF NOT EXISTS cpu_usage_percent DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS memory_used_mb BIGINT,
    ADD COLUMN IF NOT EXISTS memory_total_mb BIGINT,
    ADD COLUMN IF NOT EXISTS memory_usage_percent DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS heap_alloc_mb BIGINT,
    ADD COLUMN IF NOT EXISTS gc_pause_ms DOUBLE PRECISION,
    ADD COLUMN IF NOT EXISTS concurrency_queue_depth INT;
-- History queries select one window size, then scan by recency.
CREATE INDEX IF NOT EXISTS idx_ops_system_metrics_window_time
    ON ops_system_metrics (window_minutes, created_at DESC);
-- Ops alert rules and events
-- ops_alert_rules defines threshold alerts: a metric (metric_type) compared
-- against threshold with operator, evaluated over window_minutes and only
-- fired after sustained_minutes consecutive breaches. Notification flags
-- select email/webhook delivery; cooldown_minutes throttles re-firing.
CREATE TABLE IF NOT EXISTS ops_alert_rules (
    id BIGSERIAL PRIMARY KEY,
    name VARCHAR(128) NOT NULL,
    description TEXT,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    metric_type VARCHAR(64) NOT NULL,
    operator VARCHAR(8) NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,
    window_minutes INT NOT NULL DEFAULT 1,
    sustained_minutes INT NOT NULL DEFAULT 1,
    severity VARCHAR(4) NOT NULL DEFAULT 'P1',
    notify_email BOOLEAN NOT NULL DEFAULT FALSE,
    notify_webhook BOOLEAN NOT NULL DEFAULT FALSE,
    webhook_url TEXT,
    cooldown_minutes INT NOT NULL DEFAULT 10,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Evaluators scan enabled rules, then look up by metric and window size.
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_enabled ON ops_alert_rules (enabled);
CREATE INDEX IF NOT EXISTS idx_ops_alert_rules_metric ON ops_alert_rules (metric_type, window_minutes);
-- ops_alert_events records each firing of a rule, including the observed
-- metric_value vs threshold_value, its lifecycle (status/fired_at/
-- resolved_at) and whether notifications were delivered. Events are removed
-- with their rule via ON DELETE CASCADE.
CREATE TABLE IF NOT EXISTS ops_alert_events (
    id BIGSERIAL PRIMARY KEY,
    rule_id BIGINT NOT NULL REFERENCES ops_alert_rules(id) ON DELETE CASCADE,
    severity VARCHAR(4) NOT NULL,
    status VARCHAR(16) NOT NULL DEFAULT 'firing',
    title VARCHAR(200),
    description TEXT,
    metric_value DOUBLE PRECISION,
    threshold_value DOUBLE PRECISION,
    fired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    resolved_at TIMESTAMPTZ,
    email_sent BOOLEAN NOT NULL DEFAULT FALSE,
    webhook_sent BOOLEAN NOT NULL DEFAULT FALSE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Serves "open events for this rule" lookups and the recent-events feed.
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_rule_status ON ops_alert_events (rule_id, status);
CREATE INDEX IF NOT EXISTS idx_ops_alert_events_fired_at ON ops_alert_events (fired_at DESC);
-- Seed default ops alert rules (idempotent)
-- "Global success rate < 99%": P1, 1-minute window, 2 sustained minutes,
-- email on, 10-minute cooldown. NOTE: the guard checks that the table is
-- completely empty (not that this specific rule exists), so it only seeds
-- on a fresh install.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'Global success rate < 99%',
    'Trigger when the 1-minute success rate drops below 99% for 2 consecutive minutes.',
    TRUE,
    'success_rate',
    '<',
    99,
    1,
    2,
    'P1',
    TRUE,
    FALSE,
    NULL,
    10
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules);
-- Seed additional ops alert rules (idempotent)
-- "Global error rate > 1%": P1, fires after 2 consecutive 1-minute windows
-- above 1%. Webhook notification is enabled only if some existing rule
-- already carries a webhook URL, which is then reused; guarded by rule name
-- so re-running the migration is a no-op.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'Global error rate > 1%',
    'Trigger when the 1-minute error rate exceeds 1% for 2 consecutive minutes.',
    TRUE,
    'error_rate',
    '>',
    1,
    1,
    2,
    'P1',
    TRUE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    10
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Global error rate > 1%');
-- "P99 latency > 2000ms": P1, 5-minute window, 2 sustained samples,
-- 15-minute cooldown. Webhook settings are inherited from any existing rule
-- with a webhook URL; guarded by rule name for idempotency.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'P99 latency > 2000ms',
    'Trigger when the 5-minute P99 latency exceeds 2000ms for 2 consecutive samples.',
    TRUE,
    'p99_latency_ms',
    '>',
    2000,
    5,
    2,
    'P1',
    TRUE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    15
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'P99 latency > 2000ms');
-- "HTTP/2 errors > 20": P2, 1-minute window, 2 sustained minutes, email off.
-- Webhook settings are inherited from any existing rule with a webhook URL;
-- guarded by rule name for idempotency.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'HTTP/2 errors > 20',
    'Trigger when HTTP/2 errors exceed 20 in the last minute for 2 consecutive minutes.',
    TRUE,
    'http2_errors',
    '>',
    20,
    1,
    2,
    'P2',
    FALSE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    10
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'HTTP/2 errors > 20');
-- "CPU usage > 85%": P2, evaluated per 1-minute window but requiring 5
-- sustained minutes, 15-minute cooldown. Webhook settings are inherited
-- from any existing rule with a webhook URL; guarded by rule name.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'CPU usage > 85%',
    'Trigger when CPU usage exceeds 85% for 5 consecutive minutes.',
    TRUE,
    'cpu_usage_percent',
    '>',
    85,
    1,
    5,
    'P2',
    FALSE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    15
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'CPU usage > 85%');
-- "Memory usage > 90%": P2, 1-minute window with 5 sustained minutes,
-- 15-minute cooldown. Webhook settings are inherited from any existing rule
-- with a webhook URL; guarded by rule name for idempotency.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'Memory usage > 90%',
    'Trigger when memory usage exceeds 90% for 5 consecutive minutes.',
    TRUE,
    'memory_usage_percent',
    '>',
    90,
    1,
    5,
    'P2',
    FALSE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    15
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Memory usage > 90%');
-- "Queue depth > 50": P2, 1-minute window with 2 sustained minutes,
-- 10-minute cooldown. Webhook settings are inherited from any existing rule
-- with a webhook URL; guarded by rule name for idempotency.
INSERT INTO ops_alert_rules (
    name,
    description,
    enabled,
    metric_type,
    operator,
    threshold,
    window_minutes,
    sustained_minutes,
    severity,
    notify_email,
    notify_webhook,
    webhook_url,
    cooldown_minutes
)
SELECT
    'Queue depth > 50',
    'Trigger when concurrency queue depth exceeds 50 for 2 consecutive minutes.',
    TRUE,
    'concurrency_queue_depth',
    '>',
    50,
    1,
    2,
    'P2',
    FALSE,
    CASE
        WHEN (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1) IS NULL THEN FALSE
        ELSE TRUE
    END,
    (SELECT webhook_url FROM ops_alert_rules WHERE webhook_url IS NOT NULL AND webhook_url <> '' LIMIT 1),
    10
WHERE NOT EXISTS (SELECT 1 FROM ops_alert_rules WHERE name = 'Queue depth > 50');
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment