"frontend/vscode:/vscode.git/clone" did not exist on "260c152166717747302797cc09c6c76be73de31a"
Unverified Commit fd8473f2 authored by 程序猿MT, committed by GitHub

Merge branch 'Wei-Shaw:main' into main

parents 18b8bd43 cc4910dd
package service
import (
"context"
"database/sql"
"errors"
"log"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
// Resolve query mode (requested via query param, or DB default).
filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
if err != nil {
if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
}
return nil, err
}
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
// Attach config-derived limits so the UI can show "current / max" for connection pools.
// These are best-effort and should never block the dashboard rendering.
if s != nil && s.cfg != nil {
if s.cfg.Database.MaxOpenConns > 0 {
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
}
if s.cfg.Redis.PoolSize > 0 {
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
}
}
overview.SystemMetrics = metrics
} else if !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
}
if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
overview.JobHeartbeats = heartbeats
} else {
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
}
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
return overview, nil
}
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
if requested.IsValid() {
// Allow "auto" to be disabled via config until preagg is proven stable in production.
// Forced `preagg` via query param still works.
if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return requested
}
mode := OpsQueryModeAuto
if s != nil && s.settingRepo != nil {
if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
mode = ParseOpsQueryMode(raw)
}
}
if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return mode
}
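// Resolution examples (hypothetical config): a requested "preagg" is honored as-is;
// "auto" with cfg.Ops.UsePreaggregatedTables=false downgrades to "raw"; an invalid or
// empty mode falls back to the DB default (or "auto"), subject to the same downgrade.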
package service
import "time"
type OpsDashboardFilter struct {
StartTime time.Time
EndTime time.Time
Platform string
GroupID *int64
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
// Expected values: auto/raw/preagg (see OpsQueryMode).
QueryMode OpsQueryMode
}
type OpsRateSummary struct {
Current float64 `json:"current"`
Peak float64 `json:"peak"`
Avg float64 `json:"avg"`
}
type OpsPercentiles struct {
P50 *int `json:"p50_ms"`
P90 *int `json:"p90_ms"`
P95 *int `json:"p95_ms"`
P99 *int `json:"p99_ms"`
Avg *int `json:"avg_ms"`
Max *int `json:"max_ms"`
}
type OpsDashboardOverview struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
// HealthScore is a backend-computed overall health score (0-100).
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
HealthScore int `json:"health_score"`
// Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
// Background jobs health (heartbeats).
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
RequestCountTotal int64 `json:"request_count_total"`
RequestCountSLA int64 `json:"request_count_sla"`
TokenConsumed int64 `json:"token_consumed"`
SLA float64 `json:"sla"`
ErrorRate float64 `json:"error_rate"`
UpstreamErrorRate float64 `json:"upstream_error_rate"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
QPS OpsRateSummary `json:"qps"`
TPS OpsRateSummary `json:"tps"`
Duration OpsPercentiles `json:"duration"`
TTFT OpsPercentiles `json:"ttft"`
}
type OpsLatencyHistogramBucket struct {
Range string `json:"range"`
Count int64 `json:"count"`
}
// OpsLatencyHistogramResponse is a coarse latency-distribution histogram (successful requests only).
// The Ops dashboard uses it to quickly identify tail-latency regressions.
type OpsLatencyHistogramResponse struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
TotalRequests int64 `json:"total_requests"`
Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
}
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
}
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorDistribution(ctx, filter)
}
package service
import (
"math"
"time"
)
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%).
// - Avoids double-counting (e.g., a DB failure affects both infra and business metrics).
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
if overview == nil {
return 0
}
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
// UI can still render a gray/idle state based on QPS + error rate.
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
return 100
}
businessHealth := computeBusinessHealth(overview)
infraHealth := computeInfraHealth(now, overview)
// Weighted combination: 70% business + 30% infrastructure
score := businessHealth*0.7 + infraHealth*0.3
return int(math.Round(clampFloat64(score, 0, 100)))
}
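// Worked example (hypothetical sub-scores): businessHealth=80, infraHealth=50
// → score = 80*0.7 + 50*0.3 = 56 + 15 = 71.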
// computeBusinessHealth calculates business health score (0-100)
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
// SLA score: 99.5% → 100, 95% → 0 (linear)
slaScore := 100.0
slaPct := clampFloat64(overview.SLA*100, 0, 100)
if slaPct < 99.5 {
if slaPct >= 95 {
slaScore = (slaPct - 95) / 4.5 * 100
} else {
slaScore = 0
}
}
// Error rate score: 0.5% → 100, 5% → 0 (linear)
// Uses the worse of the request error rate and the upstream error rate
errorScore := 100.0
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
if combinedErrorPct > 0.5 {
if combinedErrorPct <= 5 {
errorScore = (5 - combinedErrorPct) / 4.5 * 100
} else {
errorScore = 0
}
}
// Latency score: 1s → 100, 10s → 0 (linear)
// Uses P99 of duration (TTFT is less critical for overall health)
latencyScore := 100.0
if overview.Duration.P99 != nil {
p99 := float64(*overview.Duration.P99)
if p99 > 1000 {
if p99 <= 10000 {
latencyScore = (10000 - p99) / 9000 * 100
} else {
latencyScore = 0
}
}
}
// Weighted combination
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
}
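// Worked example (hypothetical metrics): SLA=97% → slaScore=(97-95)/4.5*100 ≈ 44.4;
// combined error rate 1% → errorScore=(5-1)/4.5*100 ≈ 88.9; P99=2500ms →
// latencyScore=(10000-2500)/9000*100 ≈ 83.3; total ≈ 44.4*0.5 + 88.9*0.3 + 83.3*0.2 ≈ 65.6.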
// computeInfraHealth calculates infrastructure health score (0-100)
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
// Storage score: DB critical, Redis less critical
storageScore := 100.0
if overview.SystemMetrics != nil {
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
storageScore = 0 // DB failure is critical
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
storageScore = 50 // Redis failure is degraded but not critical
}
}
// Compute resources score: CPU + Memory
computeScore := 100.0
if overview.SystemMetrics != nil {
cpuScore := 100.0
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 80 {
if cpuPct <= 100 {
cpuScore = (100 - cpuPct) / 20 * 100
} else {
cpuScore = 0
}
}
}
memScore := 100.0
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 85 {
if memPct <= 100 {
memScore = (100 - memPct) / 15 * 100
} else {
memScore = 0
}
}
}
computeScore = (cpuScore + memScore) / 2
}
// Background jobs score
jobScore := 100.0
failedJobs := 0
totalJobs := 0
for _, hb := range overview.JobHeartbeats {
if hb == nil {
continue
}
totalJobs++
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
failedJobs++
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
failedJobs++
}
}
if totalJobs > 0 && failedJobs > 0 {
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
}
// Weighted combination
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
}
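// Worked example (hypothetical state): Redis down → storageScore=50; CPU 90% / mem 40% →
// computeScore=(50+100)/2=75; 1 of 4 jobs failing → jobScore=75;
// total = 50*0.4 + 75*0.3 + 75*0.3 = 65.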
func clampFloat64(v float64, min float64, max float64) float64 {
if v < min {
return min
}
if v > max {
return max
}
return v
}
//go:build unit
package service
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
require.Equal(t, 100, score)
}
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
t.Parallel()
ov := &OpsDashboardOverview{
RequestCountTotal: 100,
RequestCountSLA: 100,
SuccessCount: 90,
ErrorCountTotal: 10,
ErrorCountSLA: 10,
SLA: 0.90,
ErrorRate: 0.10,
UpstreamErrorRate: 0.08,
Duration: OpsPercentiles{P99: intPtr(20_000)},
TTFT: OpsPercentiles{P99: intPtr(2_000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(98.0),
MemoryUsagePercent: float64Ptr(97.0),
DBConnWaiting: intPtr(3),
ConcurrencyQueueDepth: intPtr(10),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "job-a",
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
LastError: stringPtr("boom"),
},
},
}
score := computeDashboardHealthScore(time.Now().UTC(), ov)
require.Less(t, score, 80)
require.GreaterOrEqual(t, score, 0)
}
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin int
wantMax int
}{
{
name: "nil overview returns 0",
overview: nil,
wantMin: 0,
wantMax: 0,
},
{
name: "perfect health",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
TTFT: OpsPercentiles{P99: intPtr(100)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "good health - SLA 99.8%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.003,
UpstreamErrorRate: 0.001,
Duration: OpsPercentiles{P99: intPtr(800)},
TTFT: OpsPercentiles{P99: intPtr(200)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(50),
MemoryUsagePercent: float64Ptr(60),
},
},
wantMin: 95,
wantMax: 100,
},
{
name: "medium health - SLA 96%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.96,
ErrorRate: 0.02,
UpstreamErrorRate: 0.01,
Duration: OpsPercentiles{P99: intPtr(3000)},
TTFT: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(70),
MemoryUsagePercent: float64Ptr(75),
},
},
wantMin: 60,
wantMax: 85,
},
{
name: "DB failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 70,
wantMax: 90,
},
{
name: "Redis failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "high CPU usage",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 100,
},
{
name: "combined failures - business degraded + infra healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.90,
ErrorRate: 0.05,
UpstreamErrorRate: 0.02,
Duration: OpsPercentiles{P99: intPtr(10000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(20),
MemoryUsagePercent: float64Ptr(30),
},
},
wantMin: 25,
wantMax: 50,
},
{
name: "combined failures - business healthy + infra degraded",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.001,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(95),
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
require.LessOrEqual(t, score, 100, "score must be <= 100")
})
}
}
func TestComputeBusinessHealth(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "perfect metrics",
overview: &OpsDashboardOverview{
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 99.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 95%",
overview: &OpsDashboardOverview{
SLA: 0.95,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 50,
wantMax: 60,
},
{
name: "error rate boundary 0.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.005,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "latency boundary 1000ms",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(1000)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "upstream error dominates",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.001,
UpstreamErrorRate: 0.03,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 75,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeBusinessHealth(tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func TestComputeInfraHealth(t *testing.T) {
t.Parallel()
now := time.Now().UTC()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "all infrastructure healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "DB down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 50,
wantMax: 70,
},
{
name: "Redis down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 80,
wantMax: 95,
},
{
name: "CPU at 90%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(90),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "failed background job",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "test-job",
LastErrorAt: &now,
},
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeInfraHealth(now, tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func timePtr(v time.Time) *time.Time { return &v }
func stringPtr(v string) *string { return &v }
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetLatencyHistogram(ctx, filter)
}
package service
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"math"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/mem"
)
const (
opsMetricsCollectorJobName = "ops_metrics_collector"
opsMetricsCollectorMinInterval = 60 * time.Second
opsMetricsCollectorMaxInterval = 1 * time.Hour
opsMetricsCollectorTimeout = 10 * time.Second
opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
opsMetricsCollectorLeaderLockTTL = 90 * time.Second
opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
bytesPerMB = 1024 * 1024
)
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
type OpsMetricsCollector struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
accountRepo AccountRepository
concurrencyService *ConcurrencyService
db *sql.DB
redisClient *redis.Client
instanceID string
lastCgroupCPUUsageNanos uint64
lastCgroupCPUSampleAt time.Time
stopCh chan struct{}
startOnce sync.Once
stopOnce sync.Once
skipLogMu sync.Mutex
skipLogAt time.Time
}
func NewOpsMetricsCollector(
opsRepo OpsRepository,
settingRepo SettingRepository,
accountRepo AccountRepository,
concurrencyService *ConcurrencyService,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsMetricsCollector {
return &OpsMetricsCollector{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
accountRepo: accountRepo,
concurrencyService: concurrencyService,
db: db,
redisClient: redisClient,
instanceID: uuid.NewString(),
}
}
func (c *OpsMetricsCollector) Start() {
if c == nil {
return
}
c.startOnce.Do(func() {
if c.stopCh == nil {
c.stopCh = make(chan struct{})
}
go c.run()
})
}
func (c *OpsMetricsCollector) Stop() {
if c == nil {
return
}
c.stopOnce.Do(func() {
if c.stopCh != nil {
close(c.stopCh)
}
})
}
func (c *OpsMetricsCollector) run() {
// Collect once immediately so the dashboard has data soon after startup.
c.collectOnce()
for {
interval := c.getInterval()
timer := time.NewTimer(interval)
select {
case <-timer.C:
c.collectOnce()
case <-c.stopCh:
timer.Stop()
return
}
}
}
func (c *OpsMetricsCollector) getInterval() time.Duration {
interval := opsMetricsCollectorMinInterval
if c.settingRepo == nil {
return interval
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
if err != nil {
return interval
}
raw = strings.TrimSpace(raw)
if raw == "" {
return interval
}
seconds, err := strconv.Atoi(raw)
if err != nil {
return interval
}
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
seconds = int(opsMetricsCollectorMinInterval.Seconds())
}
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
}
return time.Duration(seconds) * time.Second
}
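// Clamping examples (hypothetical setting values): "30" → 60s (min), "300" → 5m,
// "7200" → 1h (max); empty or non-numeric values fall back to the 60s default.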
func (c *OpsMetricsCollector) collectOnce() {
if c == nil {
return
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return
}
if c.opsRepo == nil {
return
}
if c.db == nil {
return
}
ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
defer cancel()
if !c.isMonitoringEnabled(ctx) {
return
}
release, ok := c.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
startedAt := time.Now().UTC()
err := c.collectAndPersist(ctx)
finishedAt := time.Now().UTC()
durationMs := finishedAt.Sub(startedAt).Milliseconds()
dur := durationMs
runAt := startedAt
if err != nil {
msg := truncateString(err.Error(), 2048)
errAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastErrorAt: &errAt,
LastError: &msg,
LastDurationMs: &dur,
})
log.Printf("[OpsMetricsCollector] collect failed: %v", err)
return
}
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
})
}
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
if c == nil {
return false
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return false
}
if c.settingRepo == nil {
return true
}
if ctx == nil {
ctx = context.Background()
}
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
if err != nil {
// Missing key defaults to enabled; fail-open on other errors so the collector
// never becomes a hard dependency.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
if ctx == nil {
ctx = context.Background()
}
// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
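// e.g. a hypothetical now of 12:05:37Z gives windowStart=12:04:00Z, windowEnd=12:05:00Z.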
now := time.Now().UTC()
windowEnd := now.Truncate(time.Minute)
windowStart := windowEnd.Add(-1 * time.Minute)
sys, err := c.collectSystemStats(ctx)
if err != nil {
// Continue; system stats are best-effort.
log.Printf("[OpsMetricsCollector] system stats error: %v", err)
}
dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats()
redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage counts: %w", err)
}
duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage latency: %w", err)
}
errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query error counts: %w", err)
}
windowSeconds := windowEnd.Sub(windowStart).Seconds()
if windowSeconds <= 0 {
windowSeconds = 60
}
requestTotal := successCount + errorTotal
qps := float64(requestTotal) / windowSeconds
tps := float64(tokenConsumed) / windowSeconds
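// e.g. (hypothetical) 120 requests and 9000 tokens in a 60s window → qps=2.0, tps=150.0.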
goroutines := runtime.NumGoroutine()
concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)
input := &OpsInsertSystemMetricsInput{
CreatedAt: windowEnd,
WindowMinutes: 1,
SuccessCount: successCount,
ErrorCountTotal: errorTotal,
BusinessLimitedCount: businessLimited,
ErrorCountSLA: errorSLA,
UpstreamErrorCountExcl429529: upstreamExcl,
Upstream429Count: upstream429,
Upstream529Count: upstream529,
TokenConsumed: tokenConsumed,
QPS: float64Ptr(roundTo1DP(qps)),
TPS: float64Ptr(roundTo1DP(tps)),
DurationP50Ms: duration.p50,
DurationP90Ms: duration.p90,
DurationP95Ms: duration.p95,
DurationP99Ms: duration.p99,
DurationAvgMs: duration.avg,
DurationMaxMs: duration.max,
TTFTP50Ms: ttft.p50,
TTFTP90Ms: ttft.p90,
TTFTP95Ms: ttft.p95,
TTFTP99Ms: ttft.p99,
TTFTAvgMs: ttft.avg,
TTFTMaxMs: ttft.max,
CPUUsagePercent: sys.cpuUsagePercent,
MemoryUsedMB: sys.memoryUsedMB,
MemoryTotalMB: sys.memoryTotalMB,
MemoryUsagePercent: sys.memoryUsagePercent,
DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK),
RedisConnTotal: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisTotal)
}(),
RedisConnIdle: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisIdle)
}(),
DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines),
ConcurrencyQueueDepth: concurrencyQueueDepth,
}
return c.opsRepo.InsertSystemMetrics(ctx, input)
}
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
return nil
}
if parentCtx == nil {
parentCtx = context.Background()
}
// Best-effort: never let concurrency sampling break the metrics collector.
ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
defer cancel()
accounts, err := c.accountRepo.ListSchedulable(ctx)
if err != nil {
return nil
}
if len(accounts) == 0 {
zero := 0
return &zero
}
batch := make([]AccountWithConcurrency, 0, len(accounts))
for _, acc := range accounts {
if acc.ID <= 0 {
continue
}
maxConc := acc.Concurrency
if maxConc < 0 {
maxConc = 0
}
batch = append(batch, AccountWithConcurrency{
ID: acc.ID,
MaxConcurrency: maxConc,
})
}
if len(batch) == 0 {
zero := 0
return &zero
}
loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
if err != nil {
return nil
}
var total int64
for _, info := range loadMap {
if info == nil || info.WaitingCount <= 0 {
continue
}
total += int64(info.WaitingCount)
}
if total < 0 {
total = 0
}
maxInt := int64(^uint(0) >> 1)
if total > maxInt {
total = maxInt
}
v := int(total)
return &v
}
type opsCollectedPercentiles struct {
p50 *int
p90 *int
p95 *int
p99 *int
avg *float64
max *int
}
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
q := `
SELECT
COALESCE(COUNT(*), 0) AS success_count,
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2`
var tokens sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
return 0, 0, err
}
if tokens.Valid {
tokenConsumed = tokens.Int64
}
return successCount, tokenConsumed, nil
}
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
AVG(duration_ms) AS avg_ms,
MAX(duration_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND duration_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
duration.p50 = floatToIntPtr(p50)
duration.p90 = floatToIntPtr(p90)
duration.p95 = floatToIntPtr(p95)
duration.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
duration.avg = &v
}
if max.Valid {
v := int(max.Int64)
duration.max = &v
}
}
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
AVG(first_token_ms) AS avg_ms,
MAX(first_token_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND first_token_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
ttft.p50 = floatToIntPtr(p50)
ttft.p90 = floatToIntPtr(p90)
ttft.p95 = floatToIntPtr(p95)
ttft.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
ttft.avg = &v
}
if max.Valid {
v := int(max.Int64)
ttft.max = &v
}
}
return duration, ttft, nil
}
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
errorTotal int64,
businessLimited int64,
errorSLA int64,
upstreamExcl429529 int64,
upstream429 int64,
upstream529 int64,
err error,
) {
q := `
SELECT
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2`
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
&errorTotal,
&businessLimited,
&errorSLA,
&upstreamExcl429529,
&upstream429,
&upstream529,
); err != nil {
return 0, 0, 0, 0, 0, 0, err
}
return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
type opsCollectedSystemStats struct {
cpuUsagePercent *float64
memoryUsedMB *int64
memoryTotalMB *int64
memoryUsagePercent *float64
}
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
out := &opsCollectedSystemStats{}
if ctx == nil {
ctx = context.Background()
}
sampleAt := time.Now().UTC()
// Prefer cgroup (container) metrics when available.
if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
out.cpuUsagePercent = cpuPct
}
cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
if cgroupOK {
usedMB := int64(cgroupUsed / bytesPerMB)
out.memoryUsedMB = &usedMB
if cgroupTotal > 0 {
totalMB := int64(cgroupTotal / bytesPerMB)
out.memoryTotalMB = &totalMB
pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
out.memoryUsagePercent = &pct
}
}
// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
if out.cpuUsagePercent == nil {
if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
v := roundTo1DP(cpuPercents[0])
out.cpuUsagePercent = &v
}
}
// Fill any missing memory fields from host metrics (e.g. memory.max = "max" leaves the cgroup total unknown).
if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
if out.memoryUsedMB == nil {
usedMB := int64(vm.Used / bytesPerMB)
out.memoryUsedMB = &usedMB
}
if out.memoryTotalMB == nil {
totalMB := int64(vm.Total / bytesPerMB)
out.memoryTotalMB = &totalMB
}
if out.memoryUsagePercent == nil {
if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
out.memoryUsagePercent = &pct
} else {
pct := roundTo1DP(vm.UsedPercent)
out.memoryUsagePercent = &pct
}
}
}
}
return out, nil
}
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
usageNanos, ok := readCgroupCPUUsageNanos()
if !ok {
return nil
}
// Initialize baseline sample.
if c.lastCgroupCPUSampleAt.IsZero() {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
elapsed := now.Sub(c.lastCgroupCPUSampleAt)
if elapsed <= 0 {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
prev := c.lastCgroupCPUUsageNanos
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
if usageNanos < prev {
// Counter reset (container restarted).
return nil
}
deltaUsageSec := float64(usageNanos-prev) / 1e9
elapsedSec := elapsed.Seconds()
if elapsedSec <= 0 {
return nil
}
cores := readCgroupCPULimitCores()
if cores <= 0 {
// Can't reliably normalize; skip and fall back to gopsutil.
return nil
}
pct := (deltaUsageSec / (elapsedSec * cores)) * 100
if pct < 0 {
pct = 0
}
// Clamp to avoid noise/jitter showing impossible values.
if pct > 100 {
pct = 100
}
v := roundTo1DP(pct)
return &v
}
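// Worked example (hypothetical sample): 1.2s of cgroup CPU time over a 60s interval
// with a 2-core limit → pct = (1.2 / (60 * 2)) * 100 = 1.0%.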
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
// cgroup v2 (most common in modern containers)
if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
usedBytes = used
rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
if err == nil {
s := strings.TrimSpace(string(rawMax))
if s != "" && s != "max" {
if v, err := strconv.ParseUint(s, 10, 64); err == nil {
totalBytes = v
}
}
}
return usedBytes, totalBytes, true
}
// cgroup v1 fallback
if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
usedBytes = used
if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
// Some environments report a very large number when unlimited.
if limit > 0 && limit < (1<<60) {
totalBytes = limit
}
}
return usedBytes, totalBytes, true
}
return 0, 0, false
}
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
// cgroup v2: cpu.stat has usage_usec
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
lines := strings.Split(string(raw), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) != 2 {
continue
}
if fields[0] != "usage_usec" {
continue
}
v, err := strconv.ParseUint(fields[1], 10, 64)
if err != nil {
continue
}
return v * 1000, true
}
}
// cgroup v1: cpuacct.usage is in nanoseconds
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
return v, true
}
return 0, false
}
func readCgroupCPULimitCores() float64 {
// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
fields := strings.Fields(string(raw))
if len(fields) >= 2 && fields[0] != "max" {
quota, err1 := strconv.ParseFloat(fields[0], 64)
period, err2 := strconv.ParseFloat(fields[1], 64)
if err1 == nil && err2 == nil && quota > 0 && period > 0 {
return quota / period
}
}
}
// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
if okQuota && okPeriod && quota > 0 && period > 0 {
return float64(quota) / float64(period)
}
return 0
}
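// Examples: cgroup v2 cpu.max "200000 100000" → 2.0 cores; "max 100000" (unlimited) → 0,
// so the caller falls back to gopsutil; cgroup v1 quota=50000 / period=100000 → 0.5 cores.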
func readUintFile(path string) (uint64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func readIntFile(path string) (int64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
if c == nil || c.db == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
var one int
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
return false
}
return one == 1
}
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
if c == nil || c.redisClient == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
return c.redisClient.Ping(ctx).Err() == nil
}
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
if c == nil || c.redisClient == nil {
return 0, 0, false
}
stats := c.redisClient.PoolStats()
if stats == nil {
return 0, 0, false
}
return int(stats.TotalConns), int(stats.IdleConns), true
}
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil {
return 0, 0
}
stats := c.db.Stats()
return stats.InUse, stats.Idle
}
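// opsMetricsCollectorReleaseScript releases the leader lock only if it is still owned by
// this instance (atomic compare-and-delete), so a run that outlives the lock TTL cannot
// delete a lock that another instance has since acquired.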
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if c == nil || c.redisClient == nil {
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
if err != nil {
// Redis is configured but unavailable: fall back to a DB advisory lock so only one
// instance collects; fail closed (skip this run) if that lock cannot be acquired,
// to avoid stampeding the database.
release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
if !ok {
c.maybeLogSkip()
return nil, false
}
return release, true
}
if !ok {
c.maybeLogSkip()
return nil, false
}
release := func() {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
}
return release, true
}
func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock()
defer c.skipLogMu.Unlock()
now := time.Now()
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
return
}
c.skipLogAt = now
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
}
func floatToIntPtr(v sql.NullFloat64) *int {
if !v.Valid {
return nil
}
n := int(math.Round(v.Float64))
return &n
}
func roundTo1DP(v float64) float64 {
return math.Round(v*10) / 10
}
func truncateString(s string, max int) string {
if max <= 0 {
return ""
}
if len(s) <= max {
return s
}
cut := s[:max]
for len(cut) > 0 && !utf8.ValidString(cut) {
cut = cut[:len(cut)-1]
}
return cut
}
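// e.g. truncateString("héllo", 2) returns "h": the byte cut would split the two-byte "é",
// so trailing bytes are trimmed until the remainder is valid UTF-8.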
func boolPtr(v bool) *bool {
out := v
return &out
}
func intPtr(v int) *int {
out := v
return &out
}
func float64Ptr(v float64) *float64 {
out := v
return &out
}
package service
import "time"
type OpsErrorLog struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
Phase string `json:"phase"`
Type string `json:"type"`
Severity string `json:"severity"`
StatusCode int `json:"status_code"`
Platform string `json:"platform"`
Model string `json:"model"`
LatencyMs *int `json:"latency_ms"`
ClientRequestID string `json:"client_request_id"`
RequestID string `json:"request_id"`
Message string `json:"message"`
UserID *int64 `json:"user_id"`
APIKeyID *int64 `json:"api_key_id"`
AccountID *int64 `json:"account_id"`
GroupID *int64 `json:"group_id"`
ClientIP *string `json:"client_ip"`
RequestPath string `json:"request_path"`
Stream bool `json:"stream"`
}
type OpsErrorLogDetail struct {
OpsErrorLog
ErrorBody string `json:"error_body"`
UserAgent string `json:"user_agent"`
// Upstream context (optional)
UpstreamStatusCode *int `json:"upstream_status_code,omitempty"`
UpstreamErrorMessage string `json:"upstream_error_message,omitempty"`
UpstreamErrorDetail string `json:"upstream_error_detail,omitempty"`
UpstreamErrors string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing
// Timings (optional)
AuthLatencyMs *int64 `json:"auth_latency_ms"`
RoutingLatencyMs *int64 `json:"routing_latency_ms"`
UpstreamLatencyMs *int64 `json:"upstream_latency_ms"`
ResponseLatencyMs *int64 `json:"response_latency_ms"`
TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
// Retry context
RequestBody string `json:"request_body"`
RequestBodyTruncated bool `json:"request_body_truncated"`
RequestBodyBytes *int `json:"request_body_bytes"`
RequestHeaders string `json:"request_headers,omitempty"`
// vNext metric semantics
IsBusinessLimited bool `json:"is_business_limited"`
}
type OpsErrorLogFilter struct {
StartTime *time.Time
EndTime *time.Time
Platform string
GroupID *int64
AccountID *int64
StatusCodes []int
Phase string
Query string
Page int
PageSize int
}
type OpsErrorLogList struct {
Errors []*OpsErrorLog `json:"errors"`
Total int `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
type OpsRetryAttempt struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
RequestedByUserID int64 `json:"requested_by_user_id"`
SourceErrorID int64 `json:"source_error_id"`
Mode string `json:"mode"`
PinnedAccountID *int64 `json:"pinned_account_id"`
Status string `json:"status"`
StartedAt *time.Time `json:"started_at"`
FinishedAt *time.Time `json:"finished_at"`
DurationMs *int64 `json:"duration_ms"`
ResultRequestID *string `json:"result_request_id"`
ResultErrorID *int64 `json:"result_error_id"`
ErrorMessage *string `json:"error_message"`
}
type OpsRetryResult struct {
AttemptID int64 `json:"attempt_id"`
Mode string `json:"mode"`
Status string `json:"status"`
PinnedAccountID *int64 `json:"pinned_account_id"`
UsedAccountID *int64 `json:"used_account_id"`
HTTPStatusCode int `json:"http_status_code"`
UpstreamRequestID string `json:"upstream_request_id"`
ResponsePreview string `json:"response_preview"`
ResponseTruncated bool `json:"response_truncated"`
ErrorMessage string `json:"error_message"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
DurationMs int64 `json:"duration_ms"`
}
package service
import (
"context"
"time"
)
type OpsRepository interface {
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
// Lightweight window stats (for realtime WS / quick sampling).
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
// Alerts (rules + events)
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
DeleteAlertRule(ctx context.Context, id int64) error
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
type OpsInsertErrorLogInput struct {
RequestID string
ClientRequestID string
UserID *int64
APIKeyID *int64
AccountID *int64
GroupID *int64
ClientIP *string
Platform string
Model string
RequestPath string
Stream bool
UserAgent string
ErrorPhase string
ErrorType string
Severity string
StatusCode int
IsBusinessLimited bool
ErrorMessage string
ErrorBody string
ErrorSource string
ErrorOwner string
UpstreamStatusCode *int
UpstreamErrorMessage *string
UpstreamErrorDetail *string
// UpstreamErrors captures all upstream error attempts observed while handling this request.
// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
UpstreamErrors []*OpsUpstreamErrorEvent
// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
// It is set by OpsService.RecordError before persisting.
UpstreamErrorsJSON *string
DurationMs *int
TimeToFirstTokenMs *int64
RequestBodyJSON *string // sanitized json string (not raw bytes)
RequestBodyTruncated bool
RequestBodyBytes *int
RequestHeadersJSON *string // optional json string
IsRetryable bool
RetryCount int
CreatedAt time.Time
}
type OpsInsertRetryAttemptInput struct {
RequestedByUserID int64
SourceErrorID int64
Mode string
PinnedAccountID *int64
// running|queued etc.
Status string
StartedAt time.Time
}
type OpsUpdateRetryAttemptInput struct {
ID int64
// succeeded|failed
Status string
FinishedAt time.Time
DurationMs int64
// Optional correlation
ResultRequestID *string
ResultErrorID *int64
ErrorMessage *string
}
type OpsInsertSystemMetricsInput struct {
CreatedAt time.Time
WindowMinutes int
Platform *string
GroupID *int64
SuccessCount int64
ErrorCountTotal int64
BusinessLimitedCount int64
ErrorCountSLA int64
UpstreamErrorCountExcl429529 int64
Upstream429Count int64
Upstream529Count int64
TokenConsumed int64
QPS *float64
TPS *float64
DurationP50Ms *int
DurationP90Ms *int
DurationP95Ms *int
DurationP99Ms *int
DurationAvgMs *float64
DurationMaxMs *int
TTFTP50Ms *int
TTFTP90Ms *int
TTFTP95Ms *int
TTFTP99Ms *int
TTFTAvgMs *float64
TTFTMaxMs *int
CPUUsagePercent *float64
MemoryUsedMB *int64
MemoryTotalMB *int64
MemoryUsagePercent *float64
DBOK *bool
RedisOK *bool
RedisConnTotal *int
RedisConnIdle *int
DBConnActive *int
DBConnIdle *int
DBConnWaiting *int
GoroutineCount *int
ConcurrencyQueueDepth *int
}
type OpsSystemMetricsSnapshot struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
WindowMinutes int `json:"window_minutes"`
CPUUsagePercent *float64 `json:"cpu_usage_percent"`
MemoryUsedMB *int64 `json:"memory_used_mb"`
MemoryTotalMB *int64 `json:"memory_total_mb"`
MemoryUsagePercent *float64 `json:"memory_usage_percent"`
DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"`
// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
DBMaxOpenConns *int `json:"db_max_open_conns"`
RedisPoolSize *int `json:"redis_pool_size"`
RedisConnTotal *int `json:"redis_conn_total"`
RedisConnIdle *int `json:"redis_conn_idle"`
DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"`
GoroutineCount *int `json:"goroutine_count"`
ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
type OpsUpsertJobHeartbeatInput struct {
JobName string
LastRunAt *time.Time
LastSuccessAt *time.Time
LastErrorAt *time.Time
LastError *string
LastDurationMs *int64
}
type OpsJobHeartbeat struct {
JobName string `json:"job_name"`
LastRunAt *time.Time `json:"last_run_at"`
LastSuccessAt *time.Time `json:"last_success_at"`
LastErrorAt *time.Time `json:"last_error_at"`
LastError *string `json:"last_error"`
LastDurationMs *int64 `json:"last_duration_ms"`
UpdatedAt time.Time `json:"updated_at"`
}
type OpsWindowStats struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
TokenConsumed int64 `json:"token_consumed"`
}
package service
import (
"errors"
"strings"
)
type OpsQueryMode string
const (
OpsQueryModeAuto OpsQueryMode = "auto"
OpsQueryModeRaw OpsQueryMode = "raw"
OpsQueryModePreagg OpsQueryMode = "preagg"
)
// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")
func ParseOpsQueryMode(raw string) OpsQueryMode {
v := strings.ToLower(strings.TrimSpace(raw))
switch v {
case string(OpsQueryModeRaw):
return OpsQueryModeRaw
case string(OpsQueryModePreagg):
return OpsQueryModePreagg
default:
return OpsQueryModeAuto
}
}
func (m OpsQueryMode) IsValid() bool {
switch m {
case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg:
return true
default:
return false
}
}
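// Usage examples (hypothetical values):
//   ParseOpsQueryMode(" PREAGG ") // → OpsQueryModePreagg (trimmed, case-insensitive)
//   ParseOpsQueryMode("bogus")    // → OpsQueryModeAuto (unknown values fall back to auto)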
package service
import (
"context"
"strings"
)
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is additionally gated by the overall ops monitoring switches (the hard config
// switch and the soft monitoring setting).
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
if !s.IsMonitoringEnabled(ctx) {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
if err != nil {
// Default enabled when the key is missing; fail-open on transient errors.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
package service
import "time"
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Status string `json:"status"`
IsAvailable bool `json:"is_available"`
IsRateLimited bool `json:"is_rate_limited"`
IsOverloaded bool `json:"is_overloaded"`
HasError bool `json:"has_error"`
RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"`
OverloadUntil *time.Time `json:"overload_until"`
OverloadRemainingSec *int64 `json:"overload_remaining_sec"`
ErrorMessage string `json:"error_message"`
TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
package service
import (
"context"
"time"
)
type OpsRequestKind string
const (
OpsRequestKindSuccess OpsRequestKind = "success"
OpsRequestKindError OpsRequestKind = "error"
)
// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
Kind OpsRequestKind `json:"kind"`
CreatedAt time.Time `json:"created_at"`
RequestID string `json:"request_id"`
Platform string `json:"platform,omitempty"`
Model string `json:"model,omitempty"`
DurationMs *int `json:"duration_ms,omitempty"`
StatusCode *int `json:"status_code,omitempty"`
// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
ErrorID *int64 `json:"error_id,omitempty"`
Phase string `json:"phase,omitempty"`
Severity string `json:"severity,omitempty"`
Message string `json:"message,omitempty"`
UserID *int64 `json:"user_id,omitempty"`
APIKeyID *int64 `json:"api_key_id,omitempty"`
AccountID *int64 `json:"account_id,omitempty"`
GroupID *int64 `json:"group_id,omitempty"`
Stream bool `json:"stream"`
}
type OpsRequestDetailFilter struct {
StartTime *time.Time
EndTime *time.Time
// kind: success|error|all
Kind string
Platform string
GroupID *int64
UserID *int64
APIKeyID *int64
AccountID *int64
Model string
RequestID string
Query string
MinDurationMs *int
MaxDurationMs *int
// Sort: created_at_desc (default) or duration_desc.
Sort string
Page int
PageSize int
}
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
page = 1
pageSize = 50
endTime = time.Now()
startTime = endTime.Add(-1 * time.Hour)
if f == nil {
return page, pageSize, startTime, endTime
}
if f.Page > 0 {
page = f.Page
}
if f.PageSize > 0 {
pageSize = f.PageSize
}
if pageSize > 100 {
pageSize = 100
}
if f.EndTime != nil {
endTime = *f.EndTime
}
if f.StartTime != nil {
startTime = *f.StartTime
} else if f.EndTime != nil {
startTime = endTime.Add(-1 * time.Hour)
}
if startTime.After(endTime) {
startTime, endTime = endTime, startTime
}
return page, pageSize, startTime, endTime
}
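// Sketch of Normalize defaults (illustrative): a nil receiver is safe and
// yields the trailing one-hour window ending now, and an inverted range is
// swapped rather than rejected.
//
//	var f *OpsRequestDetailFilter
//	page, size, start, end := f.Normalize()
//	// page == 1, size == 50, end ≈ time.Now(), start == end.Add(-time.Hour)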
type OpsRequestDetailList struct {
Items []*OpsRequestDetail `json:"items"`
Total int64 `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsRequestDetailList{
Items: []*OpsRequestDetail{},
Total: 0,
Page: 1,
PageSize: 50,
}, nil
}
page, pageSize, startTime, endTime := filter.Normalize()
filterCopy := &OpsRequestDetailFilter{}
if filter != nil {
*filterCopy = *filter
}
filterCopy.Page = page
filterCopy.PageSize = pageSize
filterCopy.StartTime = &startTime
filterCopy.EndTime = &endTime
items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
if err != nil {
return nil, err
}
if items == nil {
items = []*OpsRequestDetail{}
}
return &OpsRequestDetailList{
Items: items,
Total: total,
Page: page,
PageSize: pageSize,
}, nil
}
package service
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
"github.com/gin-gonic/gin"
"github.com/lib/pq"
)
const (
OpsRetryModeClient = "client"
OpsRetryModeUpstream = "upstream"
)
const (
opsRetryStatusRunning = "running"
opsRetryStatusSucceeded = "succeeded"
opsRetryStatusFailed = "failed"
)
const (
opsRetryTimeout = 60 * time.Second
opsRetryCaptureBytesLimit = 64 * 1024
opsRetryResponsePreviewMax = 8 * 1024
opsRetryMinIntervalPerError = 10 * time.Second
opsRetryMaxAccountSwitches = 3
)
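// Guardrail summary: a retry runs for at most opsRetryTimeout, captures at most
// opsRetryCaptureBytesLimit of the upstream body (previewed up to
// opsRetryResponsePreviewMax), may not be repeated for the same error within
// opsRetryMinIntervalPerError, and fails over across at most
// opsRetryMaxAccountSwitches accounts.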
var opsRetryRequestHeaderAllowlist = map[string]bool{
"anthropic-beta": true,
"anthropic-version": true,
}
type opsRetryRequestType string
const (
opsRetryTypeMessages opsRetryRequestType = "messages"
opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
type limitedResponseWriter struct {
header http.Header
wroteHeader bool
limit int
totalWritten int64
buf bytes.Buffer
}
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
if limit <= 0 {
limit = 1
}
return &limitedResponseWriter{
header: make(http.Header),
limit: limit,
}
}
func (w *limitedResponseWriter) Header() http.Header {
return w.header
}
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
if w.wroteHeader {
return
}
w.wroteHeader = true
}
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
if !w.wroteHeader {
w.WriteHeader(http.StatusOK)
}
w.totalWritten += int64(len(p))
if w.buf.Len() < w.limit {
remaining := w.limit - w.buf.Len()
if len(p) > remaining {
_, _ = w.buf.Write(p[:remaining])
} else {
_, _ = w.buf.Write(p)
}
}
// Pretend we wrote everything to avoid upstream/client code treating it as an error.
return len(p), nil
}
func (w *limitedResponseWriter) Flush() {}
func (w *limitedResponseWriter) bodyBytes() []byte {
return w.buf.Bytes()
}
func (w *limitedResponseWriter) truncated() bool {
return w.totalWritten > int64(w.limit)
}
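// Minimal usage sketch (hypothetical values): capture a bounded prefix of a
// response while reporting the full write as successful to the producer.
//
//	w := newLimitedResponseWriter(16)
//	_, _ = w.Write([]byte("this body is longer than sixteen bytes"))
//	_ = w.bodyBytes() // first 16 bytes only
//	_ = w.truncated() // true: totalWritten exceeded the limit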
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
mode = strings.ToLower(strings.TrimSpace(mode))
switch mode {
case OpsRetryModeClient, OpsRetryModeUpstream:
default:
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
}
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
}
if latest != nil {
if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
lastAttemptAt := latest.CreatedAt
if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
lastAttemptAt = *latest.FinishedAt
} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
lastAttemptAt = *latest.StartedAt
}
if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
}
}
errorLog, err := s.GetErrorLogByID(ctx, errorID)
if err != nil {
return nil, err
}
if strings.TrimSpace(errorLog.RequestBody) == "" {
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
}
var pinned *int64
if mode == OpsRetryModeUpstream {
if pinnedAccountID != nil && *pinnedAccountID > 0 {
pinned = pinnedAccountID
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
pinned = errorLog.AccountID
} else {
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
}
}
startedAt := time.Now()
attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
RequestedByUserID: requestedByUserID,
SourceErrorID: errorID,
Mode: mode,
PinnedAccountID: pinned,
Status: opsRetryStatusRunning,
StartedAt: startedAt,
})
if err != nil {
var pqErr *pq.Error
if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
}
result := &OpsRetryResult{
AttemptID: attemptID,
Mode: mode,
Status: opsRetryStatusFailed,
PinnedAccountID: pinned,
HTTPStatusCode: 0,
UpstreamRequestID: "",
ResponsePreview: "",
ResponseTruncated: false,
ErrorMessage: "",
StartedAt: startedAt,
}
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
defer cancel()
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
finishedAt := time.Now()
result.FinishedAt = finishedAt
result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
if execRes != nil {
result.Status = execRes.status
result.UsedAccountID = execRes.usedAccountID
result.HTTPStatusCode = execRes.httpStatusCode
result.UpstreamRequestID = execRes.upstreamRequestID
result.ResponsePreview = execRes.responsePreview
result.ResponseTruncated = execRes.responseTruncated
result.ErrorMessage = execRes.errorMessage
}
updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
defer updateCancel()
var updateErrMsg *string
if strings.TrimSpace(result.ErrorMessage) != "" {
msg := result.ErrorMessage
updateErrMsg = &msg
}
var resultRequestID *string
if strings.TrimSpace(result.UpstreamRequestID) != "" {
v := result.UpstreamRequestID
resultRequestID = &v
}
finalStatus := result.Status
if strings.TrimSpace(finalStatus) == "" {
finalStatus = opsRetryStatusFailed
}
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
ID: attemptID,
Status: finalStatus,
FinishedAt: finishedAt,
DurationMs: result.DurationMs,
ResultRequestID: resultRequestID,
ErrorMessage: updateErrMsg,
}); err != nil {
// Best-effort: retry itself already executed; do not fail the API response.
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
}
return result, nil
}
type opsRetryExecution struct {
status string
usedAccountID *int64
httpStatusCode int
upstreamRequestID string
responsePreview string
responseTruncated bool
errorMessage string
}
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
if errorLog == nil {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "missing error log",
}
}
reqType := detectOpsRetryType(errorLog.RequestPath)
bodyBytes := []byte(errorLog.RequestBody)
switch reqType {
case opsRetryTypeMessages:
bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
// No-op
}
switch strings.ToLower(strings.TrimSpace(mode)) {
case OpsRetryModeUpstream:
if pinnedAccountID == nil || *pinnedAccountID <= 0 {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "pinned_account_id required for upstream retry",
}
}
return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
case OpsRetryModeClient:
return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
default:
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "invalid retry mode",
}
}
}
func detectOpsRetryType(path string) opsRetryRequestType {
p := strings.ToLower(strings.TrimSpace(path))
switch {
case strings.Contains(p, "/responses"):
return opsRetryTypeOpenAI
case strings.Contains(p, "/v1beta/"):
return opsRetryTypeGeminiV1B
default:
return opsRetryTypeMessages
}
}
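// Illustrative classification (paths are examples, not an exhaustive route list):
//
//	detectOpsRetryType("/v1/responses")             // opsRetryTypeOpenAI
//	detectOpsRetryType("/v1beta/models/gemini-pro") // opsRetryTypeGeminiV1B
//	detectOpsRetryType("/v1/messages")              // opsRetryTypeMessages (default)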
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
if s.accountRepo == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
}
account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
}
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
}
if !account.IsSchedulable() {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
}
if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
}
}
var release func()
if s.concurrencyService != nil {
acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
}
if acq == nil || !acq.Acquired {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
}
release = acq.ReleaseFunc
}
if release != nil {
defer release()
}
usedID := account.ID
exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
exec.usedAccountID = &usedID
if exec.status == "" {
exec.status = opsRetryStatusFailed
}
return exec
}
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
groupID := errorLog.GroupID
if groupID == nil || *groupID <= 0 {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
}
model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
if parsedErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
}
_ = stream
excluded := make(map[int64]struct{})
switches := 0
for {
if switches >= opsRetryMaxAccountSwitches {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
}
selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
if selErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
}
if selection == nil || selection.Account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
}
account := selection.Account
if !selection.Acquired || selection.ReleaseFunc == nil {
excluded[account.ID] = struct{}{}
switches++
continue
}
exec := func() *opsRetryExecution {
defer selection.ReleaseFunc()
return s.executeWithAccount(ctx, reqType, errorLog, body, account)
}()
if exec != nil {
if exec.status == opsRetryStatusSucceeded {
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
// If the gateway services ask for failover, try another account.
if s.isFailoverError(exec.errorMessage) {
excluded[account.ID] = struct{}{}
switches++
continue
}
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
excluded[account.ID] = struct{}{}
switches++
}
}
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return nil, fmt.Errorf("openai gateway service not available")
}
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
if s.gatewayService == nil {
return nil, fmt.Errorf("gateway service not available")
}
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
default:
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
switch reqType {
case opsRetryTypeMessages:
parsed, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
}
return parsed.Model, parsed.Stream, nil
case opsRetryTypeOpenAI:
var v struct {
Model string `json:"model"`
Stream bool `json:"stream"`
}
if err := json.Unmarshal(body, &v); err != nil {
return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
}
return strings.TrimSpace(v.Model), v.Stream, nil
case opsRetryTypeGeminiV1B:
if strings.TrimSpace(errorLog.Model) == "" {
return "", false, fmt.Errorf("missing model for gemini v1beta retry")
}
return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
default:
return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
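// Illustrative call (the payload below is a hypothetical minimal body, not a
// captured request):
//
//	m, stream, err := extractRetryModelAndStream(
//		opsRetryTypeOpenAI, nil, []byte(`{"model":"gpt-x","stream":true}`))
//	// m == "gpt-x", stream == true, err == nil; errorLog may be nil here
//	// because only the gemini_v1beta branch reads it.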
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
}
c, w := newOpsRetryContext(ctx, errorLog)
var err error
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
}
_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
case opsRetryTypeGeminiV1B:
if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
}
modelName := strings.TrimSpace(errorLog.Model)
action := "generateContent"
if errorLog.Stream {
action = "streamGenerateContent"
}
if account.Platform == PlatformAntigravity {
_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
} else {
_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
}
case opsRetryTypeMessages:
switch account.Platform {
case PlatformAntigravity:
if s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
}
_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
case PlatformGemini:
if s.geminiCompatService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
}
_, err = s.geminiCompatService.Forward(ctx, c, account, body)
default:
if s.gatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
}
parsedReq, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
}
_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
}
default:
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
}
statusCode := http.StatusOK
if c != nil && c.Writer != nil {
statusCode = c.Writer.Status()
}
upstreamReqID := extractUpstreamRequestID(c)
preview, truncated := extractResponsePreview(w)
exec := &opsRetryExecution{
status: opsRetryStatusFailed,
httpStatusCode: statusCode,
upstreamRequestID: upstreamReqID,
responsePreview: preview,
responseTruncated: truncated,
errorMessage: "",
}
if err == nil && statusCode < 400 {
exec.status = opsRetryStatusSucceeded
return exec
}
if err != nil {
exec.errorMessage = err.Error()
} else {
exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
}
return exec
}
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
c, _ := gin.CreateTestContext(w)
path := "/"
if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
path = errorLog.RequestPath
}
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
req.Header.Set("content-type", "application/json")
if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
req.Header.Set("user-agent", errorLog.UserAgent)
}
	// Restore a minimal, allowlisted subset of request headers to improve retry
	// fidelity (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
var stored map[string]string
if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
for k, v := range stored {
key := strings.TrimSpace(k)
if key == "" {
continue
}
if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
continue
}
val := strings.TrimSpace(v)
if val == "" {
continue
}
req.Header.Set(key, val)
}
}
}
c.Request = req
return c, w
}
func extractUpstreamRequestID(c *gin.Context) string {
if c == nil || c.Writer == nil {
return ""
}
h := c.Writer.Header()
if h == nil {
return ""
}
	// Header.Get canonicalizes its key, so a single lookup covers the
	// lower/upper-case variants of x-request-id.
	if v := strings.TrimSpace(h.Get("X-Request-Id")); v != "" {
		return v
	}
return ""
}
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
if w == nil {
return "", false
}
b := bytes.TrimSpace(w.bodyBytes())
if len(b) == 0 {
return "", w.truncated()
}
if len(b) > opsRetryResponsePreviewMax {
return string(b[:opsRetryResponsePreviewMax]), true
}
return string(b), w.truncated()
}
func containsInt64(items []int64, needle int64) bool {
for _, v := range items {
if v == needle {
return true
}
}
return false
}
func (s *OpsService) isFailoverError(message string) bool {
msg := strings.ToLower(strings.TrimSpace(message))
if msg == "" {
return false
}
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
}
package service
import (
"context"
"fmt"
"log"
"strconv"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/robfig/cron/v3"
)
const (
opsScheduledReportJobName = "ops_scheduled_reports"
opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader"
opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute
opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:"
opsScheduledReportTickInterval = 1 * time.Minute
)
var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
var opsScheduledReportReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
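// The script above is the standard compare-and-delete unlock: the key is
// removed only while it still holds this instance's ID, so a lock that expired
// and was re-acquired by another instance is never released by the old owner.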
type OpsScheduledReportService struct {
opsService *OpsService
userService *UserService
emailService *EmailService
redisClient *redis.Client
cfg *config.Config
instanceID string
loc *time.Location
distributedLockOn bool
warnNoRedisOnce sync.Once
startOnce sync.Once
stopOnce sync.Once
stopCtx context.Context
stop context.CancelFunc
wg sync.WaitGroup
}
func NewOpsScheduledReportService(
opsService *OpsService,
userService *UserService,
emailService *EmailService,
redisClient *redis.Client,
cfg *config.Config,
) *OpsScheduledReportService {
lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple
loc := time.Local
if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" {
if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil {
loc = parsed
}
}
return &OpsScheduledReportService{
opsService: opsService,
userService: userService,
emailService: emailService,
redisClient: redisClient,
cfg: cfg,
instanceID: uuid.NewString(),
loc: loc,
distributedLockOn: lockOn,
warnNoRedisOnce: sync.Once{},
startOnce: sync.Once{},
stopOnce: sync.Once{},
stopCtx: nil,
stop: nil,
wg: sync.WaitGroup{},
}
}
func (s *OpsScheduledReportService) Start() {
s.StartWithContext(context.Background())
}
func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) {
if s == nil {
return
}
if ctx == nil {
ctx = context.Background()
}
if s.cfg != nil && !s.cfg.Ops.Enabled {
return
}
if s.opsService == nil || s.emailService == nil {
return
}
s.startOnce.Do(func() {
s.stopCtx, s.stop = context.WithCancel(ctx)
s.wg.Add(1)
go s.run()
})
}
func (s *OpsScheduledReportService) Stop() {
if s == nil {
return
}
s.stopOnce.Do(func() {
if s.stop != nil {
s.stop()
}
})
s.wg.Wait()
}
func (s *OpsScheduledReportService) run() {
defer s.wg.Done()
ticker := time.NewTicker(opsScheduledReportTickInterval)
defer ticker.Stop()
s.runOnce()
for {
select {
case <-ticker.C:
s.runOnce()
case <-s.stopCtx.Done():
return
}
}
}
func (s *OpsScheduledReportService) runOnce() {
if s == nil || s.opsService == nil || s.emailService == nil {
return
}
startedAt := time.Now().UTC()
runAt := startedAt
ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second)
defer cancel()
// Respect ops monitoring enabled switch.
if !s.opsService.IsMonitoringEnabled(ctx) {
return
}
release, ok := s.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
now := time.Now()
if s.loc != nil {
now = now.In(s.loc)
}
reports := s.listScheduledReports(ctx, now)
if len(reports) == 0 {
return
}
for _, report := range reports {
if report == nil || !report.Enabled {
continue
}
if report.NextRunAt.After(now) {
continue
}
if err := s.runReport(ctx, report, now); err != nil {
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
return
}
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
type opsScheduledReport struct {
Name string
ReportType string
Schedule string
Enabled bool
TimeRange time.Duration
Recipients []string
ErrorDigestMinCount int
AccountHealthErrorRateThreshold float64
LastRunAt *time.Time
NextRunAt time.Time
}
func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport {
if s == nil || s.opsService == nil {
return nil
}
if ctx == nil {
ctx = context.Background()
}
emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
if err != nil || emailCfg == nil {
return nil
}
if !emailCfg.Report.Enabled {
return nil
}
recipients := normalizeEmails(emailCfg.Report.Recipients)
type reportDef struct {
enabled bool
name string
kind string
timeRange time.Duration
schedule string
}
defs := []reportDef{
		{enabled: emailCfg.Report.DailySummaryEnabled, name: "Daily Summary", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule},
		{enabled: emailCfg.Report.WeeklySummaryEnabled, name: "Weekly Summary", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule},
		{enabled: emailCfg.Report.ErrorDigestEnabled, name: "Error Digest", kind: "error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule},
		{enabled: emailCfg.Report.AccountHealthEnabled, name: "Account Health", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule},
}
out := make([]*opsScheduledReport, 0, len(defs))
for _, d := range defs {
if !d.enabled {
continue
}
spec := strings.TrimSpace(d.schedule)
if spec == "" {
continue
}
sched, err := opsScheduledReportCronParser.Parse(spec)
if err != nil {
log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err)
continue
}
lastRun := s.getLastRunAt(ctx, d.kind)
base := lastRun
if base.IsZero() {
// Allow a schedule matching the current minute to trigger right after startup.
base = now.Add(-1 * time.Minute)
}
next := sched.Next(base)
if next.IsZero() {
continue
}
var lastRunPtr *time.Time
if !lastRun.IsZero() {
lastCopy := lastRun
lastRunPtr = &lastCopy
}
out = append(out, &opsScheduledReport{
Name: d.name,
ReportType: d.kind,
Schedule: spec,
Enabled: true,
TimeRange: d.timeRange,
Recipients: recipients,
ErrorDigestMinCount: emailCfg.Report.ErrorDigestMinCount,
AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold,
LastRunAt: lastRunPtr,
NextRunAt: next,
})
}
return out
}
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
return nil
}
if ctx == nil {
ctx = context.Background()
}
// Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute.
s.setLastRunAt(ctx, report.ReportType, now)
content, err := s.generateReportHTML(ctx, report, now)
if err != nil {
return err
}
if strings.TrimSpace(content) == "" {
// Skip sending when the report decides not to emit content (e.g., digest below min count).
return nil
}
recipients := report.Recipients
if len(recipients) == 0 && s.userService != nil {
admin, err := s.userService.GetFirstAdmin(ctx)
if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" {
recipients = []string{strings.TrimSpace(admin.Email)}
}
}
if len(recipients) == 0 {
return nil
}
subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))
for _, to := range recipients {
addr := strings.TrimSpace(to)
if addr == "" {
continue
}
if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
// Ignore per-recipient failures; continue best-effort.
continue
}
}
return nil
}
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
if s == nil || s.opsService == nil || report == nil {
return "", fmt.Errorf("service not initialized")
}
if report.TimeRange <= 0 {
return "", fmt.Errorf("invalid time range")
}
end := now.UTC()
start := end.Add(-report.TimeRange)
switch strings.TrimSpace(report.ReportType) {
case "daily_summary", "weekly_summary":
overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
StartTime: start,
EndTime: end,
Platform: "",
GroupID: nil,
QueryMode: OpsQueryModeAuto,
})
		if err != nil {
			// Pre-aggregated metrics may not be populated yet; fall back to raw logs.
			overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
				StartTime: start,
				EndTime:   end,
				Platform:  "",
				GroupID:   nil,
				QueryMode: OpsQueryModeRaw,
			})
			if err != nil {
				return "", err
			}
		}
return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil
case "error_digest":
		// Lightweight digest: list the most recent errors in the window
		// (capped at 100 rows; the email renders the top 10).
startTime := start
endTime := end
filter := &OpsErrorLogFilter{
StartTime: &startTime,
EndTime: &endTime,
Page: 1,
PageSize: 100,
}
out, err := s.opsService.GetErrorLogs(ctx, filter)
if err != nil {
return "", err
}
if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount {
return "", nil
}
return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil
case "account_health":
// Best-effort: use account availability (not error rate yet).
avail, err := s.opsService.GetAccountAvailability(ctx, "", nil)
if err != nil {
return "", err
}
_ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report
return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil
default:
return "", fmt.Errorf("unknown report type: %s", report.ReportType)
}
}
func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string {
if overview == nil {
return fmt.Sprintf("<h2>%s</h2><p>No data.</p>", htmlEscape(title))
}
latP50 := "-"
latP99 := "-"
if overview.Duration.P50 != nil {
latP50 = fmt.Sprintf("%dms", *overview.Duration.P50)
}
if overview.Duration.P99 != nil {
latP99 = fmt.Sprintf("%dms", *overview.Duration.P99)
}
ttftP50 := "-"
ttftP99 := "-"
if overview.TTFT.P50 != nil {
ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50)
}
if overview.TTFT.P99 != nil {
ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99)
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Requests</b>: %d</li>
<li><b>Success</b>: %d</li>
<li><b>Errors (SLA)</b>: %d</li>
<li><b>Business Limited</b>: %d</li>
<li><b>SLA</b>: %.2f%%</li>
<li><b>Error Rate</b>: %.2f%%</li>
<li><b>Upstream Error Rate (excl 429/529)</b>: %.2f%%</li>
<li><b>Upstream Errors</b>: excl429/529=%d, 429=%d, 529=%d</li>
<li><b>Latency</b>: p50=%s, p99=%s</li>
<li><b>TTFT</b>: p50=%s, p99=%s</li>
<li><b>Tokens</b>: %d</li>
<li><b>QPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
<li><b>TPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
</ul>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
overview.RequestCountTotal,
overview.SuccessCount,
overview.ErrorCountSLA,
overview.BusinessLimitedCount,
overview.SLA*100,
overview.ErrorRate*100,
overview.UpstreamErrorRate*100,
overview.UpstreamErrorCountExcl429529,
overview.Upstream429Count,
overview.Upstream529Count,
htmlEscape(latP50),
htmlEscape(latP99),
htmlEscape(ttftP50),
htmlEscape(ttftP99),
overview.TokenConsumed,
overview.QPS.Current,
overview.QPS.Peak,
overview.QPS.Avg,
overview.TPS.Current,
overview.TPS.Peak,
overview.TPS.Avg,
)
}
func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string {
total := 0
recent := []*OpsErrorLog{}
if list != nil {
total = list.Total
recent = list.Errors
}
if len(recent) > 10 {
recent = recent[:10]
}
rows := ""
for _, item := range recent {
if item == nil {
continue
}
rows += fmt.Sprintf(
"<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>",
htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)),
htmlEscape(item.Platform),
item.StatusCode,
htmlEscape(truncateString(item.Message, 180)),
)
}
if rows == "" {
rows = "<tr><td colspan=\"4\">No recent errors.</td></tr>"
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<p><b>Total Errors</b>: %d</p>
<h3>Recent</h3>
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;">
<thead><tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr></thead>
<tbody>%s</tbody>
</table>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
total,
rows,
)
}
func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string {
total := 0
available := 0
rateLimited := 0
hasError := 0
if avail != nil && avail.Accounts != nil {
for _, a := range avail.Accounts {
if a == nil {
continue
}
total++
if a.IsAvailable {
available++
}
if a.IsRateLimited {
rateLimited++
}
if a.HasError {
hasError++
}
}
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Accounts</b>: %d</li>
<li><b>Available</b>: %d</li>
<li><b>Rate Limited</b>: %d</li>
<li><b>Error</b>: %d</li>
</ul>
<p>Note: This report currently reflects account availability status only.</p>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
total,
available,
rateLimited,
hasError,
)
}
func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if s == nil || !s.distributedLockOn {
return nil, true
}
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock")
})
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
	key := opsScheduledReportLeaderLockKeyDefault
	ttl := opsScheduledReportLeaderLockTTLDefault
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
// Prefer fail-closed to avoid duplicate report sends when Redis is flaky.
log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err)
return nil, false
}
if !ok {
return nil, false
}
return func() {
_, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time {
if s == nil || s.redisClient == nil {
return time.Time{}
}
kind := strings.TrimSpace(reportType)
if kind == "" {
return time.Time{}
}
key := opsScheduledReportLastRunKeyPrefix + kind
raw, err := s.redisClient.Get(ctx, key).Result()
if err != nil || strings.TrimSpace(raw) == "" {
return time.Time{}
}
sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
if err != nil || sec <= 0 {
return time.Time{}
}
last := time.Unix(sec, 0)
// Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time
// passed into cron.Next() uses the same location; otherwise the job will drift by timezone
// offset (e.g. Asia/Shanghai default would run 8h later after the first execution).
if s.loc != nil {
return last.In(s.loc)
}
return last.UTC()
}
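// Worked example (assuming cfg.Timezone = "Asia/Shanghai"): a last run stored
// as Unix seconds for 09:00 local is 01:00 UTC. Feeding the UTC value into a
// "0 9 * * *" schedule would make cron.Next() resolve to the next 09:00 in UTC
// terms (17:00 local), pushing every subsequent run 8 hours late; rebasing
// into s.loc keeps the schedule anchored to local 09:00.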
func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) {
if s == nil || s.redisClient == nil {
return
}
kind := strings.TrimSpace(reportType)
if kind == "" {
return
}
if t.IsZero() {
t = time.Now().UTC()
}
key := opsScheduledReportLastRunKeyPrefix + kind
_ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err()
}
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsScheduledReportJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
})
}
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
msg := truncateString(err.Error(), 2048)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsScheduledReportJobName,
LastRunAt: &runAt,
LastErrorAt: &now,
LastError: &msg,
LastDurationMs: &durMs,
})
}
func normalizeEmails(in []string) []string {
if len(in) == 0 {
return nil
}
seen := make(map[string]struct{}, len(in))
out := make([]string, 0, len(in))
for _, raw := range in {
addr := strings.ToLower(strings.TrimSpace(raw))
if addr == "" {
continue
}
if _, ok := seen[addr]; ok {
continue
}
seen[addr] = struct{}{}
out = append(out, addr)
}
return out
}
package service
import (
"context"
"database/sql"
"encoding/json"
"errors"
"log"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
const (
opsMaxStoredRequestBodyBytes = 10 * 1024
opsMaxStoredErrorBodyBytes = 20 * 1024
)
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
accountRepo AccountRepository
// getAccountAvailability is a unit-test hook for overriding account availability lookup.
getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)
concurrencyService *ConcurrencyService
gatewayService *GatewayService
openAIGatewayService *OpenAIGatewayService
geminiCompatService *GeminiMessagesCompatService
antigravityGatewayService *AntigravityGatewayService
}
func NewOpsService(
opsRepo OpsRepository,
settingRepo SettingRepository,
cfg *config.Config,
accountRepo AccountRepository,
concurrencyService *ConcurrencyService,
gatewayService *GatewayService,
openAIGatewayService *OpenAIGatewayService,
geminiCompatService *GeminiMessagesCompatService,
antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
return &OpsService{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
accountRepo: accountRepo,
concurrencyService: concurrencyService,
gatewayService: gatewayService,
openAIGatewayService: openAIGatewayService,
geminiCompatService: geminiCompatService,
antigravityGatewayService: antigravityGatewayService,
}
}
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
if s.IsMonitoringEnabled(ctx) {
return nil
}
return ErrOpsDisabled
}
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
// Hard switch: disable ops entirely.
if s.cfg != nil && !s.cfg.Ops.Enabled {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		// Fail open: a missing key (ErrSettingNotFound) means "default enabled",
		// and transient read errors must never block gateway traffic.
		return true
	}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
if entry == nil {
return nil
}
if !s.IsMonitoringEnabled(ctx) {
return nil
}
if s.opsRepo == nil {
return nil
}
// Ensure timestamps are always populated.
if entry.CreatedAt.IsZero() {
entry.CreatedAt = time.Now()
}
// Ensure required fields exist (DB has NOT NULL constraints).
entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
entry.ErrorType = strings.TrimSpace(entry.ErrorType)
if entry.ErrorPhase == "" {
entry.ErrorPhase = "internal"
}
if entry.ErrorType == "" {
entry.ErrorType = "api_error"
}
// Sanitize + trim request body (errors only).
if len(rawRequestBody) > 0 {
sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
if sanitized != "" {
entry.RequestBodyJSON = &sanitized
}
entry.RequestBodyTruncated = truncated
entry.RequestBodyBytes = &bytesLen
}
// Sanitize + truncate error_body to avoid storing sensitive data.
if strings.TrimSpace(entry.ErrorBody) != "" {
sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
entry.ErrorBody = sanitized
}
// Sanitize upstream error context if provided by gateway services.
if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 {
entry.UpstreamStatusCode = nil
}
if entry.UpstreamErrorMessage != nil {
msg := strings.TrimSpace(*entry.UpstreamErrorMessage)
msg = sanitizeUpstreamErrorMessage(msg)
msg = truncateString(msg, 2048)
if strings.TrimSpace(msg) == "" {
entry.UpstreamErrorMessage = nil
} else {
entry.UpstreamErrorMessage = &msg
}
}
if entry.UpstreamErrorDetail != nil {
detail := strings.TrimSpace(*entry.UpstreamErrorDetail)
if detail == "" {
entry.UpstreamErrorDetail = nil
} else {
sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
if strings.TrimSpace(sanitized) == "" {
entry.UpstreamErrorDetail = nil
} else {
entry.UpstreamErrorDetail = &sanitized
}
}
}
// Sanitize + serialize upstream error events list.
if len(entry.UpstreamErrors) > 0 {
const maxEvents = 32
events := entry.UpstreamErrors
if len(events) > maxEvents {
events = events[len(events)-maxEvents:]
}
sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events))
for _, ev := range events {
if ev == nil {
continue
}
out := *ev
out.Platform = strings.TrimSpace(out.Platform)
out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128)
out.Kind = truncateString(strings.TrimSpace(out.Kind), 64)
if out.AccountID < 0 {
out.AccountID = 0
}
if out.UpstreamStatusCode < 0 {
out.UpstreamStatusCode = 0
}
if out.AtUnixMs < 0 {
out.AtUnixMs = 0
}
msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message))
msg = truncateString(msg, 2048)
out.Message = msg
detail := strings.TrimSpace(out.Detail)
if detail != "" {
// Keep upstream detail small; request bodies are not stored here, only upstream error payloads.
sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
out.Detail = sanitizedDetail
} else {
out.Detail = ""
}
			// Drop events that carry no signal at all (no status code, message,
			// or detail), which can happen after sanitization strips everything.
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
continue
}
evCopy := out
sanitized = append(sanitized, &evCopy)
}
entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized)
entry.UpstreamErrors = nil
}
if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Best-effort persistence: log here; gateway call sites are expected
		// to ignore this error rather than fail traffic on it.
log.Printf("[Ops] RecordError failed: %v", err)
return err
}
return nil
}
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
}
return s.opsRepo.ListErrorLogs(ctx, filter)
}
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
}
return detail, nil
}
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
bytesLen = len(raw)
if len(raw) == 0 {
return "", false, 0
}
var decoded any
if err := json.Unmarshal(raw, &decoded); err != nil {
// If it's not valid JSON, don't store (retry would not be reliable anyway).
return "", false, bytesLen
}
decoded = redactSensitiveJSON(decoded)
encoded, err := json.Marshal(decoded)
if err != nil {
return "", false, bytesLen
}
if len(encoded) <= maxBytes {
return string(encoded), false, bytesLen
}
// Trim conversation history to keep the most recent context.
if root, ok := decoded.(map[string]any); ok {
if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
encoded2, err2 := json.Marshal(trimmed)
if err2 == nil && len(encoded2) <= maxBytes {
return string(encoded2), true, bytesLen
}
// Fallthrough: keep shrinking.
decoded = trimmed
}
essential := shrinkToEssentials(root)
encoded3, err3 := json.Marshal(essential)
if err3 == nil && len(encoded3) <= maxBytes {
return string(encoded3), true, bytesLen
}
}
// Last resort: store a minimal placeholder (still valid JSON).
placeholder := map[string]any{
"request_body_truncated": true,
}
if model := extractString(decoded, "model"); model != "" {
placeholder["model"] = model
}
encoded4, err4 := json.Marshal(placeholder)
if err4 != nil {
return "", true, bytesLen
}
return string(encoded4), true, bytesLen
}
func redactSensitiveJSON(v any) any {
switch t := v.(type) {
case map[string]any:
out := make(map[string]any, len(t))
for k, vv := range t {
if isSensitiveKey(k) {
out[k] = "[REDACTED]"
continue
}
out[k] = redactSensitiveJSON(vv)
}
return out
case []any:
out := make([]any, 0, len(t))
for _, vv := range t {
out = append(out, redactSensitiveJSON(vv))
}
return out
default:
return v
}
}
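// Illustrative transformation (hypothetical payload):
//
//	in:  {"model":"m","api_key":"sk-123","nested":{"password":"x"}}
//	out: {"model":"m","api_key":"[REDACTED]","nested":{"password":"[REDACTED]"}}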
func isSensitiveKey(key string) bool {
k := strings.ToLower(strings.TrimSpace(key))
if k == "" {
return false
}
// Exact matches (common credential fields).
switch k {
case "authorization",
"proxy-authorization",
"x-api-key",
"api_key",
"apikey",
"access_token",
"refresh_token",
"id_token",
"session_token",
"token",
"password",
"passwd",
"passphrase",
"secret",
"client_secret",
"private_key",
"jwt",
"signature",
"accesskeyid",
"secretaccesskey":
return true
}
// Suffix matches.
for _, suffix := range []string{
"_secret",
"_token",
"_id_token",
"_session_token",
"_password",
"_passwd",
"_passphrase",
"_key",
"secret_key",
"private_key",
} {
if strings.HasSuffix(k, suffix) {
return true
}
}
// Substring matches (conservative, but errs on the side of privacy).
for _, sub := range []string{
"secret",
"token",
"password",
"passwd",
"passphrase",
"privatekey",
"private_key",
"apikey",
"api_key",
"accesskeyid",
"secretaccesskey",
"bearer",
"cookie",
"credential",
"session",
"jwt",
"signature",
} {
if strings.Contains(k, sub) {
return true
}
}
return false
}
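// Illustrative matches (not exhaustive):
//
//	isSensitiveKey("Authorization")  // true  (exact match after lowering)
//	isSensitiveKey("openai_api_key") // true  (suffix "_key")
//	isSensitiveKey("session_id")     // true  (substring "session")
//	isSensitiveKey("model")          // false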
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
// Supported: anthropic/openai: messages; gemini: contents.
if out, ok := trimArrayField(root, "messages", maxBytes); ok {
return out, true
}
if out, ok := trimArrayField(root, "contents", maxBytes); ok {
return out, true
}
return root, false
}
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
raw, ok := root[field]
if !ok {
return nil, false
}
arr, ok := raw.([]any)
if !ok || len(arr) == 0 {
return nil, false
}
// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
// We are dropping from the *front* of the array (oldest context first).
lo := 0
hi := len(arr) - 1 // inclusive; hi ensures at least one item remains
var best map[string]any
found := false
for lo <= hi {
mid := (lo + hi) / 2
candidateArr := arr[mid:]
if len(candidateArr) == 0 {
lo = mid + 1
continue
}
next := shallowCopyMap(root)
next[field] = candidateArr
encoded, err := json.Marshal(next)
if err != nil {
// If marshal fails, try dropping more.
lo = mid + 1
continue
}
if len(encoded) <= maxBytes {
best = next
found = true
// Try to keep more context by dropping fewer items.
hi = mid - 1
continue
}
// Need to drop more.
lo = mid + 1
}
if found {
return best, true
}
// Nothing fit (even with only one element); return the smallest slice and let the
// caller fall back to shrinkToEssentials().
next := shallowCopyMap(root)
next[field] = arr[len(arr)-1:]
return next, true
}
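// Worked example (illustrative): with 8 messages and a budget that fits only
// the last three, the search probes mid=3 (keep arr[3:], 5 items -> too big),
// then mid=5 (keep arr[5:], 3 items -> fits, remember it), then mid=4 (4 items
// -> too big), settling on arr[5:] after O(log n) Marshal calls instead of O(n).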
func shrinkToEssentials(root map[string]any) map[string]any {
out := make(map[string]any)
for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
if v, ok := root[key]; ok {
out[key] = v
}
}
// Keep only the last element of the conversation array.
if v, ok := root["messages"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["messages"] = []any{arr[len(arr)-1]}
}
}
if v, ok := root["contents"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["contents"] = []any{arr[len(arr)-1]}
}
}
return out
}
func shallowCopyMap(m map[string]any) map[string]any {
out := make(map[string]any, len(m))
for k, v := range m {
out[k] = v
}
return out
}
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
raw = strings.TrimSpace(raw)
if raw == "" {
return "", false
}
// Prefer JSON-safe sanitization when possible.
if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
return out, trunc
}
// Non-JSON: best-effort truncate.
if maxBytes > 0 && len(raw) > maxBytes {
return truncateString(raw, maxBytes), true
}
return raw, false
}
func extractString(v any, key string) string {
root, ok := v.(map[string]any)
if !ok {
return ""
}
s, _ := root[key].(string)
return strings.TrimSpace(s)
}
package service
import (
"context"
"encoding/json"
"errors"
"strings"
"time"
)
const (
opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
defaultCfg := defaultOpsEmailNotificationConfig()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// Initialize defaults on first read (best-effort).
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsEmailNotificationConfig{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupted JSON should not break ops UI; fall back to defaults.
return defaultCfg, nil
}
normalizeOpsEmailNotificationConfig(cfg)
return cfg, nil
}
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if req == nil {
return nil, errors.New("invalid request")
}
cfg, err := s.GetEmailNotificationConfig(ctx)
if err != nil {
return nil, err
}
if req.Alert != nil {
cfg.Alert.Enabled = req.Alert.Enabled
if req.Alert.Recipients != nil {
cfg.Alert.Recipients = req.Alert.Recipients
}
cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
}
if req.Report != nil {
cfg.Report.Enabled = req.Report.Enabled
if req.Report.Recipients != nil {
cfg.Report.Recipients = req.Report.Recipients
}
cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
}
if err := validateOpsEmailNotificationConfig(cfg); err != nil {
return nil, err
}
normalizeOpsEmailNotificationConfig(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
return nil, err
}
return cfg, nil
}
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
return &OpsEmailNotificationConfig{
Alert: OpsEmailAlertConfig{
Enabled: true,
Recipients: []string{},
MinSeverity: "",
RateLimitPerHour: 0,
BatchingWindowSeconds: 0,
IncludeResolvedAlerts: false,
},
Report: OpsEmailReportConfig{
Enabled: false,
Recipients: []string{},
DailySummaryEnabled: false,
DailySummarySchedule: "0 9 * * *",
WeeklySummaryEnabled: false,
WeeklySummarySchedule: "0 9 * * 1",
ErrorDigestEnabled: false,
ErrorDigestSchedule: "0 9 * * *",
ErrorDigestMinCount: 10,
AccountHealthEnabled: false,
AccountHealthSchedule: "0 9 * * *",
AccountHealthErrorRateThreshold: 10.0,
},
}
}
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
if cfg == nil {
return
}
if cfg.Alert.Recipients == nil {
cfg.Alert.Recipients = []string{}
}
if cfg.Report.Recipients == nil {
cfg.Report.Recipients = []string{}
}
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
if cfg.Report.DailySummarySchedule == "" {
cfg.Report.DailySummarySchedule = "0 9 * * *"
}
if cfg.Report.WeeklySummarySchedule == "" {
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
}
if cfg.Report.ErrorDigestSchedule == "" {
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
}
if cfg.Report.AccountHealthSchedule == "" {
cfg.Report.AccountHealthSchedule = "0 9 * * *"
}
}
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.Alert.RateLimitPerHour < 0 {
return errors.New("alert.rate_limit_per_hour must be >= 0")
}
if cfg.Alert.BatchingWindowSeconds < 0 {
return errors.New("alert.batching_window_seconds must be >= 0")
}
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
case "", "critical", "warning", "info":
default:
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
}
if cfg.Report.ErrorDigestMinCount < 0 {
return errors.New("report.error_digest_min_count must be >= 0")
}
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
}
return nil
}
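// Illustrative test sketch (an assumption, not part of the original commit;
// it would live in a _test.go file in this package):
//
//	func TestValidateOpsEmailNotificationConfig(t *testing.T) {
//		if err := validateOpsEmailNotificationConfig(defaultOpsEmailNotificationConfig()); err != nil {
//			t.Fatalf("default config should validate: %v", err)
//		}
//		bad := defaultOpsEmailNotificationConfig()
//		bad.Report.AccountHealthErrorRateThreshold = 150 // outside [0, 100]
//		if err := validateOpsEmailNotificationConfig(bad); err == nil {
//			t.Fatal("expected a threshold range error")
//		}
//	}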
// =========================
// Alert runtime settings
// =========================
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
return &OpsAlertRuntimeSettings{
EvaluationIntervalSeconds: 60,
DistributedLock: OpsDistributedLockSettings{
Enabled: true,
Key: opsAlertEvaluatorLeaderLockKeyDefault,
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
},
Silencing: OpsAlertSilencingSettings{
Enabled: false,
GlobalUntilRFC3339: "",
GlobalReason: "",
Entries: []OpsAlertSilenceEntry{},
},
}
}
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
if s == nil {
return
}
s.Key = strings.TrimSpace(s.Key)
if s.Key == "" {
s.Key = defaultKey
}
if s.TTLSeconds <= 0 {
s.TTLSeconds = defaultTTLSeconds
}
}
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
if s == nil {
return
}
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
if s.Entries == nil {
s.Entries = []OpsAlertSilenceEntry{}
}
for i := range s.Entries {
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
}
}
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
if strings.TrimSpace(s.Key) == "" {
return errors.New("distributed_lock.key is required")
}
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
}
return nil
}
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
parse := func(raw string) error {
if strings.TrimSpace(raw) == "" {
return nil
}
if _, err := time.Parse(time.RFC3339, raw); err != nil {
return errors.New("silencing time must be RFC3339")
}
return nil
}
if err := parse(s.GlobalUntilRFC3339); err != nil {
return err
}
for _, entry := range s.Entries {
if strings.TrimSpace(entry.UntilRFC3339) == "" {
return errors.New("silencing.entries.until_rfc3339 is required")
}
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
}
}
return nil
}
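// Hedged example of a payload that passes the checks above: the global window
// may be empty, but every explicit entry needs a parseable RFC3339 timestamp.
// The concrete values are illustrative only.
//
//	s := OpsAlertSilencingSettings{
//		Enabled:            true,
//		GlobalUntilRFC3339: "", // allowed: no global silence window
//		Entries: []OpsAlertSilenceEntry{{
//			Severities:   []string{"warning"},
//			UntilRFC3339: "2025-01-02T15:04:05Z",
//			Reason:       "planned maintenance",
//		}},
//	}
//	// validateOpsAlertSilencingSettings(s) returns nil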
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
defaultCfg := defaultOpsAlertRuntimeSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// First read: seed the defaults (best-effort) so later reads and updates operate on the same stored blob.
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAlertRuntimeSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupt or legacy JSON: fall back to defaults rather than failing the read path.
return defaultCfg, nil
}
if cfg.EvaluationIntervalSeconds <= 0 {
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
}
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
return cfg, nil
}
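// Sketch (an assumption about the consumer, which is not shown in this
// excerpt) of how the alert evaluator might turn these settings into a
// polling loop:
//
//	settings, err := svc.GetOpsAlertRuntimeSettings(ctx)
//	if err != nil {
//		return err
//	}
//	ticker := time.NewTicker(time.Duration(settings.EvaluationIntervalSeconds) * time.Second)
//	defer ticker.Stop()
//	for range ticker.C {
//		// evaluate alert rules, honoring settings.Silencing and the distributed lock
//	}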
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
}
if cfg.DistributedLock.Enabled {
if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
return nil, err
}
}
if cfg.Silencing.Enabled {
if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
return nil, err
}
}
defaultCfg := defaultOpsAlertRuntimeSettings()
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
return nil, err
}
// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
updated := &OpsAlertRuntimeSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
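// The marshal/unmarshal round trip above doubles as a cheap deep copy. A
// minimal sketch of the aliasing it prevents (values illustrative):
//
//	in := &OpsAlertRuntimeSettings{
//		EvaluationIntervalSeconds: 60,
//		Silencing: OpsAlertSilencingSettings{
//			Entries: []OpsAlertSilenceEntry{{UntilRFC3339: "2025-01-02T15:04:05Z"}},
//		},
//	}
//	out, _ := svc.UpdateOpsAlertRuntimeSettings(ctx, in)
//	out.Silencing.Entries[0].Reason = "edited"
//	// in.Silencing.Entries[0].Reason is still "": out owns its own slices.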
// =========================
// Advanced settings
// =========================
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
return &OpsAdvancedSettings{
DataRetention: OpsDataRetentionSettings{
CleanupEnabled: false,
CleanupSchedule: "0 2 * * *",
ErrorLogRetentionDays: 30,
MinuteMetricsRetentionDays: 30,
HourlyMetricsRetentionDays: 30,
},
Aggregation: OpsAggregationSettings{
AggregationEnabled: false,
},
}
}
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
if cfg == nil {
return
}
cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
if cfg.DataRetention.CleanupSchedule == "" {
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
}
if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
cfg.DataRetention.ErrorLogRetentionDays = 30
}
if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
cfg.DataRetention.MinuteMetricsRetentionDays = 30
}
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
cfg.DataRetention.HourlyMetricsRetentionDays = 30
}
}
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
return errors.New("error_log_retention_days must be between 1 and 365")
}
if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
return errors.New("minute_metrics_retention_days must be between 1 and 365")
}
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
}
return nil
}
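// Note the division of labor between the helpers above and the methods below:
// UpdateOpsAdvancedSettings validates first, so an out-of-range value such as
// 0 retention days is rejected with an error, while GetOpsAdvancedSettings
// only normalizes what is already stored, silently lifting stale non-positive
// values back to the 30-day default. A minimal sketch:
//
//	bad := defaultOpsAdvancedSettings()
//	bad.DataRetention.ErrorLogRetentionDays = 0
//	err := validateOpsAdvancedSettings(bad) // non-nil: rejected on write
//	normalizeOpsAdvancedSettings(bad)       // repaired to 30 on read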
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
defaultCfg := defaultOpsAdvancedSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAdvancedSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
normalizeOpsAdvancedSettings(cfg)
return cfg, nil
}
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if err := validateOpsAdvancedSettings(cfg); err != nil {
return nil, err
}
normalizeOpsAdvancedSettings(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
return nil, err
}
updated := &OpsAdvancedSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
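// Sketch: round-tripping the defaults through JSON shows the exact blob shape
// persisted under SettingKeyOpsAdvancedSettings (field names follow the
// struct tags in the models below):
//
//	b, _ := json.Marshal(defaultOpsAdvancedSettings())
//	// {"data_retention":{"cleanup_enabled":false,"cleanup_schedule":"0 2 * * *",
//	//  "error_log_retention_days":30,"minute_metrics_retention_days":30,
//	//  "hourly_metrics_retention_days":30},"aggregation":{"aggregation_enabled":false}}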
package service
// Ops settings models stored in DB `settings` table (JSON blobs).
type OpsEmailNotificationConfig struct {
Alert OpsEmailAlertConfig `json:"alert"`
Report OpsEmailReportConfig `json:"report"`
}
type OpsEmailAlertConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
MinSeverity string `json:"min_severity"`
RateLimitPerHour int `json:"rate_limit_per_hour"`
BatchingWindowSeconds int `json:"batching_window_seconds"`
IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
}
type OpsEmailReportConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
DailySummaryEnabled bool `json:"daily_summary_enabled"`
DailySummarySchedule string `json:"daily_summary_schedule"`
WeeklySummaryEnabled bool `json:"weekly_summary_enabled"`
WeeklySummarySchedule string `json:"weekly_summary_schedule"`
ErrorDigestEnabled bool `json:"error_digest_enabled"`
ErrorDigestSchedule string `json:"error_digest_schedule"`
ErrorDigestMinCount int `json:"error_digest_min_count"`
AccountHealthEnabled bool `json:"account_health_enabled"`
AccountHealthSchedule string `json:"account_health_schedule"`
AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
}
// OpsEmailNotificationConfigUpdateRequest allows section-level partial
// updates: a nil section leaves the stored settings untouched, while a
// non-nil section replaces every field in that section (except a nil
// Recipients slice, which preserves the stored recipients). The frontend can
// still send the full config shape.
type OpsEmailNotificationConfigUpdateRequest struct {
Alert *OpsEmailAlertConfig `json:"alert"`
Report *OpsEmailReportConfig `json:"report"`
}
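// Hedged example of a section-level partial update: only Report is set, so
// the merge logic leaves the stored alert config alone. Note that within a
// present section every field is replaced, so clients should send the
// complete section rather than only the fields they changed. Values
// illustrative.
//
//	req := &OpsEmailNotificationConfigUpdateRequest{
//		Report: &OpsEmailReportConfig{
//			Enabled:              true,
//			Recipients:           []string{"ops@example.com"},
//			DailySummaryEnabled:  true,
//			DailySummarySchedule: "0 8 * * *",
//		},
//	}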
type OpsDistributedLockSettings struct {
Enabled bool `json:"enabled"`
Key string `json:"key"`
TTLSeconds int `json:"ttl_seconds"`
}
type OpsAlertSilenceEntry struct {
RuleID *int64 `json:"rule_id,omitempty"`
Severities []string `json:"severities,omitempty"`
UntilRFC3339 string `json:"until_rfc3339"`
Reason string `json:"reason"`
}
type OpsAlertSilencingSettings struct {
Enabled bool `json:"enabled"`
GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
GlobalReason string `json:"global_reason"`
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}
type OpsAlertRuntimeSettings struct {
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
Silencing OpsAlertSilencingSettings `json:"silencing"`
}
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
DataRetention OpsDataRetentionSettings `json:"data_retention"`
Aggregation OpsAggregationSettings `json:"aggregation"`
}
type OpsDataRetentionSettings struct {
CleanupEnabled bool `json:"cleanup_enabled"`
CleanupSchedule string `json:"cleanup_schedule"`
ErrorLogRetentionDays int `json:"error_log_retention_days"`
MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"`
HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"`
}
type OpsAggregationSettings struct {
AggregationEnabled bool `json:"aggregation_enabled"`
}
package service
import "time"
type OpsThroughputTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
QPS float64 `json:"qps"`
TPS float64 `json:"tps"`
}
type OpsThroughputPlatformBreakdownItem struct {
Platform string `json:"platform"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
type OpsThroughputGroupBreakdownItem struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
type OpsThroughputTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsThroughputTrendPoint `json:"points"`
// Optional drilldown helpers:
// - When no platform/group is selected: returns totals by platform.
// - When platform is selected but group is not: returns top groups in that platform.
ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"`
}
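// Assumed relationship between the count and rate fields (inferred from the
// field names; the repository layer computes the actual values): for a bucket
// of width bucketSeconds,
//
//	qps := float64(p.RequestCount) / float64(bucketSeconds)
//	tps := float64(p.TokenConsumed) / float64(bucketSeconds)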
type OpsErrorTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
}
type OpsErrorTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsErrorTrendPoint `json:"points"`
}
type OpsErrorDistributionItem struct {
StatusCode int `json:"status_code"`
Total int64 `json:"total"`
SLA int64 `json:"sla"`
BusinessLimited int64 `json:"business_limited"`
}
type OpsErrorDistributionResponse struct {
Total int64 `json:"total"`
Items []*OpsErrorDistributionItem `json:"items"`
}
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
}
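// Illustrative call site (an assumption; the `svc` wiring is not shown):
// fetch a 24-hour throughput trend in 5-minute buckets, optionally scoped to
// a single platform.
//
//	end := time.Now().UTC()
//	filter := &OpsDashboardFilter{
//		StartTime: end.Add(-24 * time.Hour),
//		EndTime:   end,
//		Platform:  "claude", // illustrative; empty means all platforms
//	}
//	trend, err := svc.GetThroughputTrend(ctx, filter, 300) // 300s buckets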