"frontend/vscode:/vscode.git/clone" did not exist on "260c152166717747302797cc09c6c76be73de31a"
Unverified Commit fd8473f2 authored by 程序猿MT, committed by GitHub

Merge branch 'Wei-Shaw:main' into main

parents 18b8bd43 cc4910dd
package service
import (
"context"
"database/sql"
"errors"
"log"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
// Resolve query mode (requested via query param, or DB default).
filter.QueryMode = s.resolveOpsQueryMode(ctx, filter.QueryMode)
overview, err := s.opsRepo.GetDashboardOverview(ctx, filter)
if err != nil {
if errors.Is(err, ErrOpsPreaggregatedNotPopulated) {
return nil, infraerrors.Conflict("OPS_PREAGG_NOT_READY", "Pre-aggregated ops metrics are not populated yet")
}
return nil, err
}
// Best-effort system health + jobs; dashboard metrics should still render if these are missing.
if metrics, err := s.opsRepo.GetLatestSystemMetrics(ctx, 1); err == nil {
// Attach config-derived limits so the UI can show "current / max" for connection pools.
// These are best-effort and should never block the dashboard rendering.
if s != nil && s.cfg != nil {
if s.cfg.Database.MaxOpenConns > 0 {
metrics.DBMaxOpenConns = intPtr(s.cfg.Database.MaxOpenConns)
}
if s.cfg.Redis.PoolSize > 0 {
metrics.RedisPoolSize = intPtr(s.cfg.Redis.PoolSize)
}
}
overview.SystemMetrics = metrics
} else if !errors.Is(err, sql.ErrNoRows) {
log.Printf("[Ops] GetLatestSystemMetrics failed: %v", err)
}
if heartbeats, err := s.opsRepo.ListJobHeartbeats(ctx); err == nil {
overview.JobHeartbeats = heartbeats
} else {
log.Printf("[Ops] ListJobHeartbeats failed: %v", err)
}
overview.HealthScore = computeDashboardHealthScore(time.Now().UTC(), overview)
return overview, nil
}
func (s *OpsService) resolveOpsQueryMode(ctx context.Context, requested OpsQueryMode) OpsQueryMode {
if requested.IsValid() {
// Allow "auto" to be disabled via config until preagg is proven stable in production.
// Forced `preagg` via query param still works.
if requested == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return requested
}
mode := OpsQueryModeAuto
if s != nil && s.settingRepo != nil {
if raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsQueryModeDefault); err == nil {
mode = ParseOpsQueryMode(raw)
}
}
if mode == OpsQueryModeAuto && s != nil && s.cfg != nil && !s.cfg.Ops.UsePreaggregatedTables {
return OpsQueryModeRaw
}
return mode
}
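// Resolution examples (hypothetical config): a requested "preagg" is honored as-is;
// "auto" with cfg.Ops.UsePreaggregatedTables=false downgrades to "raw"; an invalid or
// empty mode falls back to the DB default (or "auto"), subject to the same downgrade.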
package service
import "time"
type OpsDashboardFilter struct {
StartTime time.Time
EndTime time.Time
Platform string
GroupID *int64
// QueryMode controls whether dashboard queries should use raw logs or pre-aggregated tables.
// Expected values: auto/raw/preagg (see OpsQueryMode).
QueryMode OpsQueryMode
}
type OpsRateSummary struct {
Current float64 `json:"current"`
Peak float64 `json:"peak"`
Avg float64 `json:"avg"`
}
type OpsPercentiles struct {
P50 *int `json:"p50_ms"`
P90 *int `json:"p90_ms"`
P95 *int `json:"p95_ms"`
P99 *int `json:"p99_ms"`
Avg *int `json:"avg_ms"`
Max *int `json:"max_ms"`
}
type OpsDashboardOverview struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
// HealthScore is a backend-computed overall health score (0-100).
// It is derived from the monitored metrics in this overview, plus best-effort system metrics/job heartbeats.
HealthScore int `json:"health_score"`
// Latest system-level snapshot (window=1m, global).
SystemMetrics *OpsSystemMetricsSnapshot `json:"system_metrics"`
// Background jobs health (heartbeats).
JobHeartbeats []*OpsJobHeartbeat `json:"job_heartbeats"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
RequestCountTotal int64 `json:"request_count_total"`
RequestCountSLA int64 `json:"request_count_sla"`
TokenConsumed int64 `json:"token_consumed"`
SLA float64 `json:"sla"`
ErrorRate float64 `json:"error_rate"`
UpstreamErrorRate float64 `json:"upstream_error_rate"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
QPS OpsRateSummary `json:"qps"`
TPS OpsRateSummary `json:"tps"`
Duration OpsPercentiles `json:"duration"`
TTFT OpsPercentiles `json:"ttft"`
}
type OpsLatencyHistogramBucket struct {
Range string `json:"range"`
Count int64 `json:"count"`
}
// OpsLatencyHistogramResponse is a coarse latency-distribution histogram (successful requests only).
// The Ops dashboard uses it to quickly identify tail-latency regressions.
type OpsLatencyHistogramResponse struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Platform string `json:"platform"`
GroupID *int64 `json:"group_id"`
TotalRequests int64 `json:"total_requests"`
Buckets []*OpsLatencyHistogramBucket `json:"buckets"`
}
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorTrend(ctx, filter, bucketSeconds)
}
func (s *OpsService) GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetErrorDistribution(ctx, filter)
}
package service
import (
"math"
"time"
)
// computeDashboardHealthScore computes a 0-100 health score from the metrics returned by the dashboard overview.
//
// Design goals:
// - Backend-owned scoring (UI only displays).
// - Layered scoring: Business Health (70%) + Infrastructure Health (30%).
// - Avoids double-counting (e.g., a DB failure affects both infra and business metrics).
// - Conservative + stable: penalize clear degradations; avoid overreacting to missing/idle data.
func computeDashboardHealthScore(now time.Time, overview *OpsDashboardOverview) int {
if overview == nil {
return 0
}
// Idle/no-data: avoid showing a "bad" score when there is no traffic.
// UI can still render a gray/idle state based on QPS + error rate.
if overview.RequestCountSLA <= 0 && overview.RequestCountTotal <= 0 && overview.ErrorCountTotal <= 0 {
return 100
}
businessHealth := computeBusinessHealth(overview)
infraHealth := computeInfraHealth(now, overview)
// Weighted combination: 70% business + 30% infrastructure
score := businessHealth*0.7 + infraHealth*0.3
return int(math.Round(clampFloat64(score, 0, 100)))
}
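// Worked example (hypothetical sub-scores): businessHealth=80, infraHealth=50
// → score = 80*0.7 + 50*0.3 = 56 + 15 = 71.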
// computeBusinessHealth calculates business health score (0-100)
// Components: SLA (50%) + Error Rate (30%) + Latency (20%)
func computeBusinessHealth(overview *OpsDashboardOverview) float64 {
// SLA score: 99.5% → 100, 95% → 0 (linear)
slaScore := 100.0
slaPct := clampFloat64(overview.SLA*100, 0, 100)
if slaPct < 99.5 {
if slaPct >= 95 {
slaScore = (slaPct - 95) / 4.5 * 100
} else {
slaScore = 0
}
}
// Error rate score: 0.5% → 100, 5% → 0 (linear)
// Uses the worse of the request error rate and the upstream error rate
errorScore := 100.0
errorPct := clampFloat64(overview.ErrorRate*100, 0, 100)
upstreamPct := clampFloat64(overview.UpstreamErrorRate*100, 0, 100)
combinedErrorPct := math.Max(errorPct, upstreamPct) // Use worst case
if combinedErrorPct > 0.5 {
if combinedErrorPct <= 5 {
errorScore = (5 - combinedErrorPct) / 4.5 * 100
} else {
errorScore = 0
}
}
// Latency score: 1s → 100, 10s → 0 (linear)
// Uses P99 of duration (TTFT is less critical for overall health)
latencyScore := 100.0
if overview.Duration.P99 != nil {
p99 := float64(*overview.Duration.P99)
if p99 > 1000 {
if p99 <= 10000 {
latencyScore = (10000 - p99) / 9000 * 100
} else {
latencyScore = 0
}
}
}
// Weighted combination
return slaScore*0.5 + errorScore*0.3 + latencyScore*0.2
}
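// Worked example (hypothetical metrics): SLA=97% → slaScore=(97-95)/4.5*100 ≈ 44.4;
// combined error rate 1% → errorScore=(5-1)/4.5*100 ≈ 88.9; P99=2500ms →
// latencyScore=(10000-2500)/9000*100 ≈ 83.3; total ≈ 44.4*0.5 + 88.9*0.3 + 83.3*0.2 ≈ 65.6.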
// computeInfraHealth calculates infrastructure health score (0-100)
// Components: Storage (40%) + Compute Resources (30%) + Background Jobs (30%)
func computeInfraHealth(now time.Time, overview *OpsDashboardOverview) float64 {
// Storage score: DB critical, Redis less critical
storageScore := 100.0
if overview.SystemMetrics != nil {
if overview.SystemMetrics.DBOK != nil && !*overview.SystemMetrics.DBOK {
storageScore = 0 // DB failure is critical
} else if overview.SystemMetrics.RedisOK != nil && !*overview.SystemMetrics.RedisOK {
storageScore = 50 // Redis failure is degraded but not critical
}
}
// Compute resources score: CPU + Memory
computeScore := 100.0
if overview.SystemMetrics != nil {
cpuScore := 100.0
if overview.SystemMetrics.CPUUsagePercent != nil {
cpuPct := clampFloat64(*overview.SystemMetrics.CPUUsagePercent, 0, 100)
if cpuPct > 80 {
if cpuPct <= 100 {
cpuScore = (100 - cpuPct) / 20 * 100
} else {
cpuScore = 0
}
}
}
memScore := 100.0
if overview.SystemMetrics.MemoryUsagePercent != nil {
memPct := clampFloat64(*overview.SystemMetrics.MemoryUsagePercent, 0, 100)
if memPct > 85 {
if memPct <= 100 {
memScore = (100 - memPct) / 15 * 100
} else {
memScore = 0
}
}
}
computeScore = (cpuScore + memScore) / 2
}
// Background jobs score
jobScore := 100.0
failedJobs := 0
totalJobs := 0
for _, hb := range overview.JobHeartbeats {
if hb == nil {
continue
}
totalJobs++
if hb.LastErrorAt != nil && (hb.LastSuccessAt == nil || hb.LastErrorAt.After(*hb.LastSuccessAt)) {
failedJobs++
} else if hb.LastSuccessAt != nil && now.Sub(*hb.LastSuccessAt) > 15*time.Minute {
failedJobs++
}
}
if totalJobs > 0 && failedJobs > 0 {
jobScore = (1 - float64(failedJobs)/float64(totalJobs)) * 100
}
// Weighted combination
return storageScore*0.4 + computeScore*0.3 + jobScore*0.3
}
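// Worked example (hypothetical state): Redis down → storageScore=50; CPU 90% / mem 40% →
// computeScore=(50+100)/2=75; 1 of 4 jobs failing → jobScore=75;
// total = 50*0.4 + 75*0.3 + 75*0.3 = 65.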
func clampFloat64(v float64, min float64, max float64) float64 {
if v < min {
return min
}
if v > max {
return max
}
return v
}
//go:build unit
package service
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestComputeDashboardHealthScore_IdleReturns100(t *testing.T) {
t.Parallel()
score := computeDashboardHealthScore(time.Now().UTC(), &OpsDashboardOverview{})
require.Equal(t, 100, score)
}
func TestComputeDashboardHealthScore_DegradesOnBadSignals(t *testing.T) {
t.Parallel()
ov := &OpsDashboardOverview{
RequestCountTotal: 100,
RequestCountSLA: 100,
SuccessCount: 90,
ErrorCountTotal: 10,
ErrorCountSLA: 10,
SLA: 0.90,
ErrorRate: 0.10,
UpstreamErrorRate: 0.08,
Duration: OpsPercentiles{P99: intPtr(20_000)},
TTFT: OpsPercentiles{P99: intPtr(2_000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(98.0),
MemoryUsagePercent: float64Ptr(97.0),
DBConnWaiting: intPtr(3),
ConcurrencyQueueDepth: intPtr(10),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "job-a",
LastErrorAt: timePtr(time.Now().UTC().Add(-1 * time.Minute)),
LastError: stringPtr("boom"),
},
},
}
score := computeDashboardHealthScore(time.Now().UTC(), ov)
require.Less(t, score, 80)
require.GreaterOrEqual(t, score, 0)
}
func TestComputeDashboardHealthScore_Comprehensive(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin int
wantMax int
}{
{
name: "nil overview returns 0",
overview: nil,
wantMin: 0,
wantMax: 0,
},
{
name: "perfect health",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
TTFT: OpsPercentiles{P99: intPtr(100)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "good health - SLA 99.8%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.003,
UpstreamErrorRate: 0.001,
Duration: OpsPercentiles{P99: intPtr(800)},
TTFT: OpsPercentiles{P99: intPtr(200)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(50),
MemoryUsagePercent: float64Ptr(60),
},
},
wantMin: 95,
wantMax: 100,
},
{
name: "medium health - SLA 96%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.96,
ErrorRate: 0.02,
UpstreamErrorRate: 0.01,
Duration: OpsPercentiles{P99: intPtr(3000)},
TTFT: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(70),
MemoryUsagePercent: float64Ptr(75),
},
},
wantMin: 60,
wantMax: 85,
},
{
name: "DB failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 70,
wantMax: 90,
},
{
name: "Redis failure",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "high CPU usage",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 100,
},
{
name: "combined failures - business degraded + infra healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.90,
ErrorRate: 0.05,
UpstreamErrorRate: 0.02,
Duration: OpsPercentiles{P99: intPtr(10000)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(20),
MemoryUsagePercent: float64Ptr(30),
},
},
wantMin: 25,
wantMax: 50,
},
{
name: "combined failures - business healthy + infra degraded",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
RequestCountSLA: 1000,
SLA: 0.998,
ErrorRate: 0.001,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(600)},
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(95),
MemoryUsagePercent: float64Ptr(95),
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeDashboardHealthScore(time.Now().UTC(), tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %d", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %d", tt.wantMax)
require.GreaterOrEqual(t, score, 0, "score must be >= 0")
require.LessOrEqual(t, score, 100, "score must be <= 100")
})
}
}
func TestComputeBusinessHealth(t *testing.T) {
t.Parallel()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "perfect metrics",
overview: &OpsDashboardOverview{
SLA: 1.0,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 99.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 100,
wantMax: 100,
},
{
name: "SLA boundary 95%",
overview: &OpsDashboardOverview{
SLA: 0.95,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 50,
wantMax: 60,
},
{
name: "error rate boundary 0.5%",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.005,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "latency boundary 1000ms",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0,
UpstreamErrorRate: 0,
Duration: OpsPercentiles{P99: intPtr(1000)},
},
wantMin: 95,
wantMax: 100,
},
{
name: "upstream error dominates",
overview: &OpsDashboardOverview{
SLA: 0.995,
ErrorRate: 0.001,
UpstreamErrorRate: 0.03,
Duration: OpsPercentiles{P99: intPtr(500)},
},
wantMin: 75,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeBusinessHealth(tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func TestComputeInfraHealth(t *testing.T) {
t.Parallel()
now := time.Now().UTC()
tests := []struct {
name string
overview *OpsDashboardOverview
wantMin float64
wantMax float64
}{
{
name: "all infrastructure healthy",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 100,
wantMax: 100,
},
{
name: "DB down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(false),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 50,
wantMax: 70,
},
{
name: "Redis down",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(false),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 80,
wantMax: 95,
},
{
name: "CPU at 90%",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(90),
MemoryUsagePercent: float64Ptr(40),
},
},
wantMin: 85,
wantMax: 95,
},
{
name: "failed background job",
overview: &OpsDashboardOverview{
RequestCountTotal: 1000,
SystemMetrics: &OpsSystemMetricsSnapshot{
DBOK: boolPtr(true),
RedisOK: boolPtr(true),
CPUUsagePercent: float64Ptr(30),
MemoryUsagePercent: float64Ptr(40),
},
JobHeartbeats: []*OpsJobHeartbeat{
{
JobName: "test-job",
LastErrorAt: &now,
},
},
},
wantMin: 70,
wantMax: 90,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
score := computeInfraHealth(now, tt.overview)
require.GreaterOrEqual(t, score, tt.wantMin, "score should be >= %.1f", tt.wantMin)
require.LessOrEqual(t, score, tt.wantMax, "score should be <= %.1f", tt.wantMax)
require.GreaterOrEqual(t, score, 0.0, "score must be >= 0")
require.LessOrEqual(t, score, 100.0, "score must be <= 100")
})
}
}
func timePtr(v time.Time) *time.Time { return &v }
func stringPtr(v string) *string { return &v }
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetLatencyHistogram(ctx, filter)
}
package service
import (
"context"
"database/sql"
"errors"
"fmt"
"log"
"math"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/mem"
)
const (
opsMetricsCollectorJobName = "ops_metrics_collector"
opsMetricsCollectorMinInterval = 60 * time.Second
opsMetricsCollectorMaxInterval = 1 * time.Hour
opsMetricsCollectorTimeout = 10 * time.Second
opsMetricsCollectorLeaderLockKey = "ops:metrics:collector:leader"
opsMetricsCollectorLeaderLockTTL = 90 * time.Second
opsMetricsCollectorHeartbeatTimeout = 2 * time.Second
bytesPerMB = 1024 * 1024
)
var opsMetricsCollectorAdvisoryLockID = hashAdvisoryLockID(opsMetricsCollectorLeaderLockKey)
type OpsMetricsCollector struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
accountRepo AccountRepository
concurrencyService *ConcurrencyService
db *sql.DB
redisClient *redis.Client
instanceID string
lastCgroupCPUUsageNanos uint64
lastCgroupCPUSampleAt time.Time
stopCh chan struct{}
startOnce sync.Once
stopOnce sync.Once
skipLogMu sync.Mutex
skipLogAt time.Time
}
func NewOpsMetricsCollector(
opsRepo OpsRepository,
settingRepo SettingRepository,
accountRepo AccountRepository,
concurrencyService *ConcurrencyService,
db *sql.DB,
redisClient *redis.Client,
cfg *config.Config,
) *OpsMetricsCollector {
return &OpsMetricsCollector{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
accountRepo: accountRepo,
concurrencyService: concurrencyService,
db: db,
redisClient: redisClient,
instanceID: uuid.NewString(),
}
}
func (c *OpsMetricsCollector) Start() {
if c == nil {
return
}
c.startOnce.Do(func() {
if c.stopCh == nil {
c.stopCh = make(chan struct{})
}
go c.run()
})
}
func (c *OpsMetricsCollector) Stop() {
if c == nil {
return
}
c.stopOnce.Do(func() {
if c.stopCh != nil {
close(c.stopCh)
}
})
}
func (c *OpsMetricsCollector) run() {
// Collect once immediately so the dashboard has data soon after startup.
c.collectOnce()
for {
interval := c.getInterval()
timer := time.NewTimer(interval)
select {
case <-timer.C:
c.collectOnce()
case <-c.stopCh:
timer.Stop()
return
}
}
}
func (c *OpsMetricsCollector) getInterval() time.Duration {
interval := opsMetricsCollectorMinInterval
if c.settingRepo == nil {
return interval
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
raw, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMetricsIntervalSeconds)
if err != nil {
return interval
}
raw = strings.TrimSpace(raw)
if raw == "" {
return interval
}
seconds, err := strconv.Atoi(raw)
if err != nil {
return interval
}
if seconds < int(opsMetricsCollectorMinInterval.Seconds()) {
seconds = int(opsMetricsCollectorMinInterval.Seconds())
}
if seconds > int(opsMetricsCollectorMaxInterval.Seconds()) {
seconds = int(opsMetricsCollectorMaxInterval.Seconds())
}
return time.Duration(seconds) * time.Second
}
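// Clamping examples (hypothetical setting values): "30" → 60s (min), "300" → 5m,
// "7200" → 1h (max); empty or non-numeric values fall back to the 60s default.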
func (c *OpsMetricsCollector) collectOnce() {
if c == nil {
return
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return
}
if c.opsRepo == nil {
return
}
if c.db == nil {
return
}
ctx, cancel := context.WithTimeout(context.Background(), opsMetricsCollectorTimeout)
defer cancel()
if !c.isMonitoringEnabled(ctx) {
return
}
release, ok := c.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
startedAt := time.Now().UTC()
err := c.collectAndPersist(ctx)
finishedAt := time.Now().UTC()
durationMs := finishedAt.Sub(startedAt).Milliseconds()
dur := durationMs
runAt := startedAt
if err != nil {
msg := truncateString(err.Error(), 2048)
errAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastErrorAt: &errAt,
LastError: &msg,
LastDurationMs: &dur,
})
log.Printf("[OpsMetricsCollector] collect failed: %v", err)
return
}
successAt := finishedAt
hbCtx, hbCancel := context.WithTimeout(context.Background(), opsMetricsCollectorHeartbeatTimeout)
defer hbCancel()
_ = c.opsRepo.UpsertJobHeartbeat(hbCtx, &OpsUpsertJobHeartbeatInput{
JobName: opsMetricsCollectorJobName,
LastRunAt: &runAt,
LastSuccessAt: &successAt,
LastDurationMs: &dur,
})
}
func (c *OpsMetricsCollector) isMonitoringEnabled(ctx context.Context) bool {
if c == nil {
return false
}
if c.cfg != nil && !c.cfg.Ops.Enabled {
return false
}
if c.settingRepo == nil {
return true
}
if ctx == nil {
ctx = context.Background()
}
value, err := c.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
if err != nil {
// Missing key defaults to enabled; fail-open on other errors so the collector
// never becomes a hard dependency.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (c *OpsMetricsCollector) collectAndPersist(ctx context.Context) error {
if ctx == nil {
ctx = context.Background()
}
// Align to stable minute boundaries to avoid partial buckets and to maximize cache hits.
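// e.g. a hypothetical now of 12:05:37Z gives windowStart=12:04:00Z, windowEnd=12:05:00Z.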
now := time.Now().UTC()
windowEnd := now.Truncate(time.Minute)
windowStart := windowEnd.Add(-1 * time.Minute)
sys, err := c.collectSystemStats(ctx)
if err != nil {
// Continue; system stats are best-effort.
log.Printf("[OpsMetricsCollector] system stats error: %v", err)
}
dbOK := c.checkDB(ctx)
redisOK := c.checkRedis(ctx)
active, idle := c.dbPoolStats()
redisTotal, redisIdle, redisStatsOK := c.redisPoolStats()
successCount, tokenConsumed, err := c.queryUsageCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage counts: %w", err)
}
duration, ttft, err := c.queryUsageLatency(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query usage latency: %w", err)
}
errorTotal, businessLimited, errorSLA, upstreamExcl, upstream429, upstream529, err := c.queryErrorCounts(ctx, windowStart, windowEnd)
if err != nil {
return fmt.Errorf("query error counts: %w", err)
}
windowSeconds := windowEnd.Sub(windowStart).Seconds()
if windowSeconds <= 0 {
windowSeconds = 60
}
requestTotal := successCount + errorTotal
qps := float64(requestTotal) / windowSeconds
tps := float64(tokenConsumed) / windowSeconds
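// e.g. (hypothetical) 120 requests and 9000 tokens in a 60s window → qps=2.0, tps=150.0.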
goroutines := runtime.NumGoroutine()
concurrencyQueueDepth := c.collectConcurrencyQueueDepth(ctx)
input := &OpsInsertSystemMetricsInput{
CreatedAt: windowEnd,
WindowMinutes: 1,
SuccessCount: successCount,
ErrorCountTotal: errorTotal,
BusinessLimitedCount: businessLimited,
ErrorCountSLA: errorSLA,
UpstreamErrorCountExcl429529: upstreamExcl,
Upstream429Count: upstream429,
Upstream529Count: upstream529,
TokenConsumed: tokenConsumed,
QPS: float64Ptr(roundTo1DP(qps)),
TPS: float64Ptr(roundTo1DP(tps)),
DurationP50Ms: duration.p50,
DurationP90Ms: duration.p90,
DurationP95Ms: duration.p95,
DurationP99Ms: duration.p99,
DurationAvgMs: duration.avg,
DurationMaxMs: duration.max,
TTFTP50Ms: ttft.p50,
TTFTP90Ms: ttft.p90,
TTFTP95Ms: ttft.p95,
TTFTP99Ms: ttft.p99,
TTFTAvgMs: ttft.avg,
TTFTMaxMs: ttft.max,
CPUUsagePercent: sys.cpuUsagePercent,
MemoryUsedMB: sys.memoryUsedMB,
MemoryTotalMB: sys.memoryTotalMB,
MemoryUsagePercent: sys.memoryUsagePercent,
DBOK: boolPtr(dbOK),
RedisOK: boolPtr(redisOK),
RedisConnTotal: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisTotal)
}(),
RedisConnIdle: func() *int {
if !redisStatsOK {
return nil
}
return intPtr(redisIdle)
}(),
DBConnActive: intPtr(active),
DBConnIdle: intPtr(idle),
GoroutineCount: intPtr(goroutines),
ConcurrencyQueueDepth: concurrencyQueueDepth,
}
return c.opsRepo.InsertSystemMetrics(ctx, input)
}
func (c *OpsMetricsCollector) collectConcurrencyQueueDepth(parentCtx context.Context) *int {
if c == nil || c.accountRepo == nil || c.concurrencyService == nil {
return nil
}
if parentCtx == nil {
parentCtx = context.Background()
}
// Best-effort: never let concurrency sampling break the metrics collector.
ctx, cancel := context.WithTimeout(parentCtx, 2*time.Second)
defer cancel()
accounts, err := c.accountRepo.ListSchedulable(ctx)
if err != nil {
return nil
}
if len(accounts) == 0 {
zero := 0
return &zero
}
batch := make([]AccountWithConcurrency, 0, len(accounts))
for _, acc := range accounts {
if acc.ID <= 0 {
continue
}
maxConc := acc.Concurrency
if maxConc < 0 {
maxConc = 0
}
batch = append(batch, AccountWithConcurrency{
ID: acc.ID,
MaxConcurrency: maxConc,
})
}
if len(batch) == 0 {
zero := 0
return &zero
}
loadMap, err := c.concurrencyService.GetAccountsLoadBatch(ctx, batch)
if err != nil {
return nil
}
var total int64
for _, info := range loadMap {
if info == nil || info.WaitingCount <= 0 {
continue
}
total += int64(info.WaitingCount)
}
if total < 0 {
total = 0
}
maxInt := int64(^uint(0) >> 1)
if total > maxInt {
total = maxInt
}
v := int(total)
return &v
}
type opsCollectedPercentiles struct {
p50 *int
p90 *int
p95 *int
p99 *int
avg *float64
max *int
}
func (c *OpsMetricsCollector) queryUsageCounts(ctx context.Context, start, end time.Time) (successCount int64, tokenConsumed int64, err error) {
q := `
SELECT
COALESCE(COUNT(*), 0) AS success_count,
COALESCE(SUM(input_tokens + output_tokens + cache_creation_tokens + cache_read_tokens), 0) AS token_consumed
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2`
var tokens sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&successCount, &tokens); err != nil {
return 0, 0, err
}
if tokens.Valid {
tokenConsumed = tokens.Int64
}
return successCount, tokenConsumed, nil
}
func (c *OpsMetricsCollector) queryUsageLatency(ctx context.Context, start, end time.Time) (duration opsCollectedPercentiles, ttft opsCollectedPercentiles, err error) {
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY duration_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
AVG(duration_ms) AS avg_ms,
MAX(duration_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND duration_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
duration.p50 = floatToIntPtr(p50)
duration.p90 = floatToIntPtr(p90)
duration.p95 = floatToIntPtr(p95)
duration.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
duration.avg = &v
}
if max.Valid {
v := int(max.Int64)
duration.max = &v
}
}
{
q := `
SELECT
percentile_cont(0.50) WITHIN GROUP (ORDER BY first_token_ms) AS p50,
percentile_cont(0.90) WITHIN GROUP (ORDER BY first_token_ms) AS p90,
percentile_cont(0.95) WITHIN GROUP (ORDER BY first_token_ms) AS p95,
percentile_cont(0.99) WITHIN GROUP (ORDER BY first_token_ms) AS p99,
AVG(first_token_ms) AS avg_ms,
MAX(first_token_ms) AS max_ms
FROM usage_logs
WHERE created_at >= $1 AND created_at < $2
AND first_token_ms IS NOT NULL`
var p50, p90, p95, p99 sql.NullFloat64
var avg sql.NullFloat64
var max sql.NullInt64
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(&p50, &p90, &p95, &p99, &avg, &max); err != nil {
return opsCollectedPercentiles{}, opsCollectedPercentiles{}, err
}
ttft.p50 = floatToIntPtr(p50)
ttft.p90 = floatToIntPtr(p90)
ttft.p95 = floatToIntPtr(p95)
ttft.p99 = floatToIntPtr(p99)
if avg.Valid {
v := roundTo1DP(avg.Float64)
ttft.avg = &v
}
if max.Valid {
v := int(max.Int64)
ttft.max = &v
}
}
return duration, ttft, nil
}
func (c *OpsMetricsCollector) queryErrorCounts(ctx context.Context, start, end time.Time) (
errorTotal int64,
businessLimited int64,
errorSLA int64,
upstreamExcl429529 int64,
upstream429 int64,
upstream529 int64,
err error,
) {
q := `
SELECT
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400), 0) AS error_total,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND is_business_limited), 0) AS business_limited,
COALESCE(COUNT(*) FILTER (WHERE COALESCE(status_code, 0) >= 400 AND NOT is_business_limited), 0) AS error_sla,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) NOT IN (429, 529)), 0) AS upstream_excl,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 429), 0) AS upstream_429,
COALESCE(COUNT(*) FILTER (WHERE error_owner = 'provider' AND NOT is_business_limited AND COALESCE(upstream_status_code, status_code, 0) = 529), 0) AS upstream_529
FROM ops_error_logs
WHERE created_at >= $1 AND created_at < $2`
if err := c.db.QueryRowContext(ctx, q, start, end).Scan(
&errorTotal,
&businessLimited,
&errorSLA,
&upstreamExcl429529,
&upstream429,
&upstream529,
); err != nil {
return 0, 0, 0, 0, 0, 0, err
}
return errorTotal, businessLimited, errorSLA, upstreamExcl429529, upstream429, upstream529, nil
}
type opsCollectedSystemStats struct {
cpuUsagePercent *float64
memoryUsedMB *int64
memoryTotalMB *int64
memoryUsagePercent *float64
}
func (c *OpsMetricsCollector) collectSystemStats(ctx context.Context) (*opsCollectedSystemStats, error) {
out := &opsCollectedSystemStats{}
if ctx == nil {
ctx = context.Background()
}
sampleAt := time.Now().UTC()
// Prefer cgroup (container) metrics when available.
if cpuPct := c.tryCgroupCPUPercent(sampleAt); cpuPct != nil {
out.cpuUsagePercent = cpuPct
}
cgroupUsed, cgroupTotal, cgroupOK := readCgroupMemoryBytes()
if cgroupOK {
usedMB := int64(cgroupUsed / bytesPerMB)
out.memoryUsedMB = &usedMB
if cgroupTotal > 0 {
totalMB := int64(cgroupTotal / bytesPerMB)
out.memoryTotalMB = &totalMB
pct := roundTo1DP(float64(cgroupUsed) / float64(cgroupTotal) * 100)
out.memoryUsagePercent = &pct
}
}
// Fallback to host metrics if cgroup metrics are unavailable (or incomplete).
if out.cpuUsagePercent == nil {
if cpuPercents, err := cpu.PercentWithContext(ctx, 0, false); err == nil && len(cpuPercents) > 0 {
v := roundTo1DP(cpuPercents[0])
out.cpuUsagePercent = &v
}
}
// Fill any missing memory fields from host metrics (e.g. memory.max = "max" leaves the cgroup total unknown).
if out.memoryUsedMB == nil || out.memoryTotalMB == nil || out.memoryUsagePercent == nil {
if vm, err := mem.VirtualMemoryWithContext(ctx); err == nil && vm != nil {
if out.memoryUsedMB == nil {
usedMB := int64(vm.Used / bytesPerMB)
out.memoryUsedMB = &usedMB
}
if out.memoryTotalMB == nil {
totalMB := int64(vm.Total / bytesPerMB)
out.memoryTotalMB = &totalMB
}
if out.memoryUsagePercent == nil {
if out.memoryUsedMB != nil && out.memoryTotalMB != nil && *out.memoryTotalMB > 0 {
pct := roundTo1DP(float64(*out.memoryUsedMB) / float64(*out.memoryTotalMB) * 100)
out.memoryUsagePercent = &pct
} else {
pct := roundTo1DP(vm.UsedPercent)
out.memoryUsagePercent = &pct
}
}
}
}
return out, nil
}
func (c *OpsMetricsCollector) tryCgroupCPUPercent(now time.Time) *float64 {
usageNanos, ok := readCgroupCPUUsageNanos()
if !ok {
return nil
}
// Initialize baseline sample.
if c.lastCgroupCPUSampleAt.IsZero() {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
elapsed := now.Sub(c.lastCgroupCPUSampleAt)
if elapsed <= 0 {
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
return nil
}
prev := c.lastCgroupCPUUsageNanos
c.lastCgroupCPUUsageNanos = usageNanos
c.lastCgroupCPUSampleAt = now
if usageNanos < prev {
// Counter reset (container restarted).
return nil
}
deltaUsageSec := float64(usageNanos-prev) / 1e9
elapsedSec := elapsed.Seconds()
if elapsedSec <= 0 {
return nil
}
cores := readCgroupCPULimitCores()
if cores <= 0 {
// Can't reliably normalize; skip and fall back to gopsutil.
return nil
}
pct := (deltaUsageSec / (elapsedSec * cores)) * 100
if pct < 0 {
pct = 0
}
// Clamp to avoid noise/jitter showing impossible values.
if pct > 100 {
pct = 100
}
v := roundTo1DP(pct)
return &v
}
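// Worked example (hypothetical sample): 1.2s of cgroup CPU time over a 60s interval
// with a 2-core limit → pct = (1.2 / (60 * 2)) * 100 = 1.0%.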
func readCgroupMemoryBytes() (usedBytes uint64, totalBytes uint64, ok bool) {
// cgroup v2 (most common in modern containers)
if used, ok1 := readUintFile("/sys/fs/cgroup/memory.current"); ok1 {
usedBytes = used
rawMax, err := os.ReadFile("/sys/fs/cgroup/memory.max")
if err == nil {
s := strings.TrimSpace(string(rawMax))
if s != "" && s != "max" {
if v, err := strconv.ParseUint(s, 10, 64); err == nil {
totalBytes = v
}
}
}
return usedBytes, totalBytes, true
}
// cgroup v1 fallback
if used, ok1 := readUintFile("/sys/fs/cgroup/memory/memory.usage_in_bytes"); ok1 {
usedBytes = used
if limit, ok2 := readUintFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); ok2 {
// Some environments report a very large number when unlimited.
if limit > 0 && limit < (1<<60) {
totalBytes = limit
}
}
return usedBytes, totalBytes, true
}
return 0, 0, false
}
func readCgroupCPUUsageNanos() (usageNanos uint64, ok bool) {
// cgroup v2: cpu.stat has usage_usec
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.stat"); err == nil {
lines := strings.Split(string(raw), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) != 2 {
continue
}
if fields[0] != "usage_usec" {
continue
}
v, err := strconv.ParseUint(fields[1], 10, 64)
if err != nil {
continue
}
return v * 1000, true
}
}
// cgroup v1: cpuacct.usage is in nanoseconds
if v, ok := readUintFile("/sys/fs/cgroup/cpuacct/cpuacct.usage"); ok {
return v, true
}
return 0, false
}
func readCgroupCPULimitCores() float64 {
// cgroup v2: cpu.max => "<quota> <period>" or "max <period>"
if raw, err := os.ReadFile("/sys/fs/cgroup/cpu.max"); err == nil {
fields := strings.Fields(string(raw))
if len(fields) >= 2 && fields[0] != "max" {
quota, err1 := strconv.ParseFloat(fields[0], 64)
period, err2 := strconv.ParseFloat(fields[1], 64)
if err1 == nil && err2 == nil && quota > 0 && period > 0 {
return quota / period
}
}
}
// cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us
quota, okQuota := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
period, okPeriod := readIntFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
if okQuota && okPeriod && quota > 0 && period > 0 {
return float64(quota) / float64(period)
}
return 0
}
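// Examples: cgroup v2 cpu.max "200000 100000" → 2.0 cores; "max 100000" (unlimited) → 0,
// so the caller falls back to gopsutil; cgroup v1 quota=50000 / period=100000 → 0.5 cores.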
func readUintFile(path string) (uint64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func readIntFile(path string) (int64, bool) {
raw, err := os.ReadFile(path)
if err != nil {
return 0, false
}
s := strings.TrimSpace(string(raw))
if s == "" {
return 0, false
}
v, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return 0, false
}
return v, true
}
func (c *OpsMetricsCollector) checkDB(ctx context.Context) bool {
if c == nil || c.db == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
var one int
if err := c.db.QueryRowContext(ctx, "SELECT 1").Scan(&one); err != nil {
return false
}
return one == 1
}
func (c *OpsMetricsCollector) checkRedis(ctx context.Context) bool {
if c == nil || c.redisClient == nil {
return false
}
if ctx == nil {
ctx = context.Background()
}
return c.redisClient.Ping(ctx).Err() == nil
}
func (c *OpsMetricsCollector) redisPoolStats() (total int, idle int, ok bool) {
if c == nil || c.redisClient == nil {
return 0, 0, false
}
stats := c.redisClient.PoolStats()
if stats == nil {
return 0, 0, false
}
return int(stats.TotalConns), int(stats.IdleConns), true
}
func (c *OpsMetricsCollector) dbPoolStats() (active int, idle int) {
if c == nil || c.db == nil {
return 0, 0
}
stats := c.db.Stats()
return stats.InUse, stats.Idle
}
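// opsMetricsCollectorReleaseScript releases the leader lock only if it is still owned by
// this instance (atomic compare-and-delete), so a run that outlives the lock TTL cannot
// delete a lock that another instance has since acquired.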
var opsMetricsCollectorReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
func (c *OpsMetricsCollector) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if c == nil || c.redisClient == nil {
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
ok, err := c.redisClient.SetNX(ctx, opsMetricsCollectorLeaderLockKey, c.instanceID, opsMetricsCollectorLeaderLockTTL).Result()
if err != nil {
// Redis is configured but unavailable: fall back to a DB advisory lock so only one
// instance collects; fail closed (skip this run) if that lock cannot be acquired,
// to avoid stampeding the database.
release, ok := tryAcquireDBAdvisoryLock(ctx, c.db, opsMetricsCollectorAdvisoryLockID)
if !ok {
c.maybeLogSkip()
return nil, false
}
return release, true
}
if !ok {
c.maybeLogSkip()
return nil, false
}
release := func() {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, _ = opsMetricsCollectorReleaseScript.Run(ctx, c.redisClient, []string{opsMetricsCollectorLeaderLockKey}, c.instanceID).Result()
}
return release, true
}
func (c *OpsMetricsCollector) maybeLogSkip() {
c.skipLogMu.Lock()
defer c.skipLogMu.Unlock()
now := time.Now()
if !c.skipLogAt.IsZero() && now.Sub(c.skipLogAt) < time.Minute {
return
}
c.skipLogAt = now
log.Printf("[OpsMetricsCollector] leader lock held by another instance; skipping")
}
func floatToIntPtr(v sql.NullFloat64) *int {
if !v.Valid {
return nil
}
n := int(math.Round(v.Float64))
return &n
}
func roundTo1DP(v float64) float64 {
return math.Round(v*10) / 10
}
func truncateString(s string, max int) string {
if max <= 0 {
return ""
}
if len(s) <= max {
return s
}
cut := s[:max]
for len(cut) > 0 && !utf8.ValidString(cut) {
cut = cut[:len(cut)-1]
}
return cut
}
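// e.g. truncateString("héllo", 2) returns "h": the byte cut would split the two-byte "é",
// so trailing bytes are trimmed until the remainder is valid UTF-8.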
func boolPtr(v bool) *bool {
out := v
return &out
}
func intPtr(v int) *int {
out := v
return &out
}
func float64Ptr(v float64) *float64 {
out := v
return &out
}
package service
import "time"
type OpsErrorLog struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
Phase string `json:"phase"`
Type string `json:"type"`
Severity string `json:"severity"`
StatusCode int `json:"status_code"`
Platform string `json:"platform"`
Model string `json:"model"`
LatencyMs *int `json:"latency_ms"`
ClientRequestID string `json:"client_request_id"`
RequestID string `json:"request_id"`
Message string `json:"message"`
UserID *int64 `json:"user_id"`
APIKeyID *int64 `json:"api_key_id"`
AccountID *int64 `json:"account_id"`
GroupID *int64 `json:"group_id"`
ClientIP *string `json:"client_ip"`
RequestPath string `json:"request_path"`
Stream bool `json:"stream"`
}
type OpsErrorLogDetail struct {
OpsErrorLog
ErrorBody string `json:"error_body"`
UserAgent string `json:"user_agent"`
// Upstream context (optional)
UpstreamStatusCode *int `json:"upstream_status_code,omitempty"`
UpstreamErrorMessage string `json:"upstream_error_message,omitempty"`
UpstreamErrorDetail string `json:"upstream_error_detail,omitempty"`
UpstreamErrors string `json:"upstream_errors,omitempty"` // JSON array (string) for display/parsing
// Timings (optional)
AuthLatencyMs *int64 `json:"auth_latency_ms"`
RoutingLatencyMs *int64 `json:"routing_latency_ms"`
UpstreamLatencyMs *int64 `json:"upstream_latency_ms"`
ResponseLatencyMs *int64 `json:"response_latency_ms"`
TimeToFirstTokenMs *int64 `json:"time_to_first_token_ms"`
// Retry context
RequestBody string `json:"request_body"`
RequestBodyTruncated bool `json:"request_body_truncated"`
RequestBodyBytes *int `json:"request_body_bytes"`
RequestHeaders string `json:"request_headers,omitempty"`
// vNext metric semantics
IsBusinessLimited bool `json:"is_business_limited"`
}
type OpsErrorLogFilter struct {
StartTime *time.Time
EndTime *time.Time
Platform string
GroupID *int64
AccountID *int64
StatusCodes []int
Phase string
Query string
Page int
PageSize int
}
type OpsErrorLogList struct {
Errors []*OpsErrorLog `json:"errors"`
Total int `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
type OpsRetryAttempt struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
RequestedByUserID int64 `json:"requested_by_user_id"`
SourceErrorID int64 `json:"source_error_id"`
Mode string `json:"mode"`
PinnedAccountID *int64 `json:"pinned_account_id"`
Status string `json:"status"`
StartedAt *time.Time `json:"started_at"`
FinishedAt *time.Time `json:"finished_at"`
DurationMs *int64 `json:"duration_ms"`
ResultRequestID *string `json:"result_request_id"`
ResultErrorID *int64 `json:"result_error_id"`
ErrorMessage *string `json:"error_message"`
}
type OpsRetryResult struct {
AttemptID int64 `json:"attempt_id"`
Mode string `json:"mode"`
Status string `json:"status"`
PinnedAccountID *int64 `json:"pinned_account_id"`
UsedAccountID *int64 `json:"used_account_id"`
HTTPStatusCode int `json:"http_status_code"`
UpstreamRequestID string `json:"upstream_request_id"`
ResponsePreview string `json:"response_preview"`
ResponseTruncated bool `json:"response_truncated"`
ErrorMessage string `json:"error_message"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
DurationMs int64 `json:"duration_ms"`
}
package service
import (
"context"
"time"
)
type OpsRepository interface {
InsertErrorLog(ctx context.Context, input *OpsInsertErrorLogInput) (int64, error)
ListErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error)
GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error)
ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) ([]*OpsRequestDetail, int64, error)
InsertRetryAttempt(ctx context.Context, input *OpsInsertRetryAttemptInput) (int64, error)
UpdateRetryAttempt(ctx context.Context, input *OpsUpdateRetryAttemptInput) error
GetLatestRetryAttemptForError(ctx context.Context, sourceErrorID int64) (*OpsRetryAttempt, error)
// Lightweight window stats (for realtime WS / quick sampling).
GetWindowStats(ctx context.Context, filter *OpsDashboardFilter) (*OpsWindowStats, error)
GetDashboardOverview(ctx context.Context, filter *OpsDashboardFilter) (*OpsDashboardOverview, error)
GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error)
GetLatencyHistogram(ctx context.Context, filter *OpsDashboardFilter) (*OpsLatencyHistogramResponse, error)
GetErrorTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsErrorTrendResponse, error)
GetErrorDistribution(ctx context.Context, filter *OpsDashboardFilter) (*OpsErrorDistributionResponse, error)
InsertSystemMetrics(ctx context.Context, input *OpsInsertSystemMetricsInput) error
GetLatestSystemMetrics(ctx context.Context, windowMinutes int) (*OpsSystemMetricsSnapshot, error)
UpsertJobHeartbeat(ctx context.Context, input *OpsUpsertJobHeartbeatInput) error
ListJobHeartbeats(ctx context.Context) ([]*OpsJobHeartbeat, error)
// Alerts (rules + events)
ListAlertRules(ctx context.Context) ([]*OpsAlertRule, error)
CreateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
UpdateAlertRule(ctx context.Context, input *OpsAlertRule) (*OpsAlertRule, error)
DeleteAlertRule(ctx context.Context, id int64) error
ListAlertEvents(ctx context.Context, filter *OpsAlertEventFilter) ([]*OpsAlertEvent, error)
GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error)
CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) (*OpsAlertEvent, error)
UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error
UpdateAlertEventEmailSent(ctx context.Context, eventID int64, emailSent bool) error
// Pre-aggregation (hourly/daily) used for long-window dashboard performance.
UpsertHourlyMetrics(ctx context.Context, startTime, endTime time.Time) error
UpsertDailyMetrics(ctx context.Context, startTime, endTime time.Time) error
GetLatestHourlyBucketStart(ctx context.Context) (time.Time, bool, error)
GetLatestDailyBucketDate(ctx context.Context) (time.Time, bool, error)
}
type OpsInsertErrorLogInput struct {
RequestID string
ClientRequestID string
UserID *int64
APIKeyID *int64
AccountID *int64
GroupID *int64
ClientIP *string
Platform string
Model string
RequestPath string
Stream bool
UserAgent string
ErrorPhase string
ErrorType string
Severity string
StatusCode int
IsBusinessLimited bool
ErrorMessage string
ErrorBody string
ErrorSource string
ErrorOwner string
UpstreamStatusCode *int
UpstreamErrorMessage *string
UpstreamErrorDetail *string
// UpstreamErrors captures all upstream error attempts observed while handling this request.
// It is populated during request processing (gin context) and sanitized+serialized by OpsService.
UpstreamErrors []*OpsUpstreamErrorEvent
// UpstreamErrorsJSON is the sanitized JSON string stored into ops_error_logs.upstream_errors.
// It is set by OpsService.RecordError before persisting.
UpstreamErrorsJSON *string
DurationMs *int
TimeToFirstTokenMs *int64
RequestBodyJSON *string // sanitized json string (not raw bytes)
RequestBodyTruncated bool
RequestBodyBytes *int
RequestHeadersJSON *string // optional json string
IsRetryable bool
RetryCount int
CreatedAt time.Time
}
type OpsInsertRetryAttemptInput struct {
RequestedByUserID int64
SourceErrorID int64
Mode string
PinnedAccountID *int64
// running|queued etc.
Status string
StartedAt time.Time
}
type OpsUpdateRetryAttemptInput struct {
ID int64
// succeeded|failed
Status string
FinishedAt time.Time
DurationMs int64
// Optional correlation
ResultRequestID *string
ResultErrorID *int64
ErrorMessage *string
}
type OpsInsertSystemMetricsInput struct {
CreatedAt time.Time
WindowMinutes int
Platform *string
GroupID *int64
SuccessCount int64
ErrorCountTotal int64
BusinessLimitedCount int64
ErrorCountSLA int64
UpstreamErrorCountExcl429529 int64
Upstream429Count int64
Upstream529Count int64
TokenConsumed int64
QPS *float64
TPS *float64
DurationP50Ms *int
DurationP90Ms *int
DurationP95Ms *int
DurationP99Ms *int
DurationAvgMs *float64
DurationMaxMs *int
TTFTP50Ms *int
TTFTP90Ms *int
TTFTP95Ms *int
TTFTP99Ms *int
TTFTAvgMs *float64
TTFTMaxMs *int
CPUUsagePercent *float64
MemoryUsedMB *int64
MemoryTotalMB *int64
MemoryUsagePercent *float64
DBOK *bool
RedisOK *bool
RedisConnTotal *int
RedisConnIdle *int
DBConnActive *int
DBConnIdle *int
DBConnWaiting *int
GoroutineCount *int
ConcurrencyQueueDepth *int
}
type OpsSystemMetricsSnapshot struct {
ID int64 `json:"id"`
CreatedAt time.Time `json:"created_at"`
WindowMinutes int `json:"window_minutes"`
CPUUsagePercent *float64 `json:"cpu_usage_percent"`
MemoryUsedMB *int64 `json:"memory_used_mb"`
MemoryTotalMB *int64 `json:"memory_total_mb"`
MemoryUsagePercent *float64 `json:"memory_usage_percent"`
DBOK *bool `json:"db_ok"`
RedisOK *bool `json:"redis_ok"`
// Config-derived limits (best-effort). These are not historical metrics; they help UI render "current vs max".
DBMaxOpenConns *int `json:"db_max_open_conns"`
RedisPoolSize *int `json:"redis_pool_size"`
RedisConnTotal *int `json:"redis_conn_total"`
RedisConnIdle *int `json:"redis_conn_idle"`
DBConnActive *int `json:"db_conn_active"`
DBConnIdle *int `json:"db_conn_idle"`
DBConnWaiting *int `json:"db_conn_waiting"`
GoroutineCount *int `json:"goroutine_count"`
ConcurrencyQueueDepth *int `json:"concurrency_queue_depth"`
}
type OpsUpsertJobHeartbeatInput struct {
JobName string
LastRunAt *time.Time
LastSuccessAt *time.Time
LastErrorAt *time.Time
LastError *string
LastDurationMs *int64
}
type OpsJobHeartbeat struct {
JobName string `json:"job_name"`
LastRunAt *time.Time `json:"last_run_at"`
LastSuccessAt *time.Time `json:"last_success_at"`
LastErrorAt *time.Time `json:"last_error_at"`
LastError *string `json:"last_error"`
LastDurationMs *int64 `json:"last_duration_ms"`
UpdatedAt time.Time `json:"updated_at"`
}
type OpsWindowStats struct {
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
SuccessCount int64 `json:"success_count"`
ErrorCountTotal int64 `json:"error_count_total"`
TokenConsumed int64 `json:"token_consumed"`
}
package service
import (
"errors"
"strings"
)
type OpsQueryMode string
const (
OpsQueryModeAuto OpsQueryMode = "auto"
OpsQueryModeRaw OpsQueryMode = "raw"
OpsQueryModePreagg OpsQueryMode = "preagg"
)
// ErrOpsPreaggregatedNotPopulated indicates that raw logs exist for a window, but the
// pre-aggregation tables are not populated yet. This is primarily used to implement
// the forced `preagg` mode UX.
var ErrOpsPreaggregatedNotPopulated = errors.New("ops pre-aggregated tables not populated")
func ParseOpsQueryMode(raw string) OpsQueryMode {
v := strings.ToLower(strings.TrimSpace(raw))
switch v {
case string(OpsQueryModeRaw):
return OpsQueryModeRaw
case string(OpsQueryModePreagg):
return OpsQueryModePreagg
default:
return OpsQueryModeAuto
}
}
func (m OpsQueryMode) IsValid() bool {
switch m {
case OpsQueryModeAuto, OpsQueryModeRaw, OpsQueryModePreagg:
return true
default:
return false
}
}
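// Usage examples (hypothetical values):
//   ParseOpsQueryMode(" PREAGG ") // → OpsQueryModePreagg (trimmed, case-insensitive)
//   ParseOpsQueryMode("bogus")    // → OpsQueryModeAuto (unknown values fall back to auto)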
package service
import (
"context"
"strings"
)
// IsRealtimeMonitoringEnabled returns true when realtime ops features are enabled.
//
// This is a soft switch controlled by the DB setting `ops_realtime_monitoring_enabled`,
// and it is additionally gated by the overall ops monitoring switches (the hard config
// switch and the soft monitoring setting).
func (s *OpsService) IsRealtimeMonitoringEnabled(ctx context.Context) bool {
if !s.IsMonitoringEnabled(ctx) {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsRealtimeMonitoringEnabled)
if err != nil {
// Default enabled when the key is missing; fail-open on transient errors.
return true
}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
package service
import "time"
// PlatformConcurrencyInfo aggregates concurrency usage by platform.
type PlatformConcurrencyInfo struct {
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// GroupConcurrencyInfo aggregates concurrency usage by group.
//
// Note: one account can belong to multiple groups; group totals are therefore not additive across groups.
type GroupConcurrencyInfo struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// AccountConcurrencyInfo represents real-time concurrency usage for a single account.
type AccountConcurrencyInfo struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
CurrentInUse int64 `json:"current_in_use"`
MaxCapacity int64 `json:"max_capacity"`
LoadPercentage float64 `json:"load_percentage"`
WaitingInQueue int64 `json:"waiting_in_queue"`
}
// PlatformAvailability aggregates account availability by platform.
type PlatformAvailability struct {
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// GroupAvailability aggregates account availability by group.
type GroupAvailability struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Platform string `json:"platform"`
TotalAccounts int64 `json:"total_accounts"`
AvailableCount int64 `json:"available_count"`
RateLimitCount int64 `json:"rate_limit_count"`
ErrorCount int64 `json:"error_count"`
}
// AccountAvailability represents current availability for a single account.
type AccountAvailability struct {
AccountID int64 `json:"account_id"`
AccountName string `json:"account_name"`
Platform string `json:"platform"`
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
Status string `json:"status"`
IsAvailable bool `json:"is_available"`
IsRateLimited bool `json:"is_rate_limited"`
IsOverloaded bool `json:"is_overloaded"`
HasError bool `json:"has_error"`
RateLimitResetAt *time.Time `json:"rate_limit_reset_at"`
RateLimitRemainingSec *int64 `json:"rate_limit_remaining_sec"`
OverloadUntil *time.Time `json:"overload_until"`
OverloadRemainingSec *int64 `json:"overload_remaining_sec"`
ErrorMessage string `json:"error_message"`
TempUnschedulableUntil *time.Time `json:"temp_unschedulable_until,omitempty"`
}
package service
import (
"context"
"time"
)
type OpsRequestKind string
const (
OpsRequestKindSuccess OpsRequestKind = "success"
OpsRequestKindError OpsRequestKind = "error"
)
// OpsRequestDetail is a request-level view across success (usage_logs) and error (ops_error_logs).
// It powers "request drilldown" UIs without exposing full request bodies for successful requests.
type OpsRequestDetail struct {
Kind OpsRequestKind `json:"kind"`
CreatedAt time.Time `json:"created_at"`
RequestID string `json:"request_id"`
Platform string `json:"platform,omitempty"`
Model string `json:"model,omitempty"`
DurationMs *int `json:"duration_ms,omitempty"`
StatusCode *int `json:"status_code,omitempty"`
// When Kind == "error", ErrorID links to /admin/ops/errors/:id.
ErrorID *int64 `json:"error_id,omitempty"`
Phase string `json:"phase,omitempty"`
Severity string `json:"severity,omitempty"`
Message string `json:"message,omitempty"`
UserID *int64 `json:"user_id,omitempty"`
APIKeyID *int64 `json:"api_key_id,omitempty"`
AccountID *int64 `json:"account_id,omitempty"`
GroupID *int64 `json:"group_id,omitempty"`
Stream bool `json:"stream"`
}
type OpsRequestDetailFilter struct {
StartTime *time.Time
EndTime *time.Time
// kind: success|error|all
Kind string
Platform string
GroupID *int64
UserID *int64
APIKeyID *int64
AccountID *int64
Model string
RequestID string
Query string
MinDurationMs *int
MaxDurationMs *int
// Sort: created_at_desc (default) or duration_desc.
Sort string
Page int
PageSize int
}
func (f *OpsRequestDetailFilter) Normalize() (page, pageSize int, startTime, endTime time.Time) {
page = 1
pageSize = 50
endTime = time.Now()
startTime = endTime.Add(-1 * time.Hour)
if f == nil {
return page, pageSize, startTime, endTime
}
if f.Page > 0 {
page = f.Page
}
if f.PageSize > 0 {
pageSize = f.PageSize
}
if pageSize > 100 {
pageSize = 100
}
if f.EndTime != nil {
endTime = *f.EndTime
}
if f.StartTime != nil {
startTime = *f.StartTime
} else if f.EndTime != nil {
startTime = endTime.Add(-1 * time.Hour)
}
if startTime.After(endTime) {
startTime, endTime = endTime, startTime
}
return page, pageSize, startTime, endTime
}
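// Sketch of Normalize defaults (illustrative): a nil receiver is safe and
// yields the trailing one-hour window ending now, and an inverted range is
// swapped rather than rejected.
//
//	var f *OpsRequestDetailFilter
//	page, size, start, end := f.Normalize()
//	// page == 1, size == 50, end ≈ time.Now(), start == end.Add(-time.Hour)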
type OpsRequestDetailList struct {
Items []*OpsRequestDetail `json:"items"`
Total int64 `json:"total"`
Page int `json:"page"`
PageSize int `json:"page_size"`
}
func (s *OpsService) ListRequestDetails(ctx context.Context, filter *OpsRequestDetailFilter) (*OpsRequestDetailList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsRequestDetailList{
Items: []*OpsRequestDetail{},
Total: 0,
Page: 1,
PageSize: 50,
}, nil
}
page, pageSize, startTime, endTime := filter.Normalize()
filterCopy := &OpsRequestDetailFilter{}
if filter != nil {
*filterCopy = *filter
}
filterCopy.Page = page
filterCopy.PageSize = pageSize
filterCopy.StartTime = &startTime
filterCopy.EndTime = &endTime
items, total, err := s.opsRepo.ListRequestDetails(ctx, filterCopy)
if err != nil {
return nil, err
}
if items == nil {
items = []*OpsRequestDetail{}
}
return &OpsRequestDetailList{
Items: items,
Total: total,
Page: page,
PageSize: pageSize,
}, nil
}
package service
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
"time"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
"github.com/gin-gonic/gin"
"github.com/lib/pq"
)
const (
OpsRetryModeClient = "client"
OpsRetryModeUpstream = "upstream"
)
const (
opsRetryStatusRunning = "running"
opsRetryStatusSucceeded = "succeeded"
opsRetryStatusFailed = "failed"
)
const (
opsRetryTimeout = 60 * time.Second
opsRetryCaptureBytesLimit = 64 * 1024
opsRetryResponsePreviewMax = 8 * 1024
opsRetryMinIntervalPerError = 10 * time.Second
opsRetryMaxAccountSwitches = 3
)
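// Guardrail summary: a retry runs for at most opsRetryTimeout, captures at most
// opsRetryCaptureBytesLimit of the upstream body (previewed up to
// opsRetryResponsePreviewMax), may not be repeated for the same error within
// opsRetryMinIntervalPerError, and fails over across at most
// opsRetryMaxAccountSwitches accounts.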
var opsRetryRequestHeaderAllowlist = map[string]bool{
"anthropic-beta": true,
"anthropic-version": true,
}
type opsRetryRequestType string
const (
opsRetryTypeMessages opsRetryRequestType = "messages"
opsRetryTypeOpenAI opsRetryRequestType = "openai_responses"
opsRetryTypeGeminiV1B opsRetryRequestType = "gemini_v1beta"
)
type limitedResponseWriter struct {
header http.Header
wroteHeader bool
limit int
totalWritten int64
buf bytes.Buffer
}
func newLimitedResponseWriter(limit int) *limitedResponseWriter {
if limit <= 0 {
limit = 1
}
return &limitedResponseWriter{
header: make(http.Header),
limit: limit,
}
}
func (w *limitedResponseWriter) Header() http.Header {
return w.header
}
func (w *limitedResponseWriter) WriteHeader(statusCode int) {
if w.wroteHeader {
return
}
w.wroteHeader = true
}
func (w *limitedResponseWriter) Write(p []byte) (int, error) {
if !w.wroteHeader {
w.WriteHeader(http.StatusOK)
}
w.totalWritten += int64(len(p))
if w.buf.Len() < w.limit {
remaining := w.limit - w.buf.Len()
if len(p) > remaining {
_, _ = w.buf.Write(p[:remaining])
} else {
_, _ = w.buf.Write(p)
}
}
// Pretend we wrote everything to avoid upstream/client code treating it as an error.
return len(p), nil
}
func (w *limitedResponseWriter) Flush() {}
func (w *limitedResponseWriter) bodyBytes() []byte {
return w.buf.Bytes()
}
func (w *limitedResponseWriter) truncated() bool {
return w.totalWritten > int64(w.limit)
}
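// Minimal usage sketch (hypothetical values): capture a bounded prefix of a
// response while reporting the full write as successful to the producer.
//
//	w := newLimitedResponseWriter(16)
//	_, _ = w.Write([]byte("this body is longer than sixteen bytes"))
//	_ = w.bodyBytes() // first 16 bytes only
//	_ = w.truncated() // true: totalWritten exceeded the limit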
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
mode = strings.ToLower(strings.TrimSpace(mode))
switch mode {
case OpsRetryModeClient, OpsRetryModeUpstream:
default:
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
}
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
}
if latest != nil {
if strings.EqualFold(latest.Status, opsRetryStatusRunning) || strings.EqualFold(latest.Status, "queued") {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
lastAttemptAt := latest.CreatedAt
if latest.FinishedAt != nil && !latest.FinishedAt.IsZero() {
lastAttemptAt = *latest.FinishedAt
} else if latest.StartedAt != nil && !latest.StartedAt.IsZero() {
lastAttemptAt = *latest.StartedAt
}
if time.Since(lastAttemptAt) < opsRetryMinIntervalPerError {
return nil, infraerrors.Conflict("OPS_RETRY_TOO_FREQUENT", "Please wait before retrying this error again")
}
}
errorLog, err := s.GetErrorLogByID(ctx, errorID)
if err != nil {
return nil, err
}
if strings.TrimSpace(errorLog.RequestBody) == "" {
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
}
var pinned *int64
if mode == OpsRetryModeUpstream {
if pinnedAccountID != nil && *pinnedAccountID > 0 {
pinned = pinnedAccountID
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
pinned = errorLog.AccountID
} else {
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
}
}
startedAt := time.Now()
attemptID, err := s.opsRepo.InsertRetryAttempt(ctx, &OpsInsertRetryAttemptInput{
RequestedByUserID: requestedByUserID,
SourceErrorID: errorID,
Mode: mode,
PinnedAccountID: pinned,
Status: opsRetryStatusRunning,
StartedAt: startedAt,
})
if err != nil {
var pqErr *pq.Error
if errors.As(err, &pqErr) && string(pqErr.Code) == "23505" {
return nil, infraerrors.Conflict("OPS_RETRY_IN_PROGRESS", "A retry is already in progress for this error")
}
return nil, infraerrors.InternalServer("OPS_RETRY_CREATE_ATTEMPT_FAILED", "Failed to create retry attempt").WithCause(err)
}
result := &OpsRetryResult{
AttemptID: attemptID,
Mode: mode,
Status: opsRetryStatusFailed,
PinnedAccountID: pinned,
HTTPStatusCode: 0,
UpstreamRequestID: "",
ResponsePreview: "",
ResponseTruncated: false,
ErrorMessage: "",
StartedAt: startedAt,
}
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
defer cancel()
execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
finishedAt := time.Now()
result.FinishedAt = finishedAt
result.DurationMs = finishedAt.Sub(startedAt).Milliseconds()
if execRes != nil {
result.Status = execRes.status
result.UsedAccountID = execRes.usedAccountID
result.HTTPStatusCode = execRes.httpStatusCode
result.UpstreamRequestID = execRes.upstreamRequestID
result.ResponsePreview = execRes.responsePreview
result.ResponseTruncated = execRes.responseTruncated
result.ErrorMessage = execRes.errorMessage
}
updateCtx, updateCancel := context.WithTimeout(context.Background(), 3*time.Second)
defer updateCancel()
var updateErrMsg *string
if strings.TrimSpace(result.ErrorMessage) != "" {
msg := result.ErrorMessage
updateErrMsg = &msg
}
var resultRequestID *string
if strings.TrimSpace(result.UpstreamRequestID) != "" {
v := result.UpstreamRequestID
resultRequestID = &v
}
finalStatus := result.Status
if strings.TrimSpace(finalStatus) == "" {
finalStatus = opsRetryStatusFailed
}
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
ID: attemptID,
Status: finalStatus,
FinishedAt: finishedAt,
DurationMs: result.DurationMs,
ResultRequestID: resultRequestID,
ErrorMessage: updateErrMsg,
}); err != nil {
// Best-effort: retry itself already executed; do not fail the API response.
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
}
return result, nil
}
type opsRetryExecution struct {
status string
usedAccountID *int64
httpStatusCode int
upstreamRequestID string
responsePreview string
responseTruncated bool
errorMessage string
}
func (s *OpsService) executeRetry(ctx context.Context, errorLog *OpsErrorLogDetail, mode string, pinnedAccountID *int64) *opsRetryExecution {
if errorLog == nil {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "missing error log",
}
}
reqType := detectOpsRetryType(errorLog.RequestPath)
bodyBytes := []byte(errorLog.RequestBody)
switch reqType {
case opsRetryTypeMessages:
bodyBytes = FilterThinkingBlocksForRetry(bodyBytes)
case opsRetryTypeOpenAI, opsRetryTypeGeminiV1B:
// No-op
}
switch strings.ToLower(strings.TrimSpace(mode)) {
case OpsRetryModeUpstream:
if pinnedAccountID == nil || *pinnedAccountID <= 0 {
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "pinned_account_id required for upstream retry",
}
}
return s.executePinnedRetry(ctx, reqType, errorLog, bodyBytes, *pinnedAccountID)
case OpsRetryModeClient:
return s.executeClientRetry(ctx, reqType, errorLog, bodyBytes)
default:
return &opsRetryExecution{
status: opsRetryStatusFailed,
errorMessage: "invalid retry mode",
}
}
}
func detectOpsRetryType(path string) opsRetryRequestType {
p := strings.ToLower(strings.TrimSpace(path))
switch {
case strings.Contains(p, "/responses"):
return opsRetryTypeOpenAI
case strings.Contains(p, "/v1beta/"):
return opsRetryTypeGeminiV1B
default:
return opsRetryTypeMessages
}
}
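// Illustrative classification (paths are examples, not an exhaustive route list):
//
//	detectOpsRetryType("/v1/responses")             // opsRetryTypeOpenAI
//	detectOpsRetryType("/v1beta/models/gemini-pro") // opsRetryTypeGeminiV1B
//	detectOpsRetryType("/v1/messages")              // opsRetryTypeMessages (default)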
func (s *OpsService) executePinnedRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, pinnedAccountID int64) *opsRetryExecution {
if s.accountRepo == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account repository not available"}
}
account, err := s.accountRepo.GetByID(ctx, pinnedAccountID)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("account not found: %v", err)}
}
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account not found"}
}
if !account.IsSchedulable() {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account is not schedulable"}
}
if errorLog.GroupID != nil && *errorLog.GroupID > 0 {
if !containsInt64(account.GroupIDs, *errorLog.GroupID) {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "pinned account is not in the same group as the original request"}
}
}
var release func()
if s.concurrencyService != nil {
acq, err := s.concurrencyService.AcquireAccountSlot(ctx, account.ID, account.Concurrency)
if err != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: fmt.Sprintf("acquire account slot failed: %v", err)}
}
if acq == nil || !acq.Acquired {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "account concurrency limit reached"}
}
release = acq.ReleaseFunc
}
if release != nil {
defer release()
}
usedID := account.ID
exec := s.executeWithAccount(ctx, reqType, errorLog, body, account)
exec.usedAccountID = &usedID
if exec.status == "" {
exec.status = opsRetryStatusFailed
}
return exec
}
func (s *OpsService) executeClientRetry(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) *opsRetryExecution {
groupID := errorLog.GroupID
if groupID == nil || *groupID <= 0 {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "group_id missing; cannot reselect account"}
}
model, stream, parsedErr := extractRetryModelAndStream(reqType, errorLog, body)
if parsedErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: parsedErr.Error()}
}
_ = stream
excluded := make(map[int64]struct{})
switches := 0
for {
if switches >= opsRetryMaxAccountSwitches {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "retry failed after exhausting account failovers"}
}
selection, selErr := s.selectAccountForRetry(ctx, reqType, groupID, model, excluded)
if selErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: selErr.Error()}
}
if selection == nil || selection.Account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "no available accounts"}
}
account := selection.Account
if !selection.Acquired || selection.ReleaseFunc == nil {
excluded[account.ID] = struct{}{}
switches++
continue
}
exec := func() *opsRetryExecution {
defer selection.ReleaseFunc()
return s.executeWithAccount(ctx, reqType, errorLog, body, account)
}()
if exec != nil {
if exec.status == opsRetryStatusSucceeded {
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
// If the gateway services ask for failover, try another account.
if s.isFailoverError(exec.errorMessage) {
excluded[account.ID] = struct{}{}
switches++
continue
}
usedID := account.ID
exec.usedAccountID = &usedID
return exec
}
excluded[account.ID] = struct{}{}
switches++
}
}
func (s *OpsService) selectAccountForRetry(ctx context.Context, reqType opsRetryRequestType, groupID *int64, model string, excludedIDs map[int64]struct{}) (*AccountSelectionResult, error) {
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return nil, fmt.Errorf("openai gateway service not available")
}
return s.openAIGatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
case opsRetryTypeGeminiV1B, opsRetryTypeMessages:
if s.gatewayService == nil {
return nil, fmt.Errorf("gateway service not available")
}
return s.gatewayService.SelectAccountWithLoadAwareness(ctx, groupID, "", model, excludedIDs)
default:
return nil, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
func extractRetryModelAndStream(reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte) (model string, stream bool, err error) {
switch reqType {
case opsRetryTypeMessages:
parsed, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return "", false, fmt.Errorf("failed to parse messages request body: %w", parseErr)
}
return parsed.Model, parsed.Stream, nil
case opsRetryTypeOpenAI:
var v struct {
Model string `json:"model"`
Stream bool `json:"stream"`
}
if err := json.Unmarshal(body, &v); err != nil {
return "", false, fmt.Errorf("failed to parse openai request body: %w", err)
}
return strings.TrimSpace(v.Model), v.Stream, nil
case opsRetryTypeGeminiV1B:
if strings.TrimSpace(errorLog.Model) == "" {
return "", false, fmt.Errorf("missing model for gemini v1beta retry")
}
return strings.TrimSpace(errorLog.Model), errorLog.Stream, nil
default:
return "", false, fmt.Errorf("unsupported retry type: %s", reqType)
}
}
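// Illustrative call (the payload below is a hypothetical minimal body, not a
// captured request):
//
//	m, stream, err := extractRetryModelAndStream(
//		opsRetryTypeOpenAI, nil, []byte(`{"model":"gpt-x","stream":true}`))
//	// m == "gpt-x", stream == true, err == nil; errorLog may be nil here
//	// because only the gemini_v1beta branch reads it.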
func (s *OpsService) executeWithAccount(ctx context.Context, reqType opsRetryRequestType, errorLog *OpsErrorLogDetail, body []byte, account *Account) *opsRetryExecution {
if account == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "missing account"}
}
c, w := newOpsRetryContext(ctx, errorLog)
var err error
switch reqType {
case opsRetryTypeOpenAI:
if s.openAIGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "openai gateway service not available"}
}
_, err = s.openAIGatewayService.Forward(ctx, c, account, body)
case opsRetryTypeGeminiV1B:
if s.geminiCompatService == nil || s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini services not available"}
}
modelName := strings.TrimSpace(errorLog.Model)
action := "generateContent"
if errorLog.Stream {
action = "streamGenerateContent"
}
if account.Platform == PlatformAntigravity {
_, err = s.antigravityGatewayService.ForwardGemini(ctx, c, account, modelName, action, errorLog.Stream, body)
} else {
_, err = s.geminiCompatService.ForwardNative(ctx, c, account, modelName, action, errorLog.Stream, body)
}
case opsRetryTypeMessages:
switch account.Platform {
case PlatformAntigravity:
if s.antigravityGatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "antigravity gateway service not available"}
}
_, err = s.antigravityGatewayService.Forward(ctx, c, account, body)
case PlatformGemini:
if s.geminiCompatService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gemini gateway service not available"}
}
_, err = s.geminiCompatService.Forward(ctx, c, account, body)
default:
if s.gatewayService == nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "gateway service not available"}
}
parsedReq, parseErr := ParseGatewayRequest(body)
if parseErr != nil {
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "failed to parse request body"}
}
_, err = s.gatewayService.Forward(ctx, c, account, parsedReq)
}
default:
return &opsRetryExecution{status: opsRetryStatusFailed, errorMessage: "unsupported retry type"}
}
statusCode := http.StatusOK
if c != nil && c.Writer != nil {
statusCode = c.Writer.Status()
}
upstreamReqID := extractUpstreamRequestID(c)
preview, truncated := extractResponsePreview(w)
exec := &opsRetryExecution{
status: opsRetryStatusFailed,
httpStatusCode: statusCode,
upstreamRequestID: upstreamReqID,
responsePreview: preview,
responseTruncated: truncated,
errorMessage: "",
}
if err == nil && statusCode < 400 {
exec.status = opsRetryStatusSucceeded
return exec
}
if err != nil {
exec.errorMessage = err.Error()
} else {
exec.errorMessage = fmt.Sprintf("upstream returned status %d", statusCode)
}
return exec
}
func newOpsRetryContext(ctx context.Context, errorLog *OpsErrorLogDetail) (*gin.Context, *limitedResponseWriter) {
w := newLimitedResponseWriter(opsRetryCaptureBytesLimit)
c, _ := gin.CreateTestContext(w)
path := "/"
if errorLog != nil && strings.TrimSpace(errorLog.RequestPath) != "" {
path = errorLog.RequestPath
}
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost"+path, bytes.NewReader(nil))
req.Header.Set("content-type", "application/json")
if errorLog != nil && strings.TrimSpace(errorLog.UserAgent) != "" {
req.Header.Set("user-agent", errorLog.UserAgent)
}
	// Restore a minimal, allowlisted subset of request headers to improve retry
	// fidelity (e.g. anthropic-beta / anthropic-version). Never replay auth credentials.
if errorLog != nil && strings.TrimSpace(errorLog.RequestHeaders) != "" {
var stored map[string]string
if err := json.Unmarshal([]byte(errorLog.RequestHeaders), &stored); err == nil {
for k, v := range stored {
key := strings.TrimSpace(k)
if key == "" {
continue
}
if !opsRetryRequestHeaderAllowlist[strings.ToLower(key)] {
continue
}
val := strings.TrimSpace(v)
if val == "" {
continue
}
req.Header.Set(key, val)
}
}
}
c.Request = req
return c, w
}
func extractUpstreamRequestID(c *gin.Context) string {
if c == nil || c.Writer == nil {
return ""
}
h := c.Writer.Header()
if h == nil {
return ""
}
	// Header.Get canonicalizes its key, so a single lookup covers the
	// lower/upper-case variants of x-request-id.
	if v := strings.TrimSpace(h.Get("X-Request-Id")); v != "" {
		return v
	}
return ""
}
func extractResponsePreview(w *limitedResponseWriter) (preview string, truncated bool) {
if w == nil {
return "", false
}
b := bytes.TrimSpace(w.bodyBytes())
if len(b) == 0 {
return "", w.truncated()
}
if len(b) > opsRetryResponsePreviewMax {
return string(b[:opsRetryResponsePreviewMax]), true
}
return string(b), w.truncated()
}
func containsInt64(items []int64, needle int64) bool {
for _, v := range items {
if v == needle {
return true
}
}
return false
}
func (s *OpsService) isFailoverError(message string) bool {
msg := strings.ToLower(strings.TrimSpace(message))
if msg == "" {
return false
}
return strings.Contains(msg, "upstream error:") && strings.Contains(msg, "failover")
}
package service
import (
"context"
"fmt"
"log"
"strconv"
"strings"
"sync"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"github.com/robfig/cron/v3"
)
const (
opsScheduledReportJobName = "ops_scheduled_reports"
opsScheduledReportLeaderLockKeyDefault = "ops:scheduled_reports:leader"
opsScheduledReportLeaderLockTTLDefault = 5 * time.Minute
opsScheduledReportLastRunKeyPrefix = "ops:scheduled_reports:last_run:"
opsScheduledReportTickInterval = 1 * time.Minute
)
var opsScheduledReportCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
var opsScheduledReportReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
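// The script above is the standard compare-and-delete unlock: the key is
// removed only while it still holds this instance's ID, so a lock that expired
// and was re-acquired by another instance is never released by the old owner.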
type OpsScheduledReportService struct {
opsService *OpsService
userService *UserService
emailService *EmailService
redisClient *redis.Client
cfg *config.Config
instanceID string
loc *time.Location
distributedLockOn bool
warnNoRedisOnce sync.Once
startOnce sync.Once
stopOnce sync.Once
stopCtx context.Context
stop context.CancelFunc
wg sync.WaitGroup
}
func NewOpsScheduledReportService(
opsService *OpsService,
userService *UserService,
emailService *EmailService,
redisClient *redis.Client,
cfg *config.Config,
) *OpsScheduledReportService {
lockOn := cfg == nil || strings.TrimSpace(cfg.RunMode) != config.RunModeSimple
loc := time.Local
if cfg != nil && strings.TrimSpace(cfg.Timezone) != "" {
if parsed, err := time.LoadLocation(strings.TrimSpace(cfg.Timezone)); err == nil && parsed != nil {
loc = parsed
}
}
return &OpsScheduledReportService{
opsService: opsService,
userService: userService,
emailService: emailService,
redisClient: redisClient,
cfg: cfg,
instanceID: uuid.NewString(),
loc: loc,
distributedLockOn: lockOn,
warnNoRedisOnce: sync.Once{},
startOnce: sync.Once{},
stopOnce: sync.Once{},
stopCtx: nil,
stop: nil,
wg: sync.WaitGroup{},
}
}
func (s *OpsScheduledReportService) Start() {
s.StartWithContext(context.Background())
}
func (s *OpsScheduledReportService) StartWithContext(ctx context.Context) {
if s == nil {
return
}
if ctx == nil {
ctx = context.Background()
}
if s.cfg != nil && !s.cfg.Ops.Enabled {
return
}
if s.opsService == nil || s.emailService == nil {
return
}
s.startOnce.Do(func() {
s.stopCtx, s.stop = context.WithCancel(ctx)
s.wg.Add(1)
go s.run()
})
}
func (s *OpsScheduledReportService) Stop() {
if s == nil {
return
}
s.stopOnce.Do(func() {
if s.stop != nil {
s.stop()
}
})
s.wg.Wait()
}
func (s *OpsScheduledReportService) run() {
defer s.wg.Done()
ticker := time.NewTicker(opsScheduledReportTickInterval)
defer ticker.Stop()
s.runOnce()
for {
select {
case <-ticker.C:
s.runOnce()
case <-s.stopCtx.Done():
return
}
}
}
func (s *OpsScheduledReportService) runOnce() {
if s == nil || s.opsService == nil || s.emailService == nil {
return
}
startedAt := time.Now().UTC()
runAt := startedAt
ctx, cancel := context.WithTimeout(s.stopCtx, 60*time.Second)
defer cancel()
// Respect ops monitoring enabled switch.
if !s.opsService.IsMonitoringEnabled(ctx) {
return
}
release, ok := s.tryAcquireLeaderLock(ctx)
if !ok {
return
}
if release != nil {
defer release()
}
now := time.Now()
if s.loc != nil {
now = now.In(s.loc)
}
reports := s.listScheduledReports(ctx, now)
if len(reports) == 0 {
return
}
for _, report := range reports {
if report == nil || !report.Enabled {
continue
}
if report.NextRunAt.After(now) {
continue
}
if err := s.runReport(ctx, report, now); err != nil {
s.recordHeartbeatError(runAt, time.Since(startedAt), err)
return
}
}
s.recordHeartbeatSuccess(runAt, time.Since(startedAt))
}
type opsScheduledReport struct {
Name string
ReportType string
Schedule string
Enabled bool
TimeRange time.Duration
Recipients []string
ErrorDigestMinCount int
AccountHealthErrorRateThreshold float64
LastRunAt *time.Time
NextRunAt time.Time
}
func (s *OpsScheduledReportService) listScheduledReports(ctx context.Context, now time.Time) []*opsScheduledReport {
if s == nil || s.opsService == nil {
return nil
}
if ctx == nil {
ctx = context.Background()
}
emailCfg, err := s.opsService.GetEmailNotificationConfig(ctx)
if err != nil || emailCfg == nil {
return nil
}
if !emailCfg.Report.Enabled {
return nil
}
recipients := normalizeEmails(emailCfg.Report.Recipients)
type reportDef struct {
enabled bool
name string
kind string
timeRange time.Duration
schedule string
}
defs := []reportDef{
		{enabled: emailCfg.Report.DailySummaryEnabled, name: "Daily Summary", kind: "daily_summary", timeRange: 24 * time.Hour, schedule: emailCfg.Report.DailySummarySchedule},
		{enabled: emailCfg.Report.WeeklySummaryEnabled, name: "Weekly Summary", kind: "weekly_summary", timeRange: 7 * 24 * time.Hour, schedule: emailCfg.Report.WeeklySummarySchedule},
		{enabled: emailCfg.Report.ErrorDigestEnabled, name: "Error Digest", kind: "error_digest", timeRange: 24 * time.Hour, schedule: emailCfg.Report.ErrorDigestSchedule},
		{enabled: emailCfg.Report.AccountHealthEnabled, name: "Account Health", kind: "account_health", timeRange: 24 * time.Hour, schedule: emailCfg.Report.AccountHealthSchedule},
}
out := make([]*opsScheduledReport, 0, len(defs))
for _, d := range defs {
if !d.enabled {
continue
}
spec := strings.TrimSpace(d.schedule)
if spec == "" {
continue
}
sched, err := opsScheduledReportCronParser.Parse(spec)
if err != nil {
log.Printf("[OpsScheduledReport] invalid cron spec=%q for report=%s: %v", spec, d.kind, err)
continue
}
lastRun := s.getLastRunAt(ctx, d.kind)
base := lastRun
if base.IsZero() {
// Allow a schedule matching the current minute to trigger right after startup.
base = now.Add(-1 * time.Minute)
}
next := sched.Next(base)
if next.IsZero() {
continue
}
var lastRunPtr *time.Time
if !lastRun.IsZero() {
lastCopy := lastRun
lastRunPtr = &lastCopy
}
out = append(out, &opsScheduledReport{
Name: d.name,
ReportType: d.kind,
Schedule: spec,
Enabled: true,
TimeRange: d.timeRange,
Recipients: recipients,
ErrorDigestMinCount: emailCfg.Report.ErrorDigestMinCount,
AccountHealthErrorRateThreshold: emailCfg.Report.AccountHealthErrorRateThreshold,
LastRunAt: lastRunPtr,
NextRunAt: next,
})
}
return out
}
func (s *OpsScheduledReportService) runReport(ctx context.Context, report *opsScheduledReport, now time.Time) error {
if s == nil || s.opsService == nil || s.emailService == nil || report == nil {
return nil
}
if ctx == nil {
ctx = context.Background()
}
// Mark as "run" up-front so a broken SMTP config doesn't spam retries every minute.
s.setLastRunAt(ctx, report.ReportType, now)
content, err := s.generateReportHTML(ctx, report, now)
if err != nil {
return err
}
if strings.TrimSpace(content) == "" {
// Skip sending when the report decides not to emit content (e.g., digest below min count).
return nil
}
recipients := report.Recipients
if len(recipients) == 0 && s.userService != nil {
admin, err := s.userService.GetFirstAdmin(ctx)
if err == nil && admin != nil && strings.TrimSpace(admin.Email) != "" {
recipients = []string{strings.TrimSpace(admin.Email)}
}
}
if len(recipients) == 0 {
return nil
}
subject := fmt.Sprintf("[Ops Report] %s", strings.TrimSpace(report.Name))
for _, to := range recipients {
addr := strings.TrimSpace(to)
if addr == "" {
continue
}
if err := s.emailService.SendEmail(ctx, addr, subject, content); err != nil {
// Ignore per-recipient failures; continue best-effort.
continue
}
}
return nil
}
func (s *OpsScheduledReportService) generateReportHTML(ctx context.Context, report *opsScheduledReport, now time.Time) (string, error) {
if s == nil || s.opsService == nil || report == nil {
return "", fmt.Errorf("service not initialized")
}
if report.TimeRange <= 0 {
return "", fmt.Errorf("invalid time range")
}
end := now.UTC()
start := end.Add(-report.TimeRange)
switch strings.TrimSpace(report.ReportType) {
case "daily_summary", "weekly_summary":
overview, err := s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
StartTime: start,
EndTime: end,
Platform: "",
GroupID: nil,
QueryMode: OpsQueryModeAuto,
})
		if err != nil {
			// Pre-aggregated metrics may not be populated yet; fall back to raw logs.
			overview, err = s.opsService.GetDashboardOverview(ctx, &OpsDashboardFilter{
				StartTime: start,
				EndTime:   end,
				Platform:  "",
				GroupID:   nil,
				QueryMode: OpsQueryModeRaw,
			})
			if err != nil {
				return "", err
			}
		}
return buildOpsSummaryEmailHTML(report.Name, start, end, overview), nil
case "error_digest":
		// Lightweight digest: list the most recent errors in the window
		// (capped at 100 rows; the email renders the top 10).
startTime := start
endTime := end
filter := &OpsErrorLogFilter{
StartTime: &startTime,
EndTime: &endTime,
Page: 1,
PageSize: 100,
}
out, err := s.opsService.GetErrorLogs(ctx, filter)
if err != nil {
return "", err
}
if report.ErrorDigestMinCount > 0 && out != nil && out.Total < report.ErrorDigestMinCount {
return "", nil
}
return buildOpsErrorDigestEmailHTML(report.Name, start, end, out), nil
case "account_health":
// Best-effort: use account availability (not error rate yet).
avail, err := s.opsService.GetAccountAvailability(ctx, "", nil)
if err != nil {
return "", err
}
_ = report.AccountHealthErrorRateThreshold // reserved for future per-account error rate report
return buildOpsAccountHealthEmailHTML(report.Name, start, end, avail), nil
default:
return "", fmt.Errorf("unknown report type: %s", report.ReportType)
}
}
func buildOpsSummaryEmailHTML(title string, start, end time.Time, overview *OpsDashboardOverview) string {
if overview == nil {
return fmt.Sprintf("<h2>%s</h2><p>No data.</p>", htmlEscape(title))
}
latP50 := "-"
latP99 := "-"
if overview.Duration.P50 != nil {
latP50 = fmt.Sprintf("%dms", *overview.Duration.P50)
}
if overview.Duration.P99 != nil {
latP99 = fmt.Sprintf("%dms", *overview.Duration.P99)
}
ttftP50 := "-"
ttftP99 := "-"
if overview.TTFT.P50 != nil {
ttftP50 = fmt.Sprintf("%dms", *overview.TTFT.P50)
}
if overview.TTFT.P99 != nil {
ttftP99 = fmt.Sprintf("%dms", *overview.TTFT.P99)
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Requests</b>: %d</li>
<li><b>Success</b>: %d</li>
<li><b>Errors (SLA)</b>: %d</li>
<li><b>Business Limited</b>: %d</li>
<li><b>SLA</b>: %.2f%%</li>
<li><b>Error Rate</b>: %.2f%%</li>
<li><b>Upstream Error Rate (excl 429/529)</b>: %.2f%%</li>
<li><b>Upstream Errors</b>: excl429/529=%d, 429=%d, 529=%d</li>
<li><b>Latency</b>: p50=%s, p99=%s</li>
<li><b>TTFT</b>: p50=%s, p99=%s</li>
<li><b>Tokens</b>: %d</li>
<li><b>QPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
<li><b>TPS</b>: current=%.1f, peak=%.1f, avg=%.1f</li>
</ul>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
overview.RequestCountTotal,
overview.SuccessCount,
overview.ErrorCountSLA,
overview.BusinessLimitedCount,
overview.SLA*100,
overview.ErrorRate*100,
overview.UpstreamErrorRate*100,
overview.UpstreamErrorCountExcl429529,
overview.Upstream429Count,
overview.Upstream529Count,
htmlEscape(latP50),
htmlEscape(latP99),
htmlEscape(ttftP50),
htmlEscape(ttftP99),
overview.TokenConsumed,
overview.QPS.Current,
overview.QPS.Peak,
overview.QPS.Avg,
overview.TPS.Current,
overview.TPS.Peak,
overview.TPS.Avg,
)
}
func buildOpsErrorDigestEmailHTML(title string, start, end time.Time, list *OpsErrorLogList) string {
total := 0
recent := []*OpsErrorLog{}
if list != nil {
total = list.Total
recent = list.Errors
}
if len(recent) > 10 {
recent = recent[:10]
}
rows := ""
for _, item := range recent {
if item == nil {
continue
}
rows += fmt.Sprintf(
"<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>",
htmlEscape(item.CreatedAt.UTC().Format(time.RFC3339)),
htmlEscape(item.Platform),
item.StatusCode,
htmlEscape(truncateString(item.Message, 180)),
)
}
if rows == "" {
rows = "<tr><td colspan=\"4\">No recent errors.</td></tr>"
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<p><b>Total Errors</b>: %d</p>
<h3>Recent</h3>
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;">
<thead><tr><th>Time</th><th>Platform</th><th>Status</th><th>Message</th></tr></thead>
<tbody>%s</tbody>
</table>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
total,
rows,
)
}
func buildOpsAccountHealthEmailHTML(title string, start, end time.Time, avail *OpsAccountAvailability) string {
total := 0
available := 0
rateLimited := 0
hasError := 0
if avail != nil && avail.Accounts != nil {
for _, a := range avail.Accounts {
if a == nil {
continue
}
total++
if a.IsAvailable {
available++
}
if a.IsRateLimited {
rateLimited++
}
if a.HasError {
hasError++
}
}
}
return fmt.Sprintf(`
<h2>%s</h2>
<p><b>Period</b>: %s ~ %s (UTC)</p>
<ul>
<li><b>Total Accounts</b>: %d</li>
<li><b>Available</b>: %d</li>
<li><b>Rate Limited</b>: %d</li>
<li><b>Error</b>: %d</li>
</ul>
<p>Note: This report currently reflects account availability status only.</p>
`,
htmlEscape(strings.TrimSpace(title)),
htmlEscape(start.UTC().Format(time.RFC3339)),
htmlEscape(end.UTC().Format(time.RFC3339)),
total,
available,
rateLimited,
hasError,
)
}
func (s *OpsScheduledReportService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
if s == nil || !s.distributedLockOn {
return nil, true
}
if s.redisClient == nil {
s.warnNoRedisOnce.Do(func() {
log.Printf("[OpsScheduledReport] redis not configured; running without distributed lock")
})
return nil, true
}
if ctx == nil {
ctx = context.Background()
}
	key := opsScheduledReportLeaderLockKeyDefault
	ttl := opsScheduledReportLeaderLockTTLDefault
ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
if err != nil {
// Prefer fail-closed to avoid duplicate report sends when Redis is flaky.
log.Printf("[OpsScheduledReport] leader lock SetNX failed; skipping this cycle: %v", err)
return nil, false
}
if !ok {
return nil, false
}
return func() {
_, _ = opsScheduledReportReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
}, true
}
func (s *OpsScheduledReportService) getLastRunAt(ctx context.Context, reportType string) time.Time {
if s == nil || s.redisClient == nil {
return time.Time{}
}
kind := strings.TrimSpace(reportType)
if kind == "" {
return time.Time{}
}
key := opsScheduledReportLastRunKeyPrefix + kind
raw, err := s.redisClient.Get(ctx, key).Result()
if err != nil || strings.TrimSpace(raw) == "" {
return time.Time{}
}
sec, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
if err != nil || sec <= 0 {
return time.Time{}
}
last := time.Unix(sec, 0)
// Cron schedules are interpreted in the configured timezone (s.loc). Ensure the base time
// passed into cron.Next() uses the same location; otherwise the job will drift by timezone
// offset (e.g. Asia/Shanghai default would run 8h later after the first execution).
if s.loc != nil {
return last.In(s.loc)
}
return last.UTC()
}
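// Worked example (assuming cfg.Timezone = "Asia/Shanghai"): a last run stored
// as Unix seconds for 09:00 local is 01:00 UTC. Feeding the UTC value into a
// "0 9 * * *" schedule would make cron.Next() resolve to the next 09:00 in UTC
// terms (17:00 local), pushing every subsequent run 8 hours late; rebasing
// into s.loc keeps the schedule anchored to local 09:00.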
func (s *OpsScheduledReportService) setLastRunAt(ctx context.Context, reportType string, t time.Time) {
if s == nil || s.redisClient == nil {
return
}
kind := strings.TrimSpace(reportType)
if kind == "" {
return
}
if t.IsZero() {
t = time.Now().UTC()
}
key := opsScheduledReportLastRunKeyPrefix + kind
_ = s.redisClient.Set(ctx, key, strconv.FormatInt(t.UTC().Unix(), 10), 14*24*time.Hour).Err()
}
func (s *OpsScheduledReportService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration) {
if s == nil || s.opsService == nil || s.opsService.opsRepo == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsScheduledReportJobName,
LastRunAt: &runAt,
LastSuccessAt: &now,
LastDurationMs: &durMs,
})
}
func (s *OpsScheduledReportService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
if s == nil || s.opsService == nil || s.opsService.opsRepo == nil || err == nil {
return
}
now := time.Now().UTC()
durMs := duration.Milliseconds()
msg := truncateString(err.Error(), 2048)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_ = s.opsService.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
JobName: opsScheduledReportJobName,
LastRunAt: &runAt,
LastErrorAt: &now,
LastError: &msg,
LastDurationMs: &durMs,
})
}
func normalizeEmails(in []string) []string {
if len(in) == 0 {
return nil
}
seen := make(map[string]struct{}, len(in))
out := make([]string, 0, len(in))
for _, raw := range in {
addr := strings.ToLower(strings.TrimSpace(raw))
if addr == "" {
continue
}
if _, ok := seen[addr]; ok {
continue
}
seen[addr] = struct{}{}
out = append(out, addr)
}
return out
}
package service
import (
"context"
"database/sql"
"encoding/json"
"errors"
"log"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/config"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
var ErrOpsDisabled = infraerrors.NotFound("OPS_DISABLED", "Ops monitoring is disabled")
const (
opsMaxStoredRequestBodyBytes = 10 * 1024
opsMaxStoredErrorBodyBytes = 20 * 1024
)
// OpsService provides ingestion and query APIs for the Ops monitoring module.
type OpsService struct {
opsRepo OpsRepository
settingRepo SettingRepository
cfg *config.Config
accountRepo AccountRepository
// getAccountAvailability is a unit-test hook for overriding account availability lookup.
getAccountAvailability func(ctx context.Context, platformFilter string, groupIDFilter *int64) (*OpsAccountAvailability, error)
concurrencyService *ConcurrencyService
gatewayService *GatewayService
openAIGatewayService *OpenAIGatewayService
geminiCompatService *GeminiMessagesCompatService
antigravityGatewayService *AntigravityGatewayService
}
func NewOpsService(
opsRepo OpsRepository,
settingRepo SettingRepository,
cfg *config.Config,
accountRepo AccountRepository,
concurrencyService *ConcurrencyService,
gatewayService *GatewayService,
openAIGatewayService *OpenAIGatewayService,
geminiCompatService *GeminiMessagesCompatService,
antigravityGatewayService *AntigravityGatewayService,
) *OpsService {
return &OpsService{
opsRepo: opsRepo,
settingRepo: settingRepo,
cfg: cfg,
accountRepo: accountRepo,
concurrencyService: concurrencyService,
gatewayService: gatewayService,
openAIGatewayService: openAIGatewayService,
geminiCompatService: geminiCompatService,
antigravityGatewayService: antigravityGatewayService,
}
}
func (s *OpsService) RequireMonitoringEnabled(ctx context.Context) error {
if s.IsMonitoringEnabled(ctx) {
return nil
}
return ErrOpsDisabled
}
func (s *OpsService) IsMonitoringEnabled(ctx context.Context) bool {
// Hard switch: disable ops entirely.
if s.cfg != nil && !s.cfg.Ops.Enabled {
return false
}
if s.settingRepo == nil {
return true
}
value, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMonitoringEnabled)
	if err != nil {
		// Fail open: a missing key (ErrSettingNotFound) means "default enabled",
		// and transient read errors must never block gateway traffic.
		return true
	}
switch strings.ToLower(strings.TrimSpace(value)) {
case "false", "0", "off", "disabled":
return false
default:
return true
}
}
func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogInput, rawRequestBody []byte) error {
if entry == nil {
return nil
}
if !s.IsMonitoringEnabled(ctx) {
return nil
}
if s.opsRepo == nil {
return nil
}
// Ensure timestamps are always populated.
if entry.CreatedAt.IsZero() {
entry.CreatedAt = time.Now()
}
// Ensure required fields exist (DB has NOT NULL constraints).
entry.ErrorPhase = strings.TrimSpace(entry.ErrorPhase)
entry.ErrorType = strings.TrimSpace(entry.ErrorType)
if entry.ErrorPhase == "" {
entry.ErrorPhase = "internal"
}
if entry.ErrorType == "" {
entry.ErrorType = "api_error"
}
// Sanitize + trim request body (errors only).
if len(rawRequestBody) > 0 {
sanitized, truncated, bytesLen := sanitizeAndTrimRequestBody(rawRequestBody, opsMaxStoredRequestBodyBytes)
if sanitized != "" {
entry.RequestBodyJSON = &sanitized
}
entry.RequestBodyTruncated = truncated
entry.RequestBodyBytes = &bytesLen
}
// Sanitize + truncate error_body to avoid storing sensitive data.
if strings.TrimSpace(entry.ErrorBody) != "" {
sanitized, _ := sanitizeErrorBodyForStorage(entry.ErrorBody, opsMaxStoredErrorBodyBytes)
entry.ErrorBody = sanitized
}
// Sanitize upstream error context if provided by gateway services.
if entry.UpstreamStatusCode != nil && *entry.UpstreamStatusCode <= 0 {
entry.UpstreamStatusCode = nil
}
if entry.UpstreamErrorMessage != nil {
msg := strings.TrimSpace(*entry.UpstreamErrorMessage)
msg = sanitizeUpstreamErrorMessage(msg)
msg = truncateString(msg, 2048)
if strings.TrimSpace(msg) == "" {
entry.UpstreamErrorMessage = nil
} else {
entry.UpstreamErrorMessage = &msg
}
}
if entry.UpstreamErrorDetail != nil {
detail := strings.TrimSpace(*entry.UpstreamErrorDetail)
if detail == "" {
entry.UpstreamErrorDetail = nil
} else {
sanitized, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
if strings.TrimSpace(sanitized) == "" {
entry.UpstreamErrorDetail = nil
} else {
entry.UpstreamErrorDetail = &sanitized
}
}
}
// Sanitize + serialize upstream error events list.
if len(entry.UpstreamErrors) > 0 {
const maxEvents = 32
events := entry.UpstreamErrors
if len(events) > maxEvents {
events = events[len(events)-maxEvents:]
}
sanitized := make([]*OpsUpstreamErrorEvent, 0, len(events))
for _, ev := range events {
if ev == nil {
continue
}
out := *ev
out.Platform = strings.TrimSpace(out.Platform)
out.UpstreamRequestID = truncateString(strings.TrimSpace(out.UpstreamRequestID), 128)
out.Kind = truncateString(strings.TrimSpace(out.Kind), 64)
if out.AccountID < 0 {
out.AccountID = 0
}
if out.UpstreamStatusCode < 0 {
out.UpstreamStatusCode = 0
}
if out.AtUnixMs < 0 {
out.AtUnixMs = 0
}
msg := sanitizeUpstreamErrorMessage(strings.TrimSpace(out.Message))
msg = truncateString(msg, 2048)
out.Message = msg
detail := strings.TrimSpace(out.Detail)
if detail != "" {
// Keep upstream detail small; request bodies are not stored here, only upstream error payloads.
sanitizedDetail, _ := sanitizeErrorBodyForStorage(detail, opsMaxStoredErrorBodyBytes)
out.Detail = sanitizedDetail
} else {
out.Detail = ""
}
			// Drop events that carry no signal at all (no status code, message,
			// or detail), which can happen after sanitization strips everything.
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
continue
}
evCopy := out
sanitized = append(sanitized, &evCopy)
}
entry.UpstreamErrorsJSON = marshalOpsUpstreamErrors(sanitized)
entry.UpstreamErrors = nil
}
if _, err := s.opsRepo.InsertErrorLog(ctx, entry); err != nil {
		// Best-effort persistence: log here; gateway call sites are expected
		// to ignore this error rather than fail traffic on it.
log.Printf("[Ops] RecordError failed: %v", err)
return err
}
return nil
}
func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter) (*OpsErrorLogList, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
}
return s.opsRepo.ListErrorLogs(ctx, filter)
}
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
detail, err := s.opsRepo.GetErrorLogByID(ctx, id)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
return nil, infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
}
return detail, nil
}
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
bytesLen = len(raw)
if len(raw) == 0 {
return "", false, 0
}
var decoded any
if err := json.Unmarshal(raw, &decoded); err != nil {
// If it's not valid JSON, don't store (retry would not be reliable anyway).
return "", false, bytesLen
}
decoded = redactSensitiveJSON(decoded)
encoded, err := json.Marshal(decoded)
if err != nil {
return "", false, bytesLen
}
if len(encoded) <= maxBytes {
return string(encoded), false, bytesLen
}
// Trim conversation history to keep the most recent context.
if root, ok := decoded.(map[string]any); ok {
if trimmed, ok := trimConversationArrays(root, maxBytes); ok {
encoded2, err2 := json.Marshal(trimmed)
if err2 == nil && len(encoded2) <= maxBytes {
return string(encoded2), true, bytesLen
}
// Fallthrough: keep shrinking.
decoded = trimmed
}
essential := shrinkToEssentials(root)
encoded3, err3 := json.Marshal(essential)
if err3 == nil && len(encoded3) <= maxBytes {
return string(encoded3), true, bytesLen
}
}
// Last resort: store a minimal placeholder (still valid JSON).
placeholder := map[string]any{
"request_body_truncated": true,
}
if model := extractString(decoded, "model"); model != "" {
placeholder["model"] = model
}
encoded4, err4 := json.Marshal(placeholder)
if err4 != nil {
return "", true, bytesLen
}
return string(encoded4), true, bytesLen
}
func redactSensitiveJSON(v any) any {
switch t := v.(type) {
case map[string]any:
out := make(map[string]any, len(t))
for k, vv := range t {
if isSensitiveKey(k) {
out[k] = "[REDACTED]"
continue
}
out[k] = redactSensitiveJSON(vv)
}
return out
case []any:
out := make([]any, 0, len(t))
for _, vv := range t {
out = append(out, redactSensitiveJSON(vv))
}
return out
default:
return v
}
}
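// Illustrative transformation (hypothetical payload):
//
//	in:  {"model":"m","api_key":"sk-123","nested":{"password":"x"}}
//	out: {"model":"m","api_key":"[REDACTED]","nested":{"password":"[REDACTED]"}}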
func isSensitiveKey(key string) bool {
k := strings.ToLower(strings.TrimSpace(key))
if k == "" {
return false
}
// Exact matches (common credential fields).
switch k {
case "authorization",
"proxy-authorization",
"x-api-key",
"api_key",
"apikey",
"access_token",
"refresh_token",
"id_token",
"session_token",
"token",
"password",
"passwd",
"passphrase",
"secret",
"client_secret",
"private_key",
"jwt",
"signature",
"accesskeyid",
"secretaccesskey":
return true
}
// Suffix matches.
for _, suffix := range []string{
"_secret",
"_token",
"_id_token",
"_session_token",
"_password",
"_passwd",
"_passphrase",
"_key",
"secret_key",
"private_key",
} {
if strings.HasSuffix(k, suffix) {
return true
}
}
// Substring matches (conservative, but errs on the side of privacy).
for _, sub := range []string{
"secret",
"token",
"password",
"passwd",
"passphrase",
"privatekey",
"private_key",
"apikey",
"api_key",
"accesskeyid",
"secretaccesskey",
"bearer",
"cookie",
"credential",
"session",
"jwt",
"signature",
} {
if strings.Contains(k, sub) {
return true
}
}
return false
}
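// Illustrative matches (not exhaustive):
//
//	isSensitiveKey("Authorization")  // true  (exact match after lowering)
//	isSensitiveKey("openai_api_key") // true  (suffix "_key")
//	isSensitiveKey("session_id")     // true  (substring "session")
//	isSensitiveKey("model")          // false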
func trimConversationArrays(root map[string]any, maxBytes int) (map[string]any, bool) {
// Supported: anthropic/openai: messages; gemini: contents.
if out, ok := trimArrayField(root, "messages", maxBytes); ok {
return out, true
}
if out, ok := trimArrayField(root, "contents", maxBytes); ok {
return out, true
}
return root, false
}
func trimArrayField(root map[string]any, field string, maxBytes int) (map[string]any, bool) {
raw, ok := root[field]
if !ok {
return nil, false
}
arr, ok := raw.([]any)
if !ok || len(arr) == 0 {
return nil, false
}
// Keep at least the last message/content. Use binary search so we don't marshal O(n) times.
// We are dropping from the *front* of the array (oldest context first).
lo := 0
hi := len(arr) - 1 // inclusive; hi ensures at least one item remains
var best map[string]any
found := false
for lo <= hi {
mid := (lo + hi) / 2
candidateArr := arr[mid:]
if len(candidateArr) == 0 {
lo = mid + 1
continue
}
next := shallowCopyMap(root)
next[field] = candidateArr
encoded, err := json.Marshal(next)
if err != nil {
// If marshal fails, try dropping more.
lo = mid + 1
continue
}
if len(encoded) <= maxBytes {
best = next
found = true
// Try to keep more context by dropping fewer items.
hi = mid - 1
continue
}
// Need to drop more.
lo = mid + 1
}
if found {
return best, true
}
// Nothing fit (even with only one element); return the smallest slice and let the
// caller fall back to shrinkToEssentials().
next := shallowCopyMap(root)
next[field] = arr[len(arr)-1:]
return next, true
}
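// Worked example (illustrative): with 8 messages and a budget that fits only
// the last three, the search probes mid=3 (keep arr[3:], 5 items -> too big),
// then mid=5 (keep arr[5:], 3 items -> fits, remember it), then mid=4 (4 items
// -> too big), settling on arr[5:] after O(log n) Marshal calls instead of O(n).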
func shrinkToEssentials(root map[string]any) map[string]any {
out := make(map[string]any)
for _, key := range []string{"model", "stream", "max_tokens", "temperature", "top_p", "top_k"} {
if v, ok := root[key]; ok {
out[key] = v
}
}
// Keep only the last element of the conversation array.
if v, ok := root["messages"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["messages"] = []any{arr[len(arr)-1]}
}
}
if v, ok := root["contents"]; ok {
if arr, ok := v.([]any); ok && len(arr) > 0 {
out["contents"] = []any{arr[len(arr)-1]}
}
}
return out
}
func shallowCopyMap(m map[string]any) map[string]any {
out := make(map[string]any, len(m))
for k, v := range m {
out[k] = v
}
return out
}
func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, truncated bool) {
raw = strings.TrimSpace(raw)
if raw == "" {
return "", false
}
// Prefer JSON-safe sanitization when possible.
if out, trunc, _ := sanitizeAndTrimRequestBody([]byte(raw), maxBytes); out != "" {
return out, trunc
}
// Non-JSON: best-effort truncate.
if maxBytes > 0 && len(raw) > maxBytes {
return truncateString(raw, maxBytes), true
}
return raw, false
}
func extractString(v any, key string) string {
root, ok := v.(map[string]any)
if !ok {
return ""
}
s, _ := root[key].(string)
return strings.TrimSpace(s)
}
package service
import (
"context"
"encoding/json"
"errors"
"strings"
"time"
)
const (
opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
defaultCfg := defaultOpsEmailNotificationConfig()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// Initialize defaults on first read (best-effort).
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsEmailNotificationConfig{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupted JSON should not break ops UI; fall back to defaults.
return defaultCfg, nil
}
normalizeOpsEmailNotificationConfig(cfg)
return cfg, nil
}
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if req == nil {
return nil, errors.New("invalid request")
}
cfg, err := s.GetEmailNotificationConfig(ctx)
if err != nil {
return nil, err
}
if req.Alert != nil {
cfg.Alert.Enabled = req.Alert.Enabled
if req.Alert.Recipients != nil {
cfg.Alert.Recipients = req.Alert.Recipients
}
cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
}
if req.Report != nil {
cfg.Report.Enabled = req.Report.Enabled
if req.Report.Recipients != nil {
cfg.Report.Recipients = req.Report.Recipients
}
cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
}
if err := validateOpsEmailNotificationConfig(cfg); err != nil {
return nil, err
}
normalizeOpsEmailNotificationConfig(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
return nil, err
}
return cfg, nil
}
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
return &OpsEmailNotificationConfig{
Alert: OpsEmailAlertConfig{
Enabled: true,
Recipients: []string{},
MinSeverity: "",
RateLimitPerHour: 0,
BatchingWindowSeconds: 0,
IncludeResolvedAlerts: false,
},
Report: OpsEmailReportConfig{
Enabled: false,
Recipients: []string{},
DailySummaryEnabled: false,
DailySummarySchedule: "0 9 * * *",
WeeklySummaryEnabled: false,
WeeklySummarySchedule: "0 9 * * 1",
ErrorDigestEnabled: false,
ErrorDigestSchedule: "0 9 * * *",
ErrorDigestMinCount: 10,
AccountHealthEnabled: false,
AccountHealthSchedule: "0 9 * * *",
AccountHealthErrorRateThreshold: 10.0,
},
}
}
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
if cfg == nil {
return
}
if cfg.Alert.Recipients == nil {
cfg.Alert.Recipients = []string{}
}
if cfg.Report.Recipients == nil {
cfg.Report.Recipients = []string{}
}
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
if cfg.Report.DailySummarySchedule == "" {
cfg.Report.DailySummarySchedule = "0 9 * * *"
}
if cfg.Report.WeeklySummarySchedule == "" {
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
}
if cfg.Report.ErrorDigestSchedule == "" {
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
}
if cfg.Report.AccountHealthSchedule == "" {
cfg.Report.AccountHealthSchedule = "0 9 * * *"
}
}
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.Alert.RateLimitPerHour < 0 {
return errors.New("alert.rate_limit_per_hour must be >= 0")
}
if cfg.Alert.BatchingWindowSeconds < 0 {
return errors.New("alert.batching_window_seconds must be >= 0")
}
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
case "", "critical", "warning", "info":
default:
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
}
if cfg.Report.ErrorDigestMinCount < 0 {
return errors.New("report.error_digest_min_count must be >= 0")
}
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
}
return nil
}
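// Illustrative test sketch (an assumption, not part of the original commit;
// it would live in a _test.go file in this package):
//
//	func TestValidateOpsEmailNotificationConfig(t *testing.T) {
//		if err := validateOpsEmailNotificationConfig(defaultOpsEmailNotificationConfig()); err != nil {
//			t.Fatalf("default config should validate: %v", err)
//		}
//		bad := defaultOpsEmailNotificationConfig()
//		bad.Report.AccountHealthErrorRateThreshold = 150 // outside [0, 100]
//		if err := validateOpsEmailNotificationConfig(bad); err == nil {
//			t.Fatal("expected a threshold range error")
//		}
//	}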
// =========================
// Alert runtime settings
// =========================
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
return &OpsAlertRuntimeSettings{
EvaluationIntervalSeconds: 60,
DistributedLock: OpsDistributedLockSettings{
Enabled: true,
Key: opsAlertEvaluatorLeaderLockKeyDefault,
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
},
Silencing: OpsAlertSilencingSettings{
Enabled: false,
GlobalUntilRFC3339: "",
GlobalReason: "",
Entries: []OpsAlertSilenceEntry{},
},
}
}
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
if s == nil {
return
}
s.Key = strings.TrimSpace(s.Key)
if s.Key == "" {
s.Key = defaultKey
}
if s.TTLSeconds <= 0 {
s.TTLSeconds = defaultTTLSeconds
}
}
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
if s == nil {
return
}
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
if s.Entries == nil {
s.Entries = []OpsAlertSilenceEntry{}
}
for i := range s.Entries {
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
}
}
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
if strings.TrimSpace(s.Key) == "" {
return errors.New("distributed_lock.key is required")
}
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
}
return nil
}
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
parse := func(raw string) error {
if strings.TrimSpace(raw) == "" {
return nil
}
if _, err := time.Parse(time.RFC3339, raw); err != nil {
return errors.New("silencing time must be RFC3339")
}
return nil
}
if err := parse(s.GlobalUntilRFC3339); err != nil {
return err
}
for _, entry := range s.Entries {
if strings.TrimSpace(entry.UntilRFC3339) == "" {
return errors.New("silencing.entries.until_rfc3339 is required")
}
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
}
}
return nil
}
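// Hedged example of a payload that passes the checks above: the global window
// may be empty, but every explicit entry needs a parseable RFC3339 timestamp.
// The concrete values are illustrative only.
//
//	s := OpsAlertSilencingSettings{
//		Enabled:            true,
//		GlobalUntilRFC3339: "", // allowed: no global silence window
//		Entries: []OpsAlertSilenceEntry{{
//			Severities:   []string{"warning"},
//			UntilRFC3339: "2025-01-02T15:04:05Z",
//			Reason:       "planned maintenance",
//		}},
//	}
//	// validateOpsAlertSilencingSettings(s) returns nil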
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
defaultCfg := defaultOpsAlertRuntimeSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// First read: seed the defaults (best-effort) so later reads and updates operate on the same stored blob.
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAlertRuntimeSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupt or legacy JSON: fall back to defaults rather than failing the read path.
return defaultCfg, nil
}
if cfg.EvaluationIntervalSeconds <= 0 {
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
}
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
return cfg, nil
}
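// Sketch (an assumption about the consumer, which is not shown in this
// excerpt) of how the alert evaluator might turn these settings into a
// polling loop:
//
//	settings, err := svc.GetOpsAlertRuntimeSettings(ctx)
//	if err != nil {
//		return err
//	}
//	ticker := time.NewTicker(time.Duration(settings.EvaluationIntervalSeconds) * time.Second)
//	defer ticker.Stop()
//	for range ticker.C {
//		// evaluate alert rules, honoring settings.Silencing and the distributed lock
//	}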
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
}
if cfg.DistributedLock.Enabled {
if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
return nil, err
}
}
if cfg.Silencing.Enabled {
if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
return nil, err
}
}
defaultCfg := defaultOpsAlertRuntimeSettings()
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
return nil, err
}
// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
updated := &OpsAlertRuntimeSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
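// The marshal/unmarshal round trip above doubles as a cheap deep copy. A
// minimal sketch of the aliasing it prevents (values illustrative):
//
//	in := &OpsAlertRuntimeSettings{
//		EvaluationIntervalSeconds: 60,
//		Silencing: OpsAlertSilencingSettings{
//			Entries: []OpsAlertSilenceEntry{{UntilRFC3339: "2025-01-02T15:04:05Z"}},
//		},
//	}
//	out, _ := svc.UpdateOpsAlertRuntimeSettings(ctx, in)
//	out.Silencing.Entries[0].Reason = "edited"
//	// in.Silencing.Entries[0].Reason is still "": out owns its own slices.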
// =========================
// Advanced settings
// =========================
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
return &OpsAdvancedSettings{
DataRetention: OpsDataRetentionSettings{
CleanupEnabled: false,
CleanupSchedule: "0 2 * * *",
ErrorLogRetentionDays: 30,
MinuteMetricsRetentionDays: 30,
HourlyMetricsRetentionDays: 30,
},
Aggregation: OpsAggregationSettings{
AggregationEnabled: false,
},
}
}
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
if cfg == nil {
return
}
cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
if cfg.DataRetention.CleanupSchedule == "" {
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
}
if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
cfg.DataRetention.ErrorLogRetentionDays = 30
}
if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
cfg.DataRetention.MinuteMetricsRetentionDays = 30
}
if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
cfg.DataRetention.HourlyMetricsRetentionDays = 30
}
}
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
return errors.New("error_log_retention_days must be between 1 and 365")
}
if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
return errors.New("minute_metrics_retention_days must be between 1 and 365")
}
if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
return errors.New("hourly_metrics_retention_days must be between 1 and 365")
}
return nil
}
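// Note the division of labor between the helpers above and the methods below:
// UpdateOpsAdvancedSettings validates first, so an out-of-range value such as
// 0 retention days is rejected with an error, while GetOpsAdvancedSettings
// only normalizes what is already stored, silently lifting stale non-positive
// values back to the 30-day default. A minimal sketch:
//
//	bad := defaultOpsAdvancedSettings()
//	bad.DataRetention.ErrorLogRetentionDays = 0
//	err := validateOpsAdvancedSettings(bad) // non-nil: rejected on write
//	normalizeOpsAdvancedSettings(bad)       // repaired to 30 on read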
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
defaultCfg := defaultOpsAdvancedSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAdvancedSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
normalizeOpsAdvancedSettings(cfg)
return cfg, nil
}
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if err := validateOpsAdvancedSettings(cfg); err != nil {
return nil, err
}
normalizeOpsAdvancedSettings(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
return nil, err
}
updated := &OpsAdvancedSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
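// Sketch: round-tripping the defaults through JSON shows the exact blob shape
// persisted under SettingKeyOpsAdvancedSettings (field names follow the
// struct tags in the models below):
//
//	b, _ := json.Marshal(defaultOpsAdvancedSettings())
//	// {"data_retention":{"cleanup_enabled":false,"cleanup_schedule":"0 2 * * *",
//	//  "error_log_retention_days":30,"minute_metrics_retention_days":30,
//	//  "hourly_metrics_retention_days":30},"aggregation":{"aggregation_enabled":false}}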
package service
// Ops settings models stored in DB `settings` table (JSON blobs).
type OpsEmailNotificationConfig struct {
Alert OpsEmailAlertConfig `json:"alert"`
Report OpsEmailReportConfig `json:"report"`
}
type OpsEmailAlertConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
MinSeverity string `json:"min_severity"`
RateLimitPerHour int `json:"rate_limit_per_hour"`
BatchingWindowSeconds int `json:"batching_window_seconds"`
IncludeResolvedAlerts bool `json:"include_resolved_alerts"`
}
type OpsEmailReportConfig struct {
Enabled bool `json:"enabled"`
Recipients []string `json:"recipients"`
DailySummaryEnabled bool `json:"daily_summary_enabled"`
DailySummarySchedule string `json:"daily_summary_schedule"`
WeeklySummaryEnabled bool `json:"weekly_summary_enabled"`
WeeklySummarySchedule string `json:"weekly_summary_schedule"`
ErrorDigestEnabled bool `json:"error_digest_enabled"`
ErrorDigestSchedule string `json:"error_digest_schedule"`
ErrorDigestMinCount int `json:"error_digest_min_count"`
AccountHealthEnabled bool `json:"account_health_enabled"`
AccountHealthSchedule string `json:"account_health_schedule"`
AccountHealthErrorRateThreshold float64 `json:"account_health_error_rate_threshold"`
}
// OpsEmailNotificationConfigUpdateRequest allows section-level partial
// updates: a nil section leaves the stored settings untouched, while a
// non-nil section replaces every field in that section (except a nil
// Recipients slice, which preserves the stored recipients). The frontend can
// still send the full config shape.
type OpsEmailNotificationConfigUpdateRequest struct {
Alert *OpsEmailAlertConfig `json:"alert"`
Report *OpsEmailReportConfig `json:"report"`
}
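// Hedged example of a section-level partial update: only Report is set, so
// the merge logic leaves the stored alert config alone. Note that within a
// present section every field is replaced, so clients should send the
// complete section rather than only the fields they changed. Values
// illustrative.
//
//	req := &OpsEmailNotificationConfigUpdateRequest{
//		Report: &OpsEmailReportConfig{
//			Enabled:              true,
//			Recipients:           []string{"ops@example.com"},
//			DailySummaryEnabled:  true,
//			DailySummarySchedule: "0 8 * * *",
//		},
//	}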
type OpsDistributedLockSettings struct {
Enabled bool `json:"enabled"`
Key string `json:"key"`
TTLSeconds int `json:"ttl_seconds"`
}
type OpsAlertSilenceEntry struct {
RuleID *int64 `json:"rule_id,omitempty"`
Severities []string `json:"severities,omitempty"`
UntilRFC3339 string `json:"until_rfc3339"`
Reason string `json:"reason"`
}
type OpsAlertSilencingSettings struct {
Enabled bool `json:"enabled"`
GlobalUntilRFC3339 string `json:"global_until_rfc3339"`
GlobalReason string `json:"global_reason"`
Entries []OpsAlertSilenceEntry `json:"entries,omitempty"`
}
type OpsAlertRuntimeSettings struct {
EvaluationIntervalSeconds int `json:"evaluation_interval_seconds"`
DistributedLock OpsDistributedLockSettings `json:"distributed_lock"`
Silencing OpsAlertSilencingSettings `json:"silencing"`
}
// OpsAdvancedSettings stores advanced ops configuration (data retention, aggregation).
type OpsAdvancedSettings struct {
DataRetention OpsDataRetentionSettings `json:"data_retention"`
Aggregation OpsAggregationSettings `json:"aggregation"`
}
type OpsDataRetentionSettings struct {
CleanupEnabled bool `json:"cleanup_enabled"`
CleanupSchedule string `json:"cleanup_schedule"`
ErrorLogRetentionDays int `json:"error_log_retention_days"`
MinuteMetricsRetentionDays int `json:"minute_metrics_retention_days"`
HourlyMetricsRetentionDays int `json:"hourly_metrics_retention_days"`
}
type OpsAggregationSettings struct {
AggregationEnabled bool `json:"aggregation_enabled"`
}
package service
import "time"
type OpsThroughputTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
QPS float64 `json:"qps"`
TPS float64 `json:"tps"`
}
type OpsThroughputPlatformBreakdownItem struct {
Platform string `json:"platform"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
type OpsThroughputGroupBreakdownItem struct {
GroupID int64 `json:"group_id"`
GroupName string `json:"group_name"`
RequestCount int64 `json:"request_count"`
TokenConsumed int64 `json:"token_consumed"`
}
type OpsThroughputTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsThroughputTrendPoint `json:"points"`
// Optional drilldown helpers:
// - When no platform/group is selected: returns totals by platform.
// - When platform is selected but group is not: returns top groups in that platform.
ByPlatform []*OpsThroughputPlatformBreakdownItem `json:"by_platform,omitempty"`
TopGroups []*OpsThroughputGroupBreakdownItem `json:"top_groups,omitempty"`
}
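// Assumed relationship between the count and rate fields (inferred from the
// field names; the repository layer computes the actual values): for a bucket
// of width bucketSeconds,
//
//	qps := float64(p.RequestCount) / float64(bucketSeconds)
//	tps := float64(p.TokenConsumed) / float64(bucketSeconds)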
type OpsErrorTrendPoint struct {
BucketStart time.Time `json:"bucket_start"`
ErrorCountTotal int64 `json:"error_count_total"`
BusinessLimitedCount int64 `json:"business_limited_count"`
ErrorCountSLA int64 `json:"error_count_sla"`
UpstreamErrorCountExcl429529 int64 `json:"upstream_error_count_excl_429_529"`
Upstream429Count int64 `json:"upstream_429_count"`
Upstream529Count int64 `json:"upstream_529_count"`
}
type OpsErrorTrendResponse struct {
Bucket string `json:"bucket"`
Points []*OpsErrorTrendPoint `json:"points"`
}
type OpsErrorDistributionItem struct {
StatusCode int `json:"status_code"`
Total int64 `json:"total"`
SLA int64 `json:"sla"`
BusinessLimited int64 `json:"business_limited"`
}
type OpsErrorDistributionResponse struct {
Total int64 `json:"total"`
Items []*OpsErrorDistributionItem `json:"items"`
}
package service
import (
"context"
infraerrors "github.com/Wei-Shaw/sub2api/internal/pkg/errors"
)
func (s *OpsService) GetThroughputTrend(ctx context.Context, filter *OpsDashboardFilter, bucketSeconds int) (*OpsThroughputTrendResponse, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err
}
if s.opsRepo == nil {
return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
}
if filter == nil {
return nil, infraerrors.BadRequest("OPS_FILTER_REQUIRED", "filter is required")
}
if filter.StartTime.IsZero() || filter.EndTime.IsZero() {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_REQUIRED", "start_time/end_time are required")
}
if filter.StartTime.After(filter.EndTime) {
return nil, infraerrors.BadRequest("OPS_TIME_RANGE_INVALID", "start_time must be <= end_time")
}
return s.opsRepo.GetThroughputTrend(ctx, filter, bucketSeconds)
}
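// Illustrative call site (an assumption; the `svc` wiring is not shown):
// fetch a 24-hour throughput trend in 5-minute buckets, optionally scoped to
// a single platform.
//
//	end := time.Now().UTC()
//	filter := &OpsDashboardFilter{
//		StartTime: end.Add(-24 * time.Hour),
//		EndTime:   end,
//		Platform:  "claude", // illustrative; empty means all platforms
//	}
//	trend, err := svc.GetThroughputTrend(ctx, filter, 300) // 300s buckets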