Merge pull request #285 from IanShaw027/fix/ops-bug

feat(ops): 增强错误日志管理、告警静默和前端 UI 优化

Merge pull request #285 from IanShaw027/fix/ops-bug
feat(ops): 增强错误日志管理、告警静默和前端 UI 优化
27214f86 · Wesley Liddick · GitHub · 28de614d · 5354ba36 · 27214f86
Unverified Commit 27214f86 authored Jan 15, 2026 by Wesley Liddick Committed by GitHub Jan 15, 2026
--- a/backend/internal/service/ops_retry.go
+++ b/backend/internal/service/ops_retry.go
@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool {
 	return w.totalWritten > int64(w.limit)
 }
+const (
+	OpsRetryModeUpstreamEvent = "upstream_event"
+)
 func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
 	if err := s.RequireMonitoringEnabled(ctx); err != nil {
 		return nil, err
@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
 		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
 	}
+	errorLog, err := s.GetErrorLogByID(ctx, errorID)
+	if err != nil {
+		return nil, err
+	}
+	if errorLog == nil {
+		return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
+	}
+	if strings.TrimSpace(errorLog.RequestBody) == "" {
+		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
+	}
+	var pinned *int64
+	if mode == OpsRetryModeUpstream {
+		if pinnedAccountID != nil && *pinnedAccountID > 0 {
+			pinned = pinnedAccountID
+		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
+			pinned = errorLog.AccountID
+		} else {
+			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
+		}
+	}
+	return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
+}
+// RetryUpstreamEvent retries a specific upstream attempt captured inside ops_error_logs.upstream_errors.
+// idx is 0-based. The retry is always pinned to the account_id recorded on that upstream event.
+func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return nil, err
+	}
+	if s.opsRepo == nil {
+		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if idx < 0 {
+		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
+	}
+	errorLog, err := s.GetErrorLogByID(ctx, errorID)
+	if err != nil {
+		return nil, err
+	}
+	if errorLog == nil {
+		return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
+	}
+	events, err := ParseOpsUpstreamErrors(errorLog.UpstreamErrors)
+	if err != nil {
+		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
+	}
+	if idx >= len(events) {
+		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
+	}
+	ev := events[idx]
+	if ev == nil {
+		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
+	}
+	if ev.AccountID <= 0 {
+		return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
+	}
+	upstreamBody := strings.TrimSpace(ev.UpstreamRequestBody)
+	if upstreamBody == "" {
+		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
+	}
+	override := *errorLog
+	override.RequestBody = upstreamBody
+	pinned := ev.AccountID
+	// Persist as upstream_event, execute as upstream pinned retry.
+	return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &pinned, &override)
+}
+func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
 	latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
 	if err != nil && !errors.Is(err, sql.ErrNoRows) {
 		return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
 		}
 	}
-	errorLog, err := s.GetErrorLogByID(ctx, errorID)
+	if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
-	if err != nil {
-		return nil, err
-	}
-	if strings.TrimSpace(errorLog.RequestBody) == "" {
 		return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
 	}
 	var pinned *int64
-	if mode == OpsRetryModeUpstream {
+	if execMode == OpsRetryModeUpstream {
 		if pinnedAccountID != nil && *pinnedAccountID > 0 {
 			pinned = pinnedAccountID
 		} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
 			pinned = errorLog.AccountID
 		} else {
-			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
+			return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
 		}
 	}
@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
 	execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
 	defer cancel()
-	execRes := s.executeRetry(execCtx, errorLog, mode, pinned)
+	execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)
 	finishedAt := time.Now()
 	result.FinishedAt = finishedAt
@@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
 		msg := result.ErrorMessage
 		updateErrMsg = &msg
 	}
+	// Keep legacy result_request_id empty; use upstream_request_id instead.
 	var resultRequestID *string
-	if strings.TrimSpace(result.UpstreamRequestID) != "" {
-		v := result.UpstreamRequestID
-		resultRequestID = &v
-	}
 	finalStatus := result.Status
 	if strings.TrimSpace(finalStatus) == "" {
 		finalStatus = opsRetryStatusFailed
 	}
+	success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded)
+	httpStatus := result.HTTPStatusCode
+	upstreamReqID := result.UpstreamRequestID
+	usedAccountID := result.UsedAccountID
+	preview := result.ResponsePreview
+	truncated := result.ResponseTruncated
 	if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
 		ID:                attemptID,
 		Status:            finalStatus,
 		FinishedAt:        finishedAt,
 		DurationMs:        result.DurationMs,
+		Success:           &success,
+		HTTPStatusCode:    &httpStatus,
+		UpstreamRequestID: &upstreamReqID,
+		UsedAccountID:     usedAccountID,
+		ResponsePreview:   &preview,
+		ResponseTruncated: &truncated,
 		ResultRequestID:   resultRequestID,
 		ErrorMessage:      updateErrMsg,
 	}); err != nil {
-		// Best-effort: retry itself already executed; do not fail the API response.
 		log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
+	} else if success {
+		if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
+			log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
+		}
 	}
 	return result, nil

--- a/backend/internal/service/ops_service.go
+++ b/backend/internal/service/ops_service.go
@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn
 				out.Detail = ""
 			}
+			out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody)
+			if out.UpstreamRequestBody != "" {
+				// Reuse the same sanitization/trimming strategy as request body storage.
+				// Keep it small so it is safe to persist in ops_error_logs JSON.
+				sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024)
+				if sanitized != "" {
+					out.UpstreamRequestBody = sanitized
+					if truncated {
+						out.Kind = strings.TrimSpace(out.Kind)
+						if out.Kind == "" {
+							out.Kind = "upstream"
+						}
+						out.Kind = out.Kind + ":request_body_truncated"
+					}
+				} else {
+					out.UpstreamRequestBody = ""
+				}
+			}
 			// Drop fully-empty events (can happen if only status code was known).
 			if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
 				continue
@@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter
 	if s.opsRepo == nil {
 		return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
 	}
-	return s.opsRepo.ListErrorLogs(ctx, filter)
+	result, err := s.opsRepo.ListErrorLogs(ctx, filter)
+	if err != nil {
+		log.Printf("[Ops] GetErrorLogs failed: %v", err)
+		return nil, err
+	}
+	return result, nil
 }
 func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
@@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo
 	return detail, nil
 }
+func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return nil, err
+	}
+	if s.opsRepo == nil {
+		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if errorID <= 0 {
+		return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
+	}
+	items, err := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit)
+	if err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return []*OpsRetryAttempt{}, nil
+		}
+		return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(err)
+	}
+	return items, nil
+}
+func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error {
+	if err := s.RequireMonitoringEnabled(ctx); err != nil {
+		return err
+	}
+	if s.opsRepo == nil {
+		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
+	}
+	if errorID <= 0 {
+		return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
+	}
+	// Verify the error log exists first: a missing row yields NotFound, any other load failure yields InternalServer.
+	if _, err := s.opsRepo.GetErrorLogByID(ctx, errorID); err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
+		}
+		return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(err)
+	}
+	return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil)
+}
 func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
 	bytesLen = len(raw)
 	if len(raw) == 0 {
@@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr
 		}
 	}
-	// Last resort: store a minimal placeholder (still valid JSON).
+	// Last resort: keep JSON shape but drop big fields.
-	placeholder := map[string]any{
+	// This keeps downstream code that expects certain top-level keys from crashing.
-		"request_body_truncated": true,
+	if root, ok := decoded.(map[string]any); ok {
+		placeholder := shallowCopyMap(root)
+		placeholder["request_body_truncated"] = true
+		// Replace potentially huge arrays/strings, but keep the keys present.
+		for _, k := range []string{"messages", "contents", "input", "prompt"} {
+			if _, exists := placeholder[k]; exists {
+				placeholder[k] = []any{}
 			}
-	if model := extractString(decoded, "model"); model != "" {
-		placeholder["model"] = model
 		}
+		for _, k := range []string{"text"} {
+			if _, exists := placeholder[k]; exists {
+				placeholder[k] = ""
+			}
+		}
 		encoded4, err4 := json.Marshal(placeholder)
+		if err4 == nil {
+			if len(encoded4) <= maxBytes {
+				return string(encoded4), true, bytesLen
+			}
+		}
+	}
+	// Final fallback: minimal valid JSON.
+	encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true})
 	if err4 != nil {
 		return "", true, bytesLen
 	}
@@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr
 	}
 	return raw, false
 }
-func extractString(v any, key string) string {
-	root, ok := v.(map[string]any)
-	if !ok {
-		return ""
-	}
-	s, _ := root[key].(string)
-	return strings.TrimSpace(s)
-}
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -369,6 +369,8 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
 			AggregationEnabled: false,
 		},
 		IgnoreCountTokensErrors:   false,
+		IgnoreContextCanceled:     true,  // Default to true - client disconnects are not errors
+		IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue
 		AutoRefreshEnabled:        false,
 		AutoRefreshIntervalSec:    30,
 	}
@@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
 func defaultOpsMetricThresholds() *OpsMetricThresholds {
 	slaMin := 99.5
-	latencyMax := 2000.0
 	ttftMax := 500.0
 	reqErrMax := 5.0
 	upstreamErrMax := 5.0
 	return &OpsMetricThresholds{
 		SLAPercentMin:               &slaMin,
-		LatencyP99MsMax:             &latencyMax,
 		TTFTp99MsMax:                &ttftMax,
 		RequestErrorRatePercentMax:  &reqErrMax,
 		UpstreamErrorRatePercentMax: &upstreamErrMax,
@@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
 	if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
 		return nil, errors.New("sla_percent_min must be between 0 and 100")
 	}
-	if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
-		return nil, errors.New("latency_p99_ms_max must be >= 0")
-	}
 	if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
 		return nil, errors.New("ttft_p99_ms_max must be >= 0")
 	}

--- a/backend/internal/service/ops_settings_models.go
+++ b/backend/internal/service/ops_settings_models.go
@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {
 type OpsMetricThresholds struct {
 	SLAPercentMin               *float64 `json:"sla_percent_min,omitempty"`                 // SLA低于此值变红
-	LatencyP99MsMax             *float64 `json:"latency_p99_ms_max,omitempty"`              // 延迟P99高于此值变红
 	TTFTp99MsMax                *float64 `json:"ttft_p99_ms_max,omitempty"`                 // TTFT P99高于此值变红
 	RequestErrorRatePercentMax  *float64 `json:"request_error_rate_percent_max,omitempty"`  // 请求错误率高于此值变红
 	UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红
@@ -82,6 +81,8 @@ type OpsAdvancedSettings struct {
 	DataRetention             OpsDataRetentionSettings `json:"data_retention"`
 	Aggregation               OpsAggregationSettings   `json:"aggregation"`
 	IgnoreCountTokensErrors   bool                     `json:"ignore_count_tokens_errors"`
+	IgnoreContextCanceled     bool                     `json:"ignore_context_canceled"`
+	IgnoreNoAvailableAccounts bool                     `json:"ignore_no_available_accounts"`
 	AutoRefreshEnabled        bool                     `json:"auto_refresh_enabled"`
 	AutoRefreshIntervalSec    int                      `json:"auto_refresh_interval_seconds"`
 }

--- a/backend/internal/service/ops_upstream_context.go
+++ b/backend/internal/service/ops_upstream_context.go
@@ -15,6 +15,11 @@ const (
 	OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
 	OpsUpstreamErrorDetailKey  = "ops_upstream_error_detail"
 	OpsUpstreamErrorsKey       = "ops_upstream_errors"
+	// Best-effort capture of the current upstream request body so ops can
+	// retry the specific upstream attempt (not just the client request).
+	// This value is sanitized+trimmed before being persisted.
+	OpsUpstreamRequestBodyKey = "ops_upstream_request_body"
 )
 func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
@@ -40,11 +45,19 @@ type OpsUpstreamErrorEvent struct {
 	// Context
 	Platform    string `json:"platform,omitempty"`
 	AccountID   int64  `json:"account_id,omitempty"`
+	AccountName string `json:"account_name,omitempty"`
 	// Outcome
 	UpstreamStatusCode int    `json:"upstream_status_code,omitempty"`
 	UpstreamRequestID  string `json:"upstream_request_id,omitempty"`
+	// Best-effort upstream request capture (sanitized+trimmed).
+	// Required for retrying a specific upstream attempt.
+	UpstreamRequestBody string `json:"upstream_request_body,omitempty"`
+	// Best-effort upstream response capture (sanitized+trimmed).
+	UpstreamResponseBody string `json:"upstream_response_body,omitempty"`
 	// Kind: http_error | request_error | retry_exhausted | failover
 	Kind string `json:"kind,omitempty"`
@@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
 	}
 	ev.Platform = strings.TrimSpace(ev.Platform)
 	ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
+	ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody)
+	ev.UpstreamResponseBody = strings.TrimSpace(ev.UpstreamResponseBody)
 	ev.Kind = strings.TrimSpace(ev.Kind)
 	ev.Message = strings.TrimSpace(ev.Message)
 	ev.Detail = strings.TrimSpace(ev.Detail)
@@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
 		ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
 	}
+	// If the caller didn't explicitly pass upstream request body but the gateway
+	// stored it on the context, attach it so ops can retry this specific attempt.
+	if ev.UpstreamRequestBody == "" {
+		if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok {
+			if s, ok := v.(string); ok {
+				ev.UpstreamRequestBody = strings.TrimSpace(s)
+			}
+		}
+	}
 	var existing []*OpsUpstreamErrorEvent
 	if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
 		if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
@@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
 	s := string(raw)
 	return &s
 }
+func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return []*OpsUpstreamErrorEvent{}, nil
+	}
+	var out []*OpsUpstreamErrorEvent
+	if err := json.Unmarshal([]byte(raw), &out); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
--- a/backend/migrations/037_ops_alert_silences.sql
+++ b/backend/migrations/037_ops_alert_silences.sql
+-- +goose Up
+-- +goose StatementBegin
+-- Ops alert silences: scoped (rule_id + platform + group_id + region)
+CREATE TABLE IF NOT EXISTS ops_alert_silences (
+    id BIGSERIAL PRIMARY KEY,
+    rule_id BIGINT NOT NULL,
+    platform VARCHAR(64) NOT NULL,
+    group_id BIGINT,
+    region VARCHAR(64),
+    until TIMESTAMPTZ NOT NULL,
+    reason TEXT,
+    created_by BIGINT,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+CREATE INDEX IF NOT EXISTS idx_ops_alert_silences_lookup
+    ON ops_alert_silences (rule_id, platform, group_id, region, until);
+-- +goose StatementEnd
+-- +goose Down
+-- +goose StatementBegin
+DROP TABLE IF EXISTS ops_alert_silences;
+-- +goose StatementEnd
--- a/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql
+++ b/backend/migrations/038_ops_errors_resolution_retry_results_and_standardize_classification.sql
+-- Add resolution tracking to ops_error_logs, persist retry results, and standardize error classification enums.
+--
+-- This migration is intentionally idempotent.
+SET LOCAL lock_timeout = '5s';
+SET LOCAL statement_timeout = '10min';
+-- ============================================
+-- 1) ops_error_logs: resolution fields
+-- ============================================
+ALTER TABLE ops_error_logs
+  ADD COLUMN IF NOT EXISTS resolved BOOLEAN NOT NULL DEFAULT false;
+ALTER TABLE ops_error_logs
+  ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;
+ALTER TABLE ops_error_logs
+  ADD COLUMN IF NOT EXISTS resolved_by_user_id BIGINT;
+ALTER TABLE ops_error_logs
+  ADD COLUMN IF NOT EXISTS resolved_retry_id BIGINT;
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_resolved_time
+  ON ops_error_logs (resolved, created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_ops_error_logs_unresolved_time
+  ON ops_error_logs (created_at DESC)
+  WHERE resolved = false;
+-- ============================================
+-- 2) ops_retry_attempts: persist execution results
+-- ============================================
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS success BOOLEAN;
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS http_status_code INT;
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS upstream_request_id VARCHAR(128);
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS used_account_id BIGINT;
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS response_preview TEXT;
+ALTER TABLE ops_retry_attempts
+  ADD COLUMN IF NOT EXISTS response_truncated BOOLEAN NOT NULL DEFAULT false;
+CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_success_time
+  ON ops_retry_attempts (success, created_at DESC);
+-- Backfill best-effort fields for existing rows.
+UPDATE ops_retry_attempts
+SET success = (LOWER(COALESCE(status, '')) = 'succeeded')
+WHERE success IS NULL;
+UPDATE ops_retry_attempts
+SET upstream_request_id = result_request_id
+WHERE upstream_request_id IS NULL AND result_request_id IS NOT NULL;
+-- ============================================
+-- 3) Standardize classification enums in ops_error_logs
+--
+-- New enums:
+--   error_phase:  request|auth|routing|upstream|network|internal
+--   error_owner:  client|provider|platform
+--   error_source: client_request|upstream_http|gateway
+-- ============================================
+-- Owner: legacy sub2api => platform.
+UPDATE ops_error_logs
+SET error_owner = 'platform'
+WHERE LOWER(COALESCE(error_owner, '')) = 'sub2api';
+-- Owner: normalize empty/null to platform (best-effort).
+UPDATE ops_error_logs
+SET error_owner = 'platform'
+WHERE COALESCE(TRIM(error_owner), '') = '';
+-- Phase: map legacy phases.
+UPDATE ops_error_logs
+SET error_phase = CASE
+  WHEN COALESCE(TRIM(error_phase), '') = '' THEN 'internal'
+  WHEN LOWER(error_phase) IN ('billing', 'concurrency', 'response') THEN 'request'
+  WHEN LOWER(error_phase) IN ('scheduling') THEN 'routing'
+  WHEN LOWER(error_phase) IN ('request', 'auth', 'routing', 'upstream', 'network', 'internal') THEN LOWER(error_phase)
+  ELSE 'internal'
+END;
+-- Source: map legacy sources.
+UPDATE ops_error_logs
+SET error_source = CASE
+  WHEN COALESCE(TRIM(error_source), '') = '' THEN 'gateway'
+  WHEN LOWER(error_source) IN ('billing', 'concurrency') THEN 'client_request'
+  WHEN LOWER(error_source) IN ('upstream_http') THEN 'upstream_http'
+  WHEN LOWER(error_source) IN ('upstream_network') THEN 'gateway'
+  WHEN LOWER(error_source) IN ('internal') THEN 'gateway'
+  WHEN LOWER(error_source) IN ('client_request', 'upstream_http', 'gateway') THEN LOWER(error_source)
+  ELSE 'gateway'
+END;
+-- Auto-resolve recovered upstream errors (client status < 400).
+UPDATE ops_error_logs
+SET
+  resolved = true,
+  resolved_at = COALESCE(resolved_at, created_at)
+WHERE resolved = false AND COALESCE(status_code, 0) > 0 AND COALESCE(status_code, 0) < 400;
--- a/frontend/src/api/admin/ops.ts
+++ b/frontend/src/api/admin/ops.ts
@@ -17,6 +17,47 @@ export interface OpsRequestOptions {
 export interface OpsRetryRequest {
  mode: OpsRetryMode
  pinned_account_id?: number
+  force?: boolean
+}
+export interface OpsRetryAttempt {
+  id: number
+  created_at: string
+  requested_by_user_id: number
+  source_error_id: number
+  mode: string
+  pinned_account_id?: number | null
+  pinned_account_name?: string
+  status: string
+  started_at?: string | null
+  finished_at?: string | null
+  duration_ms?: number | null
+  success?: boolean | null
+  http_status_code?: number | null
+  upstream_request_id?: string | null
+  used_account_id?: number | null
+  used_account_name?: string
+  response_preview?: string | null
+  response_truncated?: boolean | null
+  result_request_id?: string | null
+  result_error_id?: number | null
+  error_message?: string | null
+}
+export type OpsUpstreamErrorEvent = {
+  at_unix_ms?: number
+  platform?: string
+  account_id?: number
+  account_name?: string
+  upstream_status_code?: number
+  upstream_request_id?: string
+  upstream_request_body?: string
+  kind?: string
+  message?: string
+  detail?: string
 }
 export interface OpsRetryResult {
@@ -626,8 +667,6 @@ export type MetricType =
  | 'success_rate'
  | 'error_rate'
  | 'upstream_error_rate'
-  | 'p95_latency_ms'
-  | 'p99_latency_ms'
  | 'cpu_usage_percent'
  | 'memory_usage_percent'
  | 'concurrency_queue_depth'
@@ -663,7 +702,7 @@ export interface AlertEvent {
  id: number
  rule_id: number
  severity: OpsSeverity | string
-  status: 'firing' | 'resolved' | string
+  status: 'firing' | 'resolved' | 'manual_resolved' | string
  title?: string
  description?: string
  metric_value?: number
@@ -702,7 +741,6 @@ export interface EmailNotificationConfig {
 export interface OpsMetricThresholds {
  sla_percent_min?: number | null                 // SLA低于此值变红
-  latency_p99_ms_max?: number | null             // 延迟P99高于此值变红
  ttft_p99_ms_max?: number | null                 // TTFT P99高于此值变红
  request_error_rate_percent_max?: number | null  // 请求错误率高于此值变红
  upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红
@@ -735,6 +773,8 @@ export interface OpsAdvancedSettings {
  data_retention: OpsDataRetentionSettings
  aggregation: OpsAggregationSettings
  ignore_count_tokens_errors: boolean
+  ignore_context_canceled: boolean
+  ignore_no_available_accounts: boolean
  auto_refresh_enabled: boolean
  auto_refresh_interval_seconds: number
 }
@@ -754,21 +794,37 @@ export interface OpsAggregationSettings {
 export interface OpsErrorLog {
  id: number
  created_at: string
+  // Standardized classification
  phase: OpsPhase
  type: string
+  error_owner: 'client' | 'provider' | 'platform' | string
+  error_source: 'client_request' | 'upstream_http' | 'gateway' | string
  severity: OpsSeverity
  status_code: number
  platform: string
  model: string
-  latency_ms?: number | null
+  is_retryable: boolean
+  retry_count: number
+  resolved: boolean
+  resolved_at?: string | null
+  resolved_by_user_id?: number | null
+  resolved_retry_id?: number | null
  client_request_id: string
  request_id: string
  message: string
  user_id?: number | null
+  user_email: string
  api_key_id?: number | null
  account_id?: number | null
+  account_name: string
  group_id?: number | null
+  group_name: string
  client_ip?: string | null
  request_path?: string
@@ -890,7 +946,9 @@ export async function getErrorDistribution(
  return data
 }
-export async function listErrorLogs(params: {
+export type OpsErrorListView = 'errors' | 'excluded' | 'all'
+export type OpsErrorListQueryParams = {
  page?: number
  page_size?: number
  time_range?: string
@@ -899,10 +957,20 @@ export async function listErrorLogs(params: {
  platform?: string
  group_id?: number | null
  account_id?: number | null
  phase?: string
+  error_owner?: string
+  error_source?: string
+  resolved?: string
+  view?: OpsErrorListView
  q?: string
  status_codes?: string
-}): Promise<OpsErrorLogsResponse> {
+  status_codes_other?: string
+}
+// Legacy unified endpoints
+export async function listErrorLogs(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
  const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
  return data
 }
@@ -917,6 +985,70 @@ export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promi
  return data
 }
+export async function listRetryAttempts(errorId: number, limit = 50): Promise<OpsRetryAttempt[]> {
+  const { data } = await apiClient.get<OpsRetryAttempt[]>(`/admin/ops/errors/${errorId}/retries`, { params: { limit } })
+  return data
+}
+export async function updateErrorResolved(errorId: number, resolved: boolean): Promise<void> {
+  await apiClient.put(`/admin/ops/errors/${errorId}/resolve`, { resolved })
+}
+// New split endpoints
+export async function listRequestErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
+  const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/request-errors', { params })
+  return data
+}
+export async function listUpstreamErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
+  const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/upstream-errors', { params })
+  return data
+}
+export async function getRequestErrorDetail(id: number): Promise<OpsErrorDetail> {
+  const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/request-errors/${id}`)
+  return data
+}
+export async function getUpstreamErrorDetail(id: number): Promise<OpsErrorDetail> {
+  const { data } = await apiClient.get<OpsErrorDetail>(`/admin/ops/upstream-errors/${id}`)
+  return data
+}
+export async function retryRequestErrorClient(id: number): Promise<OpsRetryResult> {
+  const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/request-errors/${id}/retry-client`, {})
+  return data
+}
+export async function retryRequestErrorUpstreamEvent(id: number, idx: number): Promise<OpsRetryResult> {
+  const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/request-errors/${id}/upstream-errors/${idx}/retry`, {})
+  return data
+}
+export async function retryUpstreamError(id: number): Promise<OpsRetryResult> {
+  const { data } = await apiClient.post<OpsRetryResult>(`/admin/ops/upstream-errors/${id}/retry`, {})
+  return data
+}
+export async function updateRequestErrorResolved(errorId: number, resolved: boolean): Promise<void> {
+  await apiClient.put(`/admin/ops/request-errors/${errorId}/resolve`, { resolved })
+}
+export async function updateUpstreamErrorResolved(errorId: number, resolved: boolean): Promise<void> {
+  await apiClient.put(`/admin/ops/upstream-errors/${errorId}/resolve`, { resolved })
+}
+export async function listRequestErrorUpstreamErrors(
+  id: number,
+  params: OpsErrorListQueryParams = {},
+  options: { include_detail?: boolean } = {}
+): Promise<PaginatedResponse<OpsErrorDetail>> {
+  const query: Record<string, any> = { ...params }
+  if (options.include_detail) query.include_detail = '1'
+  const { data } = await apiClient.get<PaginatedResponse<OpsErrorDetail>>(`/admin/ops/request-errors/${id}/upstream-errors`, { params: query })
+  return data
+}
 export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
  const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
  return data
@@ -942,11 +1074,45 @@ export async function deleteAlertRule(id: number): Promise<void> {
  await apiClient.delete(`/admin/ops/alert-rules/${id}`)
 }
-export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> {
+export interface AlertEventsQuery {
-  const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } })
+  limit?: number
+  status?: string
+  severity?: string
+  email_sent?: boolean
+  time_range?: string
+  start_time?: string
+  end_time?: string
+  before_fired_at?: string
+  before_id?: number
+  platform?: string
+  group_id?: number
+}
+export async function listAlertEvents(params: AlertEventsQuery = {}): Promise<AlertEvent[]> {
+  const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params })
+  return data
+}
+export async function getAlertEvent(id: number): Promise<AlertEvent> {
+  const { data } = await apiClient.get<AlertEvent>(`/admin/ops/alert-events/${id}`)
  return data
 }
+export async function updateAlertEventStatus(id: number, status: 'resolved' | 'manual_resolved'): Promise<void> {
+  await apiClient.put(`/admin/ops/alert-events/${id}/status`, { status })
+}
+export async function createAlertSilence(payload: {
+  rule_id: number
+  platform: string
+  group_id?: number | null
+  region?: string | null
+  until: string
+  reason?: string
+}): Promise<void> {
+  await apiClient.post('/admin/ops/alert-silences', payload)
+}
 // Email notification config
 export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
  const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
@@ -1001,15 +1167,35 @@ export const opsAPI = {
  getAccountAvailabilityStats,
  getRealtimeTrafficSummary,
  subscribeQPS,
+  // Legacy unified endpoints
  listErrorLogs,
  getErrorLogDetail,
  retryErrorRequest,
+  listRetryAttempts,
+  updateErrorResolved,
+  // New split endpoints
+  listRequestErrors,
+  listUpstreamErrors,
+  getRequestErrorDetail,
+  getUpstreamErrorDetail,
+  retryRequestErrorClient,
+  retryRequestErrorUpstreamEvent,
+  retryUpstreamError,
+  updateRequestErrorResolved,
+  updateUpstreamErrorResolved,
+  listRequestErrorUpstreamErrors,
  listRequestDetails,
  listAlertRules,
  createAlertRule,
  updateAlertRule,
  deleteAlertRule,
  listAlertEvents,
+  getAlertEvent,
+  updateAlertEventStatus,
+  createAlertSilence,
  getEmailNotificationConfig,
  updateEmailNotificationConfig,
  getAlertRuntimeSettings,

--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -129,6 +129,8 @@ export default {
    all: 'All',
    none: 'None',
    noData: 'No data',
+    expand: 'Expand',
+    collapse: 'Collapse',
    success: 'Success',
    error: 'Error',
    critical: 'Critical',
@@ -155,7 +157,8 @@ export default {
    noGroupsAvailable: 'No groups available',
    unknownError: 'Unknown error occurred',
    saving: 'Saving...',
-        selectedCount: '({count} selected)',    refresh: 'Refresh',
+    selectedCount: '({count} selected)',
+    refresh: 'Refresh',
    settings: 'Settings',
    notAvailable: 'N/A',
    now: 'Now',
@@ -1882,10 +1885,8 @@ export default {
      noSystemMetrics: 'No system metrics collected yet.',
      collectedAt: 'Collected at:',
      window: 'window',
-      cpu: 'CPU',
      memory: 'Memory',
      db: 'DB',
-      redis: 'Redis',
      goroutines: 'Goroutines',
      jobs: 'Jobs',
      jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
@@ -1911,7 +1912,7 @@ export default {
      totalRequests: 'Total Requests',
      avgQps: 'Avg QPS',
      avgTps: 'Avg TPS',
-      avgLatency: 'Avg Latency',
+      avgLatency: 'Avg Request Duration',
      avgTtft: 'Avg TTFT',
      exceptions: 'Exceptions',
      requestErrors: 'Request Errors',
@@ -1923,7 +1924,7 @@ export default {
      errors: 'Errors',
      errorRate: 'error_rate:',
      upstreamRate: 'upstream_rate:',
-      latencyDuration: 'Latency (duration_ms)',
+      latencyDuration: 'Request Duration (ms)',
      ttftLabel: 'TTFT (first_token_ms)',
      p50: 'p50:',
      p90: 'p90:',
@@ -1931,7 +1932,6 @@ export default {
      p99: 'p99:',
      avg: 'avg:',
      max: 'max:',
-      qps: 'QPS',
      requests: 'Requests',
      requestsTitle: 'Requests',
      upstream: 'Upstream',
@@ -1943,7 +1943,7 @@ export default {
      failedToLoadData: 'Failed to load ops data.',
      failedToLoadOverview: 'Failed to load overview',
      failedToLoadThroughputTrend: 'Failed to load throughput trend',
-      failedToLoadLatencyHistogram: 'Failed to load latency histogram',
+      failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
      failedToLoadErrorTrend: 'Failed to load error trend',
      failedToLoadErrorDistribution: 'Failed to load error distribution',
      failedToLoadErrorDetail: 'Failed to load error detail',
@@ -1951,7 +1951,7 @@ export default {
      tpsK: 'TPS (K)',
      top: 'Top:',
      throughputTrend: 'Throughput Trend',
-      latencyHistogram: 'Latency Histogram',
+      latencyHistogram: 'Request Duration Histogram',
      errorTrend: 'Error Trend',
      errorDistribution: 'Error Distribution',
      // Health Score & Diagnosis
@@ -1966,7 +1966,9 @@ export default {
        '30m': 'Last 30 minutes',
        '1h': 'Last 1 hour',
        '6h': 'Last 6 hours',
-        '24h': 'Last 24 hours'
+        '24h': 'Last 24 hours',
+        '7d': 'Last 7 days',
+        '30d': 'Last 30 days'
      },
      fullscreen: {
        enter: 'Enter Fullscreen'
@@ -1995,14 +1997,7 @@ export default {
        memoryHigh: 'Memory usage elevated ({usage}%)',
        memoryHighImpact: 'Memory pressure is high, needs attention',
        memoryHighAction: 'Monitor memory trends, check for memory leaks',
-        // Latency diagnostics
+        ttftHigh: 'Time to first token elevated ({ttft}ms)',
-        latencyCritical: 'Response latency critically high ({latency}ms)',
-        latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
-        latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
-        latencyHigh: 'Response latency elevated ({latency}ms)',
-        latencyHighImpact: 'User experience degraded, needs optimization',
-        latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
-        ttftHigh: 'Time to first byte elevated ({ttft}ms)',
        ttftHighImpact: 'User perceived latency increased',
        ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
        // Error rate diagnostics
@@ -2038,27 +2033,106 @@ export default {
      // Error Log
      errorLog: {
        timeId: 'Time / ID',
+        commonErrors: {
+          contextDeadlineExceeded: 'context deadline exceeded',
+          connectionRefused: 'connection refused',
+          rateLimit: 'rate limit'
+        },
+        time: 'Time',
+        type: 'Type',
        context: 'Context',
+        platform: 'Platform',
+        model: 'Model',
+        group: 'Group',
+        user: 'User',
+        userId: 'User ID',
+        account: 'Account',
+        accountId: 'Account ID',
        status: 'Status',
        message: 'Message',
-        latency: 'Latency',
+        latency: 'Request Duration',
        action: 'Action',
        noErrors: 'No errors in this window.',
        grp: 'GRP:',
        acc: 'ACC:',
        details: 'Details',
-        phase: 'Phase'
+        phase: 'Phase',
+        id: 'ID:',
+        typeUpstream: 'Upstream',
+        typeRequest: 'Request',
+        typeAuth: 'Auth',
+        typeRouting: 'Routing',
+        typeInternal: 'Internal'
      },
      // Error Details Modal
      errorDetails: {
        upstreamErrors: 'Upstream Errors',
        requestErrors: 'Request Errors',
+        unresolved: 'Unresolved',
+        resolved: 'Resolved',
+        viewErrors: 'Errors',
+        viewExcluded: 'Excluded',
+        statusCodeOther: 'Other',
+        owner: {
+          provider: 'Provider',
+          client: 'Client',
+          platform: 'Platform'
+        },
+        phase: {
+          request: 'Request',
+          auth: 'Auth',
+          routing: 'Routing',
+          upstream: 'Upstream',
+          network: 'Network',
+          internal: 'Internal'
+        },
        total: 'Total:',
        searchPlaceholder: 'Search request_id / client_request_id / message',
-        accountIdPlaceholder: 'account_id'
      },
      // Error Detail Modal
      errorDetail: {
+        title: 'Error Detail',
+        titleWithId: 'Error #{id}',
+        noErrorSelected: 'No error selected.',
+        resolution: 'Resolved:',
+        pinnedToOriginalAccountId: 'Pinned to original account_id',
+        missingUpstreamRequestBody: 'Missing upstream request body',
+        failedToLoadRetryHistory: 'Failed to load retry history',
+        failedToUpdateResolvedStatus: 'Failed to update resolved status',
+        unsupportedRetryMode: 'Unsupported retry mode',
+        classificationKeys: {
+          phase: 'Phase',
+          owner: 'Owner',
+          source: 'Source',
+          retryable: 'Retryable',
+          resolvedAt: 'Resolved At',
+          resolvedBy: 'Resolved By',
+          resolvedRetryId: 'Resolved Retry',
+          retryCount: 'Retry Count'
+        },
+        source: {
+          upstream_http: 'Upstream HTTP'
+        },
+        upstreamKeys: {
+          status: 'Status',
+          message: 'Message',
+          detail: 'Detail',
+          upstreamErrors: 'Upstream Errors'
+        },
+        upstreamEvent: {
+          account: 'Account',
+          status: 'Status',
+          requestId: 'Request ID'
+        },
+        responsePreview: {
+          expand: 'Response (click to expand)',
+          collapse: 'Response (click to collapse)'
+        },
+        retryMeta: {
+          used: 'Used',
+          success: 'Success',
+          pinned: 'Pinned'
+        },
        loading: 'Loading…',
        requestId: 'Request ID',
        time: 'Time',
@@ -2068,8 +2142,10 @@ export default {
        basicInfo: 'Basic Info',
        platform: 'Platform',
        model: 'Model',
-        latency: 'Latency',
+        group: 'Group',
-        ttft: 'TTFT',
+        user: 'User',
+        account: 'Account',
+        latency: 'Request Duration',
        businessLimited: 'Business Limited',
        requestPath: 'Request Path',
        timings: 'Timings',
@@ -2077,6 +2153,8 @@ export default {
        routing: 'Routing',
        upstream: 'Upstream',
        response: 'Response',
+        classification: 'Classification',
+        notRetryable: 'Not recommended to retry',
        retry: 'Retry',
        retryClient: 'Retry (Client)',
        retryUpstream: 'Retry (Upstream pinned)',
@@ -2088,7 +2166,6 @@ export default {
        confirmRetry: 'Confirm Retry',
        retrySuccess: 'Retry succeeded',
        retryFailed: 'Retry failed',
-        na: 'N/A',
        retryHint: 'Retry will resend the request with the same parameters',
        retryClientHint: 'Use client retry (no account pinning)',
        retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
@@ -2096,8 +2173,33 @@ export default {
        retryNote1: 'Retry will use the same request body and parameters',
        retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
        retryNote3: 'Client retry will reselect an account',
+        retryNote4: 'You can force retry for non-retryable errors, but it is not recommended',
        confirmRetryMessage: 'Confirm retry this request?',
-        confirmRetryHint: 'Will resend with the same request parameters'
+        confirmRetryHint: 'Will resend with the same request parameters',
+        forceRetry: 'I understand and want to force retry',
+        forceRetryHint: 'This error usually cannot be fixed by retry; check to proceed',
+        forceRetryNeedAck: 'Please check to force retry',
+        markResolved: 'Mark resolved',
+        markUnresolved: 'Mark unresolved',
+        viewRetries: 'Retry history',
+        retryHistory: 'Retry History',
+        tabOverview: 'Overview',
+        tabRetries: 'Retries',
+        tabRequest: 'Request',
+        tabResponse: 'Response',
+        responseBody: 'Response',
+        compareA: 'Compare A',
+        compareB: 'Compare B',
+        retrySummary: 'Retry Summary',
+        responseHintSucceeded: 'Showing succeeded retry response_preview (#{id})',
+        responseHintFallback: 'No succeeded retry found; showing stored error_body',
+        suggestion: 'Suggestion',
+        suggestUpstreamResolved: '✓ Upstream error resolved by retry; no action needed',
+        suggestUpstream: 'Upstream instability: check account status, consider switching accounts, or retry',
+        suggestRequest: 'Client request error: ask customer to fix request parameters',
+        suggestAuth: 'Auth failed: verify API key/credentials',
+        suggestPlatform: 'Platform error: prioritize investigation and fix',
+        suggestGeneric: 'See details for more context'
      },
      requestDetails: {
        title: 'Request Details',
@@ -2133,13 +2235,46 @@ export default {
        loading: 'Loading...',
        empty: 'No alert events',
        loadFailed: 'Failed to load alert events',
+        status: {
+          firing: 'FIRING',
+          resolved: 'RESOLVED',
+          manualResolved: 'MANUAL RESOLVED'
+        },
+        detail: {
+          title: 'Alert Detail',
+          loading: 'Loading detail...',
+          empty: 'No detail',
+          loadFailed: 'Failed to load alert detail',
+          manualResolve: 'Mark as Resolved',
+          manualResolvedSuccess: 'Marked as manually resolved',
+          manualResolvedFailed: 'Failed to mark as manually resolved',
+          silence: 'Ignore Alert',
+          silenceSuccess: 'Alert silenced',
+          silenceFailed: 'Failed to silence alert',
+          viewRule: 'View Rule',
+          viewLogs: 'View Logs',
+          firedAt: 'Fired At',
+          resolvedAt: 'Resolved At',
+          ruleId: 'Rule ID',
+          dimensions: 'Dimensions',
+          historyTitle: 'History',
+          historyHint: 'Recent events with same rule + dimensions',
+          historyLoading: 'Loading history...',
+          historyEmpty: 'No history'
+        },
        table: {
          time: 'Time',
          status: 'Status',
          severity: 'Severity',
+          platform: 'Platform',
+          ruleId: 'Rule ID',
          title: 'Title',
+          duration: 'Duration',
          metric: 'Metric / Threshold',
-          email: 'Email Sent'
+          dimensions: 'Dimensions',
+          email: 'Email Sent',
+          emailSent: 'Sent',
+          emailIgnored: 'Ignored'
        }
      },
      alertRules: {
@@ -2253,7 +2388,6 @@ export default {
          title: 'Alert Silencing (Maintenance Mode)',
          enabled: 'Enable silencing',
          globalUntil: 'Silence until (RFC3339)',
-          untilPlaceholder: '2026-01-05T00:00:00Z',
          untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
          reason: 'Reason',
          reasonPlaceholder: 'e.g., planned maintenance',
@@ -2293,7 +2427,11 @@ export default {
          lockKeyRequired: 'Distributed lock key is required when lock is enabled',
          lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
          lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
-          lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds'
+          lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds',
+          slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
+          ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
+          requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
+          upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
        }
      },
      email: {
@@ -2358,8 +2496,6 @@ export default {
        metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red',
        slaMinPercent: 'SLA Minimum Percentage',
        slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
-        latencyP99MaxMs: 'Latency P99 Maximum (ms)',
-        latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
        ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
        ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
        requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
@@ -2378,9 +2514,28 @@ export default {
        aggregation: 'Pre-aggregation Tasks',
        enableAggregation: 'Enable Pre-aggregation',
        aggregationHint: 'Pre-aggregation improves query performance for long time windows',
+        errorFiltering: 'Error Filtering',
+        ignoreCountTokensErrors: 'Ignore count_tokens errors',
+        ignoreCountTokensErrorsHint: 'When enabled, errors from count_tokens requests will not be written to the error log.',
+        ignoreContextCanceled: 'Ignore client disconnect errors',
+        ignoreContextCanceledHint: 'When enabled, client disconnect (context canceled) errors will not be written to the error log.',
+        ignoreNoAvailableAccounts: 'Ignore no available accounts errors',
+        ignoreNoAvailableAccountsHint: 'When enabled, "No available accounts" errors will not be written to the error log (not recommended; usually a config issue).',
+        autoRefresh: 'Auto Refresh',
+        enableAutoRefresh: 'Enable auto refresh',
+        enableAutoRefreshHint: 'Automatically refresh dashboard data at a fixed interval.',
+        refreshInterval: 'Refresh Interval',
+        refreshInterval15s: '15 seconds',
+        refreshInterval30s: '30 seconds',
+        refreshInterval60s: '60 seconds',
+        autoRefreshCountdown: 'Auto refresh: {seconds}s',
        validation: {
          title: 'Please fix the following issues',
-          retentionDaysRange: 'Retention days must be between 1-365 days'
+          retentionDaysRange: 'Retention days must be between 1-365 days',
+          slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
+          ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
+          requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
+          upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
        }
      },
      concurrency: {
@@ -2418,7 +2573,7 @@ export default {
      tooltips: {
        totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
        throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
-        latencyHistogram: 'Latency distribution (duration_ms) for successful requests.',
+        latencyHistogram: 'Request duration distribution (ms) for successful requests.',
        errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
        errorDistribution: 'Error distribution by status code.',
        goroutines:
@@ -2433,7 +2588,7 @@ export default {
        sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
        errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
        upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
-        latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.',
+        latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
        ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
        health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
      },

--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -126,6 +126,8 @@ export default {
    all: '全部',
    none: '无',
    noData: '暂无数据',
+    expand: '展开',
+    collapse: '收起',
    success: '成功',
    error: '错误',
    critical: '严重',
@@ -2031,10 +2033,8 @@ export default {
      noSystemMetrics: '尚未收集系统指标。',
      collectedAt: '采集时间：',
      window: '窗口',
-      cpu: 'CPU',
      memory: '内存',
      db: '数据库',
-      redis: 'Redis',
      goroutines: '协程',
      jobs: '后台任务',
      jobsHelp: '点击“明细”查看任务心跳与报错信息',
@@ -2060,7 +2060,7 @@ export default {
      totalRequests: '总请求',
      avgQps: '平均 QPS',
      avgTps: '平均 TPS',
-      avgLatency: '平均延迟',
+      avgLatency: '平均请求时长',
      avgTtft: '平均首字延迟',
      exceptions: '异常数',
      requestErrors: '请求错误',
@@ -2072,7 +2072,7 @@ export default {
      errors: '错误',
      errorRate: '错误率：',
      upstreamRate: '上游错误率：',
-      latencyDuration: '延迟（毫秒）',
+      latencyDuration: '请求时长（毫秒）',
      ttftLabel: '首字延迟（毫秒）',
      p50: 'p50',
      p90: 'p90',
@@ -2080,7 +2080,6 @@ export default {
      p99: 'p99',
      avg: 'avg',
      max: 'max',
-      qps: 'QPS',
      requests: '请求数',
      requestsTitle: '请求',
      upstream: '上游',
@@ -2092,7 +2091,7 @@ export default {
      failedToLoadData: '加载运维数据失败',
      failedToLoadOverview: '加载概览数据失败',
      failedToLoadThroughputTrend: '加载吞吐趋势失败',
-      failedToLoadLatencyHistogram: '加载延迟分布失败',
+      failedToLoadLatencyHistogram: '加载请求时长分布失败',
      failedToLoadErrorTrend: '加载错误趋势失败',
      failedToLoadErrorDistribution: '加载错误分布失败',
      failedToLoadErrorDetail: '加载错误详情失败',
@@ -2100,7 +2099,7 @@ export default {
      tpsK: 'TPS（千）',
      top: '最高：',
      throughputTrend: '吞吐趋势',
-      latencyHistogram: '延迟分布',
+      latencyHistogram: '请求时长分布',
      errorTrend: '错误趋势',
      errorDistribution: '错误分布',
      // Health Score & Diagnosis
@@ -2115,7 +2114,9 @@ export default {
        '30m': '近30分钟',
        '1h': '近1小时',
        '6h': '近6小时',
-        '24h': '近24小时'
+        '24h': '近24小时',
+        '7d': '近7天',
+        '30d': '近30天'
      },
      fullscreen: {
        enter: '进入全屏'
@@ -2144,15 +2145,8 @@ export default {
        memoryHigh: '内存使用率偏高 ({usage}%)',
        memoryHighImpact: '内存压力较大，需要关注',
        memoryHighAction: '监控内存趋势，检查是否有内存泄漏',
-        // Latency diagnostics
-        latencyCritical: '响应延迟严重过高 ({latency}ms)',
-        latencyCriticalImpact: '用户体验极差，大量请求超时',
-        latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
-        latencyHigh: '响应延迟偏高 ({latency}ms)',
-        latencyHighImpact: '用户体验下降，需要优化',
-        latencyHighAction: '分析慢请求日志，优化数据库查询和业务逻辑',
        ttftHigh: '首字节时间偏高 ({ttft}ms)',
-        ttftHighImpact: '用户感知延迟增加',
+        ttftHighImpact: '用户感知时长增加',
        ttftHighAction: '优化请求处理流程，减少前置逻辑耗时',
        // Error rate diagnostics
        upstreamCritical: '上游错误率严重偏高 ({rate}%)',
@@ -2170,13 +2164,13 @@ export default {
        // SLA diagnostics
        slaCritical: 'SLA 严重低于目标 ({sla}%)',
        slaCriticalImpact: '用户体验严重受损',
-        slaCriticalAction: '紧急排查错误和延迟问题，考虑限流保护',
+        slaCriticalAction: '紧急排查错误原因，必要时采取限流保护',
        slaLow: 'SLA 低于目标 ({sla}%)',
        slaLowImpact: '需要关注服务质量',
        slaLowAction: '分析SLA下降原因，优化系统性能',
        // Health score diagnostics
        healthCritical: '综合健康评分过低 ({score})',
-        healthCriticalImpact: '多个指标可能同时异常，建议优先排查错误与延迟',
+        healthCriticalImpact: '多个指标可能同时异常，建议优先排查错误与资源使用情况',
        healthCriticalAction: '全面检查系统状态，优先处理critical级别问题',
        healthLow: '综合健康评分偏低 ({score})',
        healthLowImpact: '可能存在轻度波动，建议关注 SLA 与错误率',
@@ -2187,27 +2181,106 @@ export default {
      // Error Log
      errorLog: {
        timeId: '时间 / ID',
+        commonErrors: {
+          contextDeadlineExceeded: '请求超时',
+          connectionRefused: '连接被拒绝',
+          rateLimit: '触发限流'
+        },
+        time: '时间',
+        type: '类型',
        context: '上下文',
+        platform: '平台',
+        model: '模型',
+        group: '分组',
+        user: '用户',
+        userId: '用户 ID',
+        account: '账号',
+        accountId: '账号 ID',
        status: '状态码',
-        message: '消息',
+        message: '响应内容',
-        latency: '延迟',
+        latency: '请求时长',
        action: '操作',
        noErrors: '该窗口内暂无错误。',
        grp: 'GRP：',
        acc: 'ACC：',
        details: '详情',
-        phase: '阶段'
+        phase: '阶段',
+        id: 'ID：',
+        typeUpstream: '上游',
+        typeRequest: '请求',
+        typeAuth: '认证',
+        typeRouting: '路由',
+        typeInternal: '内部'
      },
      // Error Details Modal
      errorDetails: {
        upstreamErrors: '上游错误',
        requestErrors: '请求错误',
+        unresolved: '未解决',
+        resolved: '已解决',
+        viewErrors: '错误',
+        viewExcluded: '排除项',
+        statusCodeOther: '其他',
+        owner: {
+          provider: '服务商',
+          client: '客户端',
+          platform: '平台'
+        },
+        phase: {
+          request: '请求',
+          auth: '认证',
+          routing: '路由',
+          upstream: '上游',
+          network: '网络',
+          internal: '内部'
+        },
        total: '总计：',
        searchPlaceholder: '搜索 request_id / client_request_id / message',
-        accountIdPlaceholder: 'account_id'
      },
      // Error Detail Modal
      errorDetail: {
+        title: '错误详情',
+        titleWithId: '错误 #{id}',
+        noErrorSelected: '未选择错误。',
+        resolution: '已解决：',
+        pinnedToOriginalAccountId: '固定到原 account_id',
+        missingUpstreamRequestBody: '缺少上游请求体',
+        failedToLoadRetryHistory: '加载重试历史失败',
+        failedToUpdateResolvedStatus: '更新解决状态失败',
+        unsupportedRetryMode: '不支持的重试模式',
+        classificationKeys: {
+          phase: '阶段',
+          owner: '归属方',
+          source: '来源',
+          retryable: '可重试',
+          resolvedAt: '解决时间',
+          resolvedBy: '解决人',
+          resolvedRetryId: '解决重试ID',
+          retryCount: '重试次数'
+        },
+        source: {
+          upstream_http: '上游 HTTP'
+        },
+        upstreamKeys: {
+          status: '状态码',
+          message: '消息',
+          detail: '详情',
+          upstreamErrors: '上游错误列表'
+        },
+        upstreamEvent: {
+          account: '账号',
+          status: '状态码',
+          requestId: '请求ID'
+        },
+        responsePreview: {
+          expand: '响应内容（点击展开）',
+          collapse: '响应内容（点击收起）'
+        },
+        retryMeta: {
+          used: '使用账号',
+          success: '成功',
+          pinned: '固定账号'
+        },
        loading: '加载中…',
        requestId: '请求 ID',
        time: '时间',
@@ -2217,8 +2290,10 @@ export default {
        basicInfo: '基本信息',
        platform: '平台',
        model: '模型',
-        latency: '延迟',
+        group: '分组',
-        ttft: 'TTFT',
+        user: '用户',
+        account: '账号',
+        latency: '请求时长',
        businessLimited: '业务限制',
        requestPath: '请求路径',
        timings: '时序信息',
@@ -2226,6 +2301,8 @@ export default {
        routing: '路由',
        upstream: '上游',
        response: '响应',
+        classification: '错误分类',
+        notRetryable: '此错误不建议重试',
        retry: '重试',
        retryClient: '重试（客户端）',
        retryUpstream: '重试（上游固定）',
@@ -2237,7 +2314,6 @@ export default {
        confirmRetry: '确认重试',
        retrySuccess: '重试成功',
        retryFailed: '重试失败',
-        na: 'N/A',
        retryHint: '重试将使用相同的请求参数重新发送请求',
        retryClientHint: '使用客户端重试（不固定账号）',
        retryUpstreamHint: '使用上游固定重试（固定到错误的账号）',
@@ -2245,8 +2321,33 @@ export default {
        retryNote1: '重试会使用相同的请求体和参数',
        retryNote2: '如果原请求失败是因为账号问题，固定重试可能仍会失败',
        retryNote3: '客户端重试会重新选择账号',
+        retryNote4: '对不可重试的错误可以强制重试，但不推荐',
        confirmRetryMessage: '确认要重试该请求吗？',
-        confirmRetryHint: '将使用相同的请求参数重新发送'
+        confirmRetryHint: '将使用相同的请求参数重新发送',
+        forceRetry: '我已确认并理解强制重试风险',
+        forceRetryHint: '此错误类型通常不可通过重试解决；如仍需重试请勾选确认',
+        forceRetryNeedAck: '请先勾选确认再强制重试',
+        markResolved: '标记已解决',
+        markUnresolved: '标记未解决',
+        viewRetries: '重试历史',
+        retryHistory: '重试历史',
+        tabOverview: '概览',
+        tabRetries: '重试历史',
+        tabRequest: '请求详情',
+        tabResponse: '响应详情',
+        responseBody: '响应详情',
+        compareA: '对比 A',
+        compareB: '对比 B',
+        retrySummary: '重试摘要',
+        responseHintSucceeded: '展示重试成功的 response_preview（#{id}）',
+        responseHintFallback: '没有成功的重试结果，展示存储的 error_body',
+        suggestion: '处理建议',
+        suggestUpstreamResolved: '✓ 上游错误已通过重试解决，无需人工介入',
+        suggestUpstream: '⚠️ 上游服务不稳定，建议：检查上游账号状态 / 考虑切换账号 / 再次重试',
+        suggestRequest: '⚠️ 客户端请求错误，建议：联系客户修正请求参数 / 手动标记已解决',
+        suggestAuth: '⚠️ 认证失败，建议：检查 API Key 是否有效 / 联系客户更新凭证',
+        suggestPlatform: '🚨 平台错误，建议立即排查修复',
+        suggestGeneric: '查看详情了解更多信息'
      },
      requestDetails: {
        title: '请求明细',
@@ -2282,13 +2383,46 @@ export default {
        loading: '加载中...',
        empty: '暂无告警事件',
        loadFailed: '加载告警事件失败',
+        status: {
+          firing: '告警中',
+          resolved: '已恢复',
+          manualResolved: '手动已解决'
+        },
+        detail: {
+          title: '告警详情',
+          loading: '加载详情中...',
+          empty: '暂无详情',
+          loadFailed: '加载告警详情失败',
+          manualResolve: '标记为已解决',
+          manualResolvedSuccess: '已标记为手动解决',
+          manualResolvedFailed: '标记为手动解决失败',
+          silence: '忽略此告警',
+          silenceSuccess: '已静默该告警',
+          silenceFailed: '静默失败',
+          viewRule: '查看规则',
+          viewLogs: '查看相关日志',
+          firedAt: '触发时间',
+          resolvedAt: '解决时间',
+          ruleId: '规则 ID',
+          dimensions: '维度信息',
+          historyTitle: '历史记录',
+          historyHint: '同一规则 + 相同维度的最近事件',
+          historyLoading: '加载历史中...',
+          historyEmpty: '暂无历史记录'
+        },
        table: {
          time: '时间',
          status: '状态',
          severity: '级别',
+          platform: '平台',
+          ruleId: '规则ID',
          title: '标题',
+          duration: '持续时间',
          metric: '指标 / 阈值',
-          email: '邮件已发送'
+          dimensions: '维度',
+          email: '邮件已发送',
+          emailSent: '已发送',
+          emailIgnored: '已忽略'
        }
      },
      alertRules: {
@@ -2316,8 +2450,8 @@ export default {
          successRate: '成功率 (%)',
          errorRate: '错误率 (%)',
          upstreamErrorRate: '上游错误率 (%)',
-          p95: 'P95 延迟 (ms)',
+          p95: 'P95 请求时长 (ms)',
-          p99: 'P99 延迟 (ms)',
+          p99: 'P99 请求时长 (ms)',
          cpu: 'CPU 使用率 (%)',
          memory: '内存使用率 (%)',
          queueDepth: '并发排队深度',
@@ -2402,7 +2536,6 @@ export default {
          title: '告警静默（维护模式）',
          enabled: '启用静默',
          globalUntil: '静默截止时间（RFC3339）',
-          untilPlaceholder: '2026-01-05T00:00:00Z',
          untilHint: '建议填写截止时间，避免忘记关闭静默。',
          reason: '原因',
          reasonPlaceholder: '例如：计划维护',
@@ -2442,7 +2575,11 @@ export default {
          lockKeyRequired: '启用分布式锁时必须填写 Lock Key',
          lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头',
          lockKeyHint: '建议以「{prefix}」开头以避免冲突',
-          lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间'
+          lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间',
+          slaMinPercentRange: 'SLA 最低值必须在 0-100 之间',
+          ttftP99MaxRange: 'TTFT P99 最大值必须大于或等于 0',
+          requestErrorRateMaxRange: '请求错误率最大值必须在 0-100 之间',
+          upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间'
        }
      },
      email: {
@@ -2507,8 +2644,6 @@ export default {
        metricThresholdsHint: '配置各项指标的告警阈值，超出阈值时将以红色显示',
        slaMinPercent: 'SLA最低百分比',
        slaMinPercentHint: 'SLA低于此值时显示为红色（默认：99.5%）',
-        latencyP99MaxMs: '延迟P99最大值（毫秒）',
-        latencyP99MaxMsHint: '延迟P99高于此值时显示为红色（默认：2000ms）',
        ttftP99MaxMs: 'TTFT P99最大值（毫秒）',
        ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色（默认：500ms）',
        requestErrorRateMaxPercent: '请求错误率最大值（%）',
@@ -2527,9 +2662,28 @@ export default {
        aggregation: '预聚合任务',
        enableAggregation: '启用预聚合任务',
        aggregationHint: '预聚合可提升长时间窗口查询性能',
+        errorFiltering: '错误过滤',
+        ignoreCountTokensErrors: '忽略 count_tokens 错误',
+        ignoreCountTokensErrorsHint: '启用后，count_tokens 请求的错误将不会写入错误日志。',
+        ignoreContextCanceled: '忽略客户端断连错误',
+        ignoreContextCanceledHint: '启用后，客户端主动断开连接（context canceled）的错误将不会写入错误日志。',
+        ignoreNoAvailableAccounts: '忽略无可用账号错误',
+        ignoreNoAvailableAccountsHint: '启用后，“No available accounts” 错误将不会写入错误日志（不推荐，这通常是配置问题）。',
+        autoRefresh: '自动刷新',
+        enableAutoRefresh: '启用自动刷新',
+        enableAutoRefreshHint: '自动刷新仪表板数据，启用后会定期拉取最新数据。',
+        refreshInterval: '刷新间隔',
+        refreshInterval15s: '15 秒',
+        refreshInterval30s: '30 秒',
+        refreshInterval60s: '60 秒',
+        autoRefreshCountdown: '自动刷新：{seconds}s',
        validation: {
          title: '请先修正以下问题',
-          retentionDaysRange: '保留天数必须在1-365天之间'
+          retentionDaysRange: '保留天数必须在1-365天之间',
+          slaMinPercentRange: 'SLA最低百分比必须在0-100之间',
+          ttftP99MaxRange: 'TTFT P99最大值必须大于等于0',
+          requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间',
+          upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间'
        }
      },
      concurrency: {
@@ -2567,12 +2721,12 @@ export default {
      tooltips: {
        totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
        throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
-        latencyHistogram: '成功请求的延迟分布（毫秒）。',
+        latencyHistogram: '成功请求的请求时长分布（毫秒）。',
        errorTrend: '错误趋势（SLA 口径排除业务限制；上游错误率排除 429/529）。',
        errorDistribution: '按状态码统计的错误分布。',
        upstreamErrors: '上游服务返回的错误，包括API提供商的错误响应（排除429/529限流错误）。',
        goroutines:
-          'Go 运行时的协程数量（轻量级线程）。没有绝对“安全值”，建议以历史基线为准。经验参考：<2000 常见；2000-8000 需关注；>8000 且伴随队列/延迟上升时，优先排查阻塞/泄漏。',
+          'Go 运行时的协程数量（轻量级线程）。没有绝对"安全值"，建议以历史基线为准。经验参考：<2000 常见；2000-8000 需关注；>8000 且伴随队列上升时，优先排查阻塞/泄漏。',
        cpu: 'CPU 使用率，显示系统处理器的负载情况。',
        memory: '内存使用率，包括已使用和总可用内存。',
        db: '数据库连接池状态，包括活跃连接、空闲连接和等待连接数。',
@@ -2582,7 +2736,7 @@ export default {
        tokens: '当前时间窗口内处理的总Token数量。',
        sla: '服务等级协议达成率，排除业务限制（如余额不足、配额超限）的成功请求占比。',
        errors: '错误统计，包括总错误数、错误率和上游错误率。',
-        latency: '请求延迟统计，包括 p50、p90、p95、p99 等百分位数。',
+        latency: '请求时长统计，包括 p50、p90、p95、p99 等百分位数。',
        ttft: '首Token延迟（Time To First Token），衡量流式响应的首字节返回速度。',
        health: '系统健康评分（0-100），综合考虑 SLA、错误率和资源使用情况。'
      },

--- a/frontend/src/views/admin/ops/OpsDashboard.vue
+++ b/frontend/src/views/admin/ops/OpsDashboard.vue
@@ -8,7 +8,7 @@
        {{ errorMessage }}
      </div>
-      <OpsDashboardSkeleton v-if="loading && !hasLoadedOnce" />
+      <OpsDashboardSkeleton v-if="loading && !hasLoadedOnce" :fullscreen="isFullscreen" />
      <OpsDashboardHeader
        v-else-if="opsEnabled"
@@ -94,7 +94,7 @@
          @openErrorDetail="openError"
        />
-        <OpsErrorDetailModal v-model:show="showErrorModal" :error-id="selectedErrorId" />
+        <OpsErrorDetailModal v-model:show="showErrorModal" :error-id="selectedErrorId" :error-type="errorDetailsType" />
        <OpsRequestDetailsModal
          v-model="showRequestDetails"
@@ -169,7 +169,13 @@ const QUERY_KEYS = {
  platform: 'platform',
  groupId: 'group_id',
  queryMode: 'mode',
-  fullscreen: 'fullscreen'
+  fullscreen: 'fullscreen',
+  // Deep links
+  openErrorDetails: 'open_error_details',
+  errorType: 'error_type',
+  alertRuleId: 'alert_rule_id',
+  openAlertRules: 'open_alert_rules'
 } as const
 const isApplyingRouteQuery = ref(false)
@@ -249,6 +255,24 @@ const applyRouteQueryToState = () => {
    const fallback = adminSettingsStore.opsQueryModeDefault || 'auto'
    queryMode.value = allowedQueryModes.has(fallback as QueryMode) ? (fallback as QueryMode) : 'auto'
  }
+  // Deep links
+  const openRules = readQueryString(QUERY_KEYS.openAlertRules)
+  if (openRules === '1' || openRules === 'true') {
+    showAlertRulesCard.value = true
+  }
+  const ruleID = readQueryNumber(QUERY_KEYS.alertRuleId)
+  if (typeof ruleID === 'number' && ruleID > 0) {
+    showAlertRulesCard.value = true
+  }
+  const openErr = readQueryString(QUERY_KEYS.openErrorDetails)
+  if (openErr === '1' || openErr === 'true') {
+    const typ = readQueryString(QUERY_KEYS.errorType)
+    errorDetailsType.value = typ === 'upstream' ? 'upstream' : 'request'
+    showErrorDetails.value = true
+  }
 }
 applyRouteQueryToState()
@@ -376,11 +400,17 @@ function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) {
  requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) }
  if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title
+  // Ensure only one modal visible at a time.
+  showErrorDetails.value = false
+  showErrorModal.value = false
  showRequestDetails.value = true
 }
 function openErrorDetails(kind: 'request' | 'upstream') {
  errorDetailsType.value = kind
+  // Ensure only one modal visible at a time.
+  showRequestDetails.value = false
+  showErrorModal.value = false
  showErrorDetails.value = true
 }
@@ -422,6 +452,9 @@ function onQueryModeChange(v: string | number | boolean | null) {
 function openError(id: number) {
  selectedErrorId.value = id
+  // Ensure only one modal visible at a time.
+  showErrorDetails.value = false
+  showRequestDetails.value = false
  showErrorModal.value = true
 }

--- a/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue
+++ b/frontend/src/views/admin/ops/components/OpsAlertEventsCard.vue
--- a/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
+++ b/frontend/src/views/admin/ops/components/OpsAlertRulesCard.vue
@@ -140,24 +140,6 @@ const metricDefinitions = computed(() => {
      recommendedThreshold: 1,
      unit: '%'
    },
-    {
-      type: 'p95_latency_ms',
-      group: 'system',
-      label: t('admin.ops.alertRules.metrics.p95'),
-      description: t('admin.ops.alertRules.metricDescriptions.p95'),
-      recommendedOperator: '>',
-      recommendedThreshold: 1000,
-      unit: 'ms'
-    },
-    {
-      type: 'p99_latency_ms',
-      group: 'system',
-      label: t('admin.ops.alertRules.metrics.p99'),
-      description: t('admin.ops.alertRules.metricDescriptions.p99'),
-      recommendedOperator: '>',
-      recommendedThreshold: 2000,
-      unit: 'ms'
-    },
    {
      type: 'cpu_usage_percent',
      group: 'system',

--- a/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardHeader.vue
@@ -169,8 +169,8 @@ const updatedAtLabel = computed(() => {
  return props.lastUpdated.toLocaleTimeString()
 })
-// --- Color coding for latency/TTFT ---
+// --- Color coding for TTFT ---
-function getLatencyColor(ms: number | null | undefined): string {
+function getTTFTColor(ms: number | null | undefined): string {
  if (ms == null) return 'text-gray-900 dark:text-white'
  if (ms < 500) return 'text-green-600 dark:text-green-400'
  if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400'
@@ -186,13 +186,6 @@ function isSLABelowThreshold(slaPercent: number | null): boolean {
  return slaPercent < threshold
 }
-function isLatencyAboveThreshold(latencyP99Ms: number | null): boolean {
-  if (latencyP99Ms == null) return false
-  const threshold = props.thresholds?.latency_p99_ms_max
-  if (threshold == null) return false
-  return latencyP99Ms > threshold
-}
 function isTTFTAboveThreshold(ttftP99Ms: number | null): boolean {
  if (ttftP99Ms == null) return false
  const threshold = props.thresholds?.ttft_p99_ms_max
@@ -482,24 +475,6 @@ const diagnosisReport = computed<DiagnosisItem[]>(() => {
    }
  }
-  // Latency diagnostics
-  const durationP99 = ov.duration?.p99_ms ?? 0
-  if (durationP99 > 2000) {
-    report.push({
-      type: 'critical',
-      message: t('admin.ops.diagnosis.latencyCritical', { latency: durationP99.toFixed(0) }),
-      impact: t('admin.ops.diagnosis.latencyCriticalImpact'),
-      action: t('admin.ops.diagnosis.latencyCriticalAction')
-    })
-  } else if (durationP99 > 1000) {
-    report.push({
-      type: 'warning',
-      message: t('admin.ops.diagnosis.latencyHigh', { latency: durationP99.toFixed(0) }),
-      impact: t('admin.ops.diagnosis.latencyHighImpact'),
-      action: t('admin.ops.diagnosis.latencyHighAction')
-    })
-  }
  const ttftP99 = ov.ttft?.p99_ms ?? 0
  if (ttftP99 > 500) {
    report.push({
@@ -851,7 +826,7 @@ function handleToolbarRefresh() {
                <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
                <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
              </svg>
-              <span>自动刷新: {{ props.autoRefreshCountdown }}s</span>
+              <span>{{ t('admin.ops.settings.autoRefreshCountdown', { seconds: props.autoRefreshCountdown }) }}</span>
            </span>
          </template>
@@ -1113,7 +1088,7 @@ function handleToolbarRefresh() {
                  </div>
                  <div class="flex items-baseline gap-1.5">
                    <span :class="[props.fullscreen ? 'text-4xl' : 'text-xl sm:text-2xl', 'font-black text-gray-900 dark:text-white']">{{ displayRealTimeTps.toFixed(1) }}</span>
-                    <span :class="[props.fullscreen ? 'text-sm' : 'text-xs', 'font-bold text-gray-500']">TPS</span>
+                    <span :class="[props.fullscreen ? 'text-sm' : 'text-xs', 'font-bold text-gray-500']">{{ t('admin.ops.tps') }}</span>
                  </div>
                </div>
              </div>
@@ -1130,7 +1105,7 @@ function handleToolbarRefresh() {
                    </div>
                    <div class="flex items-baseline gap-1.5">
                      <span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsPeakLabel }}</span>
-                      <span class="text-xs">TPS</span>
+                      <span class="text-xs">{{ t('admin.ops.tps') }}</span>
                    </div>
                  </div>
                </div>
@@ -1145,7 +1120,7 @@ function handleToolbarRefresh() {
                    </div>
                    <div class="flex items-baseline gap-1.5">
                      <span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsAvgLabel }}</span>
-                      <span class="text-xs">TPS</span>
+                      <span class="text-xs">{{ t('admin.ops.tps') }}</span>
                    </div>
                  </div>
                </div>
@@ -1181,7 +1156,7 @@ function handleToolbarRefresh() {
      <!-- Right: 6 cards (3 cols x 2 rows) -->
      <div class="grid h-full grid-cols-1 content-center gap-4 sm:grid-cols-2 lg:col-span-7 lg:grid-cols-3">
        <!-- Card 1: Requests -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 1;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-1">
              <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestsTitle') }}</span>
@@ -1217,10 +1192,10 @@ function handleToolbarRefresh() {
        </div>
        <!-- Card 2: SLA -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 2;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-2">
-              <span class="text-[10px] font-bold uppercase text-gray-400">SLA</span>
+              <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.sla') }}</span>
              <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.sla')" />
              <span class="h-1.5 w-1.5 rounded-full" :class="isSLABelowThreshold(slaPercent) ? 'bg-red-500' : (slaPercent ?? 0) >= 99.5 ? 'bg-green-500' : 'bg-yellow-500'"></span>
            </div>
@@ -1247,8 +1222,8 @@ function handleToolbarRefresh() {
          </div>
        </div>
-        <!-- Card 3: Latency (Duration) -->
+        <!-- Card 4: Request Duration -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 4;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-1">
              <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.latencyDuration') }}</span>
@@ -1264,42 +1239,42 @@ function handleToolbarRefresh() {
            </button>
          </div>
          <div class="mt-2 flex items-baseline gap-2">
-            <div class="text-3xl font-black" :class="isLatencyAboveThreshold(durationP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(durationP99Ms)">
+            <div class="text-3xl font-black text-gray-900 dark:text-white">
              {{ durationP99Ms ?? '-' }}
            </div>
            <span class="text-xs font-bold text-gray-400">ms (P99)</span>
          </div>
          <div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P95:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p95') }}</span>
-              <span class="font-bold" :class="getLatencyColor(durationP95Ms)">{{ durationP95Ms ?? '-' }}</span>
+              <span class="font-bold text-gray-900 dark:text-white">{{ durationP95Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P90:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p90') }}</span>
-              <span class="font-bold" :class="getLatencyColor(durationP90Ms)">{{ durationP90Ms ?? '-' }}</span>
+              <span class="font-bold text-gray-900 dark:text-white">{{ durationP90Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P50:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p50') }}</span>
-              <span class="font-bold" :class="getLatencyColor(durationP50Ms)">{{ durationP50Ms ?? '-' }}</span>
+              <span class="font-bold text-gray-900 dark:text-white">{{ durationP50Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
              <span class="text-gray-500">Avg:</span>
-              <span class="font-bold" :class="getLatencyColor(durationAvgMs)">{{ durationAvgMs ?? '-' }}</span>
+              <span class="font-bold text-gray-900 dark:text-white">{{ durationAvgMs ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
              <span class="text-gray-500">Max:</span>
-              <span class="font-bold" :class="getLatencyColor(durationMaxMs)">{{ durationMaxMs ?? '-' }}</span>
+              <span class="font-bold text-gray-900 dark:text-white">{{ durationMaxMs ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
          </div>
        </div>
-        <!-- Card 4: TTFT -->
+        <!-- Card 5: TTFT -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 5;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-1">
              <span class="text-[10px] font-bold uppercase text-gray-400">TTFT</span>
@@ -1309,48 +1284,48 @@ function handleToolbarRefresh() {
              v-if="!props.fullscreen"
              class="text-[10px] font-bold text-blue-500 hover:underline"
              type="button"
-              @click="openDetails({ title: 'TTFT', sort: 'duration_desc' })"
+              @click="openDetails({ title: t('admin.ops.ttftLabel'), sort: 'duration_desc' })"
            >
              {{ t('admin.ops.requestDetails.details') }}
            </button>
          </div>
          <div class="mt-2 flex items-baseline gap-2">
-            <div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(ttftP99Ms)">
+            <div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getTTFTColor(ttftP99Ms)">
              {{ ttftP99Ms ?? '-' }}
            </div>
            <span class="text-xs font-bold text-gray-400">ms (P99)</span>
          </div>
          <div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P95:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p95') }}</span>
-              <span class="font-bold" :class="getLatencyColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span>
+              <span class="font-bold" :class="getTTFTColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P90:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p90') }}</span>
-              <span class="font-bold" :class="getLatencyColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span>
+              <span class="font-bold" :class="getTTFTColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
-              <span class="text-gray-500">P50:</span>
+              <span class="text-gray-500">{{ t('admin.ops.p50') }}</span>
-              <span class="font-bold" :class="getLatencyColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span>
+              <span class="font-bold" :class="getTTFTColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
              <span class="text-gray-500">Avg:</span>
-              <span class="font-bold" :class="getLatencyColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span>
+              <span class="font-bold" :class="getTTFTColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
            <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
              <span class="text-gray-500">Max:</span>
-              <span class="font-bold" :class="getLatencyColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span>
+              <span class="font-bold" :class="getTTFTColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span>
              <span class="text-gray-400">ms</span>
            </div>
          </div>
        </div>
-        <!-- Card 5: Request Errors -->
+        <!-- Card 3: Request Errors -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 3;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-1">
              <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestErrors') }}</span>
@@ -1376,7 +1351,7 @@ function handleToolbarRefresh() {
        </div>
        <!-- Card 6: Upstream Errors -->
-        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900">
+        <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 6;">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-1">
              <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.upstreamErrors') }}</span>
@@ -1423,7 +1398,7 @@ function handleToolbarRefresh() {
        <!-- MEM -->
        <div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900">
          <div class="flex items-center gap-1">
-            <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">MEM</div>
+            <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">{{ t('admin.ops.mem') }}</div>
            <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.memory')" />
          </div>
          <div class="mt-1 text-lg font-black" :class="memPercentClass">
@@ -1441,7 +1416,7 @@ function handleToolbarRefresh() {
        <!-- DB -->
        <div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900">
          <div class="flex items-center gap-1">
-            <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">DB</div>
+            <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">{{ t('admin.ops.db') }}</div>
            <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.db')" />
          </div>
          <div class="mt-1 text-lg font-black" :class="dbMiddleClass">

--- a/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue
+++ b/frontend/src/views/admin/ops/components/OpsDashboardSkeleton.vue
--- a/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailModal.vue
--- a/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue
+++ b/frontend/src/views/admin/ops/components/OpsErrorDetailsModal.vue
--- a/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue
+++ b/frontend/src/views/admin/ops/components/OpsErrorLogTable.vue
--- a/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue
+++ b/frontend/src/views/admin/ops/components/OpsRequestDetailsModal.vue
@@ -38,7 +38,7 @@ const loading = ref(false)
 const items = ref<OpsRequestDetail[]>([])
 const total = ref(0)
 const page = ref(1)
-const pageSize = ref(20)
+const pageSize = ref(10)
 const close = () => emit('update:modelValue', false)
@@ -95,7 +95,7 @@ watch(
  (open) => {
    if (open) {
      page.value = 1
-      pageSize.value = 20
+      pageSize.value = 10
      fetchData()
    }
  }

--- a/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue
+++ b/frontend/src/views/admin/ops/components/OpsRuntimeSettingsCard.vue