Unverified Commit 27214f86 authored by Wesley Liddick's avatar Wesley Liddick Committed by GitHub
Browse files

Merge pull request #285 from IanShaw027/fix/ops-bug

feat(ops): 增强错误日志管理、告警静默和前端 UI 优化
parents 28de614d 5354ba36
...@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool { ...@@ -108,6 +108,10 @@ func (w *limitedResponseWriter) truncated() bool {
return w.totalWritten > int64(w.limit) return w.totalWritten > int64(w.limit)
} }
const (
// OpsRetryModeUpstreamEvent is the mode value RetryUpstreamEvent passes to
// retryWithErrorLog as the retry's recorded mode; the retry itself is
// executed with OpsRetryModeUpstream.
OpsRetryModeUpstreamEvent = "upstream_event"
)
func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) { func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, errorID int64, mode string, pinnedAccountID *int64) (*OpsRetryResult, error) {
if err := s.RequireMonitoringEnabled(ctx); err != nil { if err := s.RequireMonitoringEnabled(ctx); err != nil {
return nil, err return nil, err
...@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er ...@@ -123,6 +127,81 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream") return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_MODE", "mode must be client or upstream")
} }
errorLog, err := s.GetErrorLogByID(ctx, errorID)
if err != nil {
return nil, err
}
if errorLog == nil {
return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
}
if strings.TrimSpace(errorLog.RequestBody) == "" {
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
}
var pinned *int64
if mode == OpsRetryModeUpstream {
if pinnedAccountID != nil && *pinnedAccountID > 0 {
pinned = pinnedAccountID
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
pinned = errorLog.AccountID
} else {
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry")
}
}
return s.retryWithErrorLog(ctx, requestedByUserID, errorID, mode, mode, pinned, errorLog)
}
// RetryUpstreamEvent re-sends one upstream attempt recorded in
// ops_error_logs.upstream_errors.
//
// idx is the 0-based position of the attempt inside the stored event list.
// The retry is always pinned to the account_id captured on that event and
// replays the event's own upstream request body rather than the client
// request body stored on the error log.
//
// It returns a bad-request error when idx is negative or out of range, when
// the stored events cannot be parsed, or when the selected event is missing,
// has no positive account_id, or has no upstream request body. It returns a
// not-found error when the error log does not exist.
func (s *OpsService) RetryUpstreamEvent(ctx context.Context, requestedByUserID int64, errorID int64, idx int) (*OpsRetryResult, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	switch {
	case s.opsRepo == nil:
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	case idx < 0:
		return nil, infraerrors.BadRequest("OPS_RETRY_INVALID_UPSTREAM_IDX", "invalid upstream idx")
	}

	detail, err := s.GetErrorLogByID(ctx, errorID)
	switch {
	case err != nil:
		return nil, err
	case detail == nil:
		return nil, infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
	}

	attempts, parseErr := ParseOpsUpstreamErrors(detail.UpstreamErrors)
	if parseErr != nil {
		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENTS_INVALID", "invalid upstream_errors")
	}
	if idx >= len(attempts) {
		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_IDX_OOB", "upstream idx out of range")
	}

	// Entries are pointers, so a stored JSON null decodes to a nil event.
	event := attempts[idx]
	switch {
	case event == nil:
		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_EVENT_MISSING", "upstream event missing")
	case event.AccountID <= 0:
		return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
	}

	body := strings.TrimSpace(event.UpstreamRequestBody)
	if body == "" {
		return nil, infraerrors.BadRequest("OPS_RETRY_UPSTREAM_NO_REQUEST_BODY", "No upstream request body found to retry")
	}

	// Replay from a shallow copy so only the copy's request body is swapped.
	replay := *detail
	replay.RequestBody = body
	accountID := event.AccountID

	// Persist as upstream_event, execute as upstream pinned retry.
	return s.retryWithErrorLog(ctx, requestedByUserID, errorID, OpsRetryModeUpstreamEvent, OpsRetryModeUpstream, &accountID, &replay)
}
func (s *OpsService) retryWithErrorLog(ctx context.Context, requestedByUserID int64, errorID int64, mode string, execMode string, pinnedAccountID *int64, errorLog *OpsErrorLogDetail) (*OpsRetryResult, error) {
latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID) latest, err := s.opsRepo.GetLatestRetryAttemptForError(ctx, errorID)
if err != nil && !errors.Is(err, sql.ErrNoRows) { if err != nil && !errors.Is(err, sql.ErrNoRows) {
return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err) return nil, infraerrors.InternalServer("OPS_RETRY_LOAD_LATEST_FAILED", "Failed to check retry status").WithCause(err)
...@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er ...@@ -144,22 +223,18 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
} }
} }
errorLog, err := s.GetErrorLogByID(ctx, errorID) if errorLog == nil || strings.TrimSpace(errorLog.RequestBody) == "" {
if err != nil {
return nil, err
}
if strings.TrimSpace(errorLog.RequestBody) == "" {
return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry") return nil, infraerrors.BadRequest("OPS_RETRY_NO_REQUEST_BODY", "No request body found to retry")
} }
var pinned *int64 var pinned *int64
if mode == OpsRetryModeUpstream { if execMode == OpsRetryModeUpstream {
if pinnedAccountID != nil && *pinnedAccountID > 0 { if pinnedAccountID != nil && *pinnedAccountID > 0 {
pinned = pinnedAccountID pinned = pinnedAccountID
} else if errorLog.AccountID != nil && *errorLog.AccountID > 0 { } else if errorLog.AccountID != nil && *errorLog.AccountID > 0 {
pinned = errorLog.AccountID pinned = errorLog.AccountID
} else { } else {
return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "pinned_account_id is required for upstream retry") return nil, infraerrors.BadRequest("OPS_RETRY_PINNED_ACCOUNT_REQUIRED", "account_id is required for upstream retry")
} }
} }
...@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er ...@@ -196,7 +271,7 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout) execCtx, cancel := context.WithTimeout(ctx, opsRetryTimeout)
defer cancel() defer cancel()
execRes := s.executeRetry(execCtx, errorLog, mode, pinned) execRes := s.executeRetry(execCtx, errorLog, execMode, pinned)
finishedAt := time.Now() finishedAt := time.Now()
result.FinishedAt = finishedAt result.FinishedAt = finishedAt
...@@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er ...@@ -220,27 +295,40 @@ func (s *OpsService) RetryError(ctx context.Context, requestedByUserID int64, er
msg := result.ErrorMessage msg := result.ErrorMessage
updateErrMsg = &msg updateErrMsg = &msg
} }
// Keep legacy result_request_id empty; use upstream_request_id instead.
var resultRequestID *string var resultRequestID *string
if strings.TrimSpace(result.UpstreamRequestID) != "" {
v := result.UpstreamRequestID
resultRequestID = &v
}
finalStatus := result.Status finalStatus := result.Status
if strings.TrimSpace(finalStatus) == "" { if strings.TrimSpace(finalStatus) == "" {
finalStatus = opsRetryStatusFailed finalStatus = opsRetryStatusFailed
} }
success := strings.EqualFold(finalStatus, opsRetryStatusSucceeded)
httpStatus := result.HTTPStatusCode
upstreamReqID := result.UpstreamRequestID
usedAccountID := result.UsedAccountID
preview := result.ResponsePreview
truncated := result.ResponseTruncated
if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{ if err := s.opsRepo.UpdateRetryAttempt(updateCtx, &OpsUpdateRetryAttemptInput{
ID: attemptID, ID: attemptID,
Status: finalStatus, Status: finalStatus,
FinishedAt: finishedAt, FinishedAt: finishedAt,
DurationMs: result.DurationMs, DurationMs: result.DurationMs,
Success: &success,
HTTPStatusCode: &httpStatus,
UpstreamRequestID: &upstreamReqID,
UsedAccountID: usedAccountID,
ResponsePreview: &preview,
ResponseTruncated: &truncated,
ResultRequestID: resultRequestID, ResultRequestID: resultRequestID,
ErrorMessage: updateErrMsg, ErrorMessage: updateErrMsg,
}); err != nil { }); err != nil {
// Best-effort: retry itself already executed; do not fail the API response.
log.Printf("[Ops] UpdateRetryAttempt failed: %v", err) log.Printf("[Ops] UpdateRetryAttempt failed: %v", err)
} else if success {
if err := s.opsRepo.UpdateErrorResolution(updateCtx, errorID, true, &requestedByUserID, &attemptID, &finishedAt); err != nil {
log.Printf("[Ops] UpdateErrorResolution failed: %v", err)
}
} }
return result, nil return result, nil
......
...@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn ...@@ -208,6 +208,25 @@ func (s *OpsService) RecordError(ctx context.Context, entry *OpsInsertErrorLogIn
out.Detail = "" out.Detail = ""
} }
out.UpstreamRequestBody = strings.TrimSpace(out.UpstreamRequestBody)
if out.UpstreamRequestBody != "" {
// Reuse the same sanitization/trimming strategy as request body storage.
// Keep it small so it is safe to persist in ops_error_logs JSON.
sanitized, truncated, _ := sanitizeAndTrimRequestBody([]byte(out.UpstreamRequestBody), 10*1024)
if sanitized != "" {
out.UpstreamRequestBody = sanitized
if truncated {
out.Kind = strings.TrimSpace(out.Kind)
if out.Kind == "" {
out.Kind = "upstream"
}
out.Kind = out.Kind + ":request_body_truncated"
}
} else {
out.UpstreamRequestBody = ""
}
}
// Drop fully-empty events (can happen if only status code was known). // Drop fully-empty events (can happen if only status code was known).
if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" { if out.UpstreamStatusCode == 0 && out.Message == "" && out.Detail == "" {
continue continue
...@@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter ...@@ -236,7 +255,13 @@ func (s *OpsService) GetErrorLogs(ctx context.Context, filter *OpsErrorLogFilter
if s.opsRepo == nil { if s.opsRepo == nil {
return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil return &OpsErrorLogList{Errors: []*OpsErrorLog{}, Total: 0, Page: 1, PageSize: 20}, nil
} }
return s.opsRepo.ListErrorLogs(ctx, filter) result, err := s.opsRepo.ListErrorLogs(ctx, filter)
if err != nil {
log.Printf("[Ops] GetErrorLogs failed: %v", err)
return nil, err
}
return result, nil
} }
func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) { func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLogDetail, error) {
...@@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo ...@@ -256,6 +281,46 @@ func (s *OpsService) GetErrorLogByID(ctx context.Context, id int64) (*OpsErrorLo
return detail, nil return detail, nil
} }
// ListRetryAttemptsByErrorID returns the retry attempts the repository holds
// for the given error log, forwarding limit to the repository unchanged.
//
// A sql.ErrNoRows from the repository is reported as an empty, non-nil slice.
// Any other repository failure is wrapped as an internal-server error. The
// call is rejected up front when monitoring is disabled, the repository is
// not configured, or errorID is not positive.
func (s *OpsService) ListRetryAttemptsByErrorID(ctx context.Context, errorID int64, limit int) ([]*OpsRetryAttempt, error) {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return nil, err
	}
	switch {
	case s.opsRepo == nil:
		return nil, infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	case errorID <= 0:
		return nil, infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
	}

	attempts, listErr := s.opsRepo.ListRetryAttemptsByErrorID(ctx, errorID, limit)
	switch {
	case listErr == nil:
		return attempts, nil
	case errors.Is(listErr, sql.ErrNoRows):
		return []*OpsRetryAttempt{}, nil
	default:
		return nil, infraerrors.InternalServer("OPS_RETRY_LIST_FAILED", "Failed to list retry attempts").WithCause(listErr)
	}
}
// UpdateErrorResolution sets the resolved flag of an ops error log, recording
// the resolving user and, optionally, the retry attempt that resolved it.
//
// The error log is loaded first so that a missing row is reported as a
// not-found error instead of a silent no-op update. The call is rejected when
// monitoring is disabled, the repository is not configured, or errorID is not
// positive.
func (s *OpsService) UpdateErrorResolution(ctx context.Context, errorID int64, resolved bool, resolvedByUserID *int64, resolvedRetryID *int64) error {
	if err := s.RequireMonitoringEnabled(ctx); err != nil {
		return err
	}
	switch {
	case s.opsRepo == nil:
		return infraerrors.ServiceUnavailable("OPS_REPO_UNAVAILABLE", "Ops repository not available")
	case errorID <= 0:
		return infraerrors.BadRequest("OPS_ERROR_INVALID_ID", "invalid error id")
	}

	// Best-effort existence check; the loaded record itself is not needed.
	if _, lookupErr := s.opsRepo.GetErrorLogByID(ctx, errorID); lookupErr != nil {
		if !errors.Is(lookupErr, sql.ErrNoRows) {
			return infraerrors.InternalServer("OPS_ERROR_LOAD_FAILED", "Failed to load ops error log").WithCause(lookupErr)
		}
		return infraerrors.NotFound("OPS_ERROR_NOT_FOUND", "ops error log not found")
	}

	// NOTE(review): the trailing nil is the slot where the retry path passes a
	// finish timestamp; presumably the repository picks its own time when it
	// is nil — confirm against the repository implementation.
	return s.opsRepo.UpdateErrorResolution(ctx, errorID, resolved, resolvedByUserID, resolvedRetryID, nil)
}
func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) { func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, truncated bool, bytesLen int) {
bytesLen = len(raw) bytesLen = len(raw)
if len(raw) == 0 { if len(raw) == 0 {
...@@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr ...@@ -296,14 +361,34 @@ func sanitizeAndTrimRequestBody(raw []byte, maxBytes int) (jsonString string, tr
} }
} }
// Last resort: store a minimal placeholder (still valid JSON). // Last resort: keep JSON shape but drop big fields.
placeholder := map[string]any{ // This avoids downstream code that expects certain top-level keys from crashing.
"request_body_truncated": true, if root, ok := decoded.(map[string]any); ok {
placeholder := shallowCopyMap(root)
placeholder["request_body_truncated"] = true
// Replace potentially huge arrays/strings, but keep the keys present.
for _, k := range []string{"messages", "contents", "input", "prompt"} {
if _, exists := placeholder[k]; exists {
placeholder[k] = []any{}
} }
if model := extractString(decoded, "model"); model != "" {
placeholder["model"] = model
} }
for _, k := range []string{"text"} {
if _, exists := placeholder[k]; exists {
placeholder[k] = ""
}
}
encoded4, err4 := json.Marshal(placeholder) encoded4, err4 := json.Marshal(placeholder)
if err4 == nil {
if len(encoded4) <= maxBytes {
return string(encoded4), true, bytesLen
}
}
}
// Final fallback: minimal valid JSON.
encoded4, err4 := json.Marshal(map[string]any{"request_body_truncated": true})
if err4 != nil { if err4 != nil {
return "", true, bytesLen return "", true, bytesLen
} }
...@@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr ...@@ -526,12 +611,3 @@ func sanitizeErrorBodyForStorage(raw string, maxBytes int) (sanitized string, tr
} }
return raw, false return raw, false
} }
// extractString looks up key in v when v is a JSON-style object
// (map[string]any) and returns the value with surrounding whitespace removed.
// It returns "" when v is not such a map, when the key is absent, or when the
// value stored under key is not a string.
func extractString(v any, key string) string {
	if obj, isObject := v.(map[string]any); isObject {
		if text, isString := obj[key].(string); isString {
			return strings.TrimSpace(text)
		}
	}
	return ""
}
...@@ -369,6 +369,8 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings { ...@@ -369,6 +369,8 @@ func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
AggregationEnabled: false, AggregationEnabled: false,
}, },
IgnoreCountTokensErrors: false, IgnoreCountTokensErrors: false,
IgnoreContextCanceled: true, // Default to true - client disconnects are not errors
IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue
AutoRefreshEnabled: false, AutoRefreshEnabled: false,
AutoRefreshIntervalSec: 30, AutoRefreshIntervalSec: 30,
} }
...@@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds" ...@@ -482,13 +484,11 @@ const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
func defaultOpsMetricThresholds() *OpsMetricThresholds { func defaultOpsMetricThresholds() *OpsMetricThresholds {
slaMin := 99.5 slaMin := 99.5
latencyMax := 2000.0
ttftMax := 500.0 ttftMax := 500.0
reqErrMax := 5.0 reqErrMax := 5.0
upstreamErrMax := 5.0 upstreamErrMax := 5.0
return &OpsMetricThresholds{ return &OpsMetricThresholds{
SLAPercentMin: &slaMin, SLAPercentMin: &slaMin,
LatencyP99MsMax: &latencyMax,
TTFTp99MsMax: &ttftMax, TTFTp99MsMax: &ttftMax,
RequestErrorRatePercentMax: &reqErrMax, RequestErrorRatePercentMax: &reqErrMax,
UpstreamErrorRatePercentMax: &upstreamErrMax, UpstreamErrorRatePercentMax: &upstreamErrMax,
...@@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT ...@@ -538,9 +538,6 @@ func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricT
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) { if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
return nil, errors.New("sla_percent_min must be between 0 and 100") return nil, errors.New("sla_percent_min must be between 0 and 100")
} }
if cfg.LatencyP99MsMax != nil && *cfg.LatencyP99MsMax < 0 {
return nil, errors.New("latency_p99_ms_max must be >= 0")
}
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 { if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
return nil, errors.New("ttft_p99_ms_max must be >= 0") return nil, errors.New("ttft_p99_ms_max must be >= 0")
} }
......
...@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct { ...@@ -63,7 +63,6 @@ type OpsAlertSilencingSettings struct {
type OpsMetricThresholds struct { type OpsMetricThresholds struct {
SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红 SLAPercentMin *float64 `json:"sla_percent_min,omitempty"` // SLA低于此值变红
LatencyP99MsMax *float64 `json:"latency_p99_ms_max,omitempty"` // 延迟P99高于此值变红
TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红 TTFTp99MsMax *float64 `json:"ttft_p99_ms_max,omitempty"` // TTFT P99高于此值变红
RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红 RequestErrorRatePercentMax *float64 `json:"request_error_rate_percent_max,omitempty"` // 请求错误率高于此值变红
UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红 UpstreamErrorRatePercentMax *float64 `json:"upstream_error_rate_percent_max,omitempty"` // 上游错误率高于此值变红
...@@ -82,6 +81,8 @@ type OpsAdvancedSettings struct { ...@@ -82,6 +81,8 @@ type OpsAdvancedSettings struct {
DataRetention OpsDataRetentionSettings `json:"data_retention"` DataRetention OpsDataRetentionSettings `json:"data_retention"`
Aggregation OpsAggregationSettings `json:"aggregation"` Aggregation OpsAggregationSettings `json:"aggregation"`
IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"` IgnoreCountTokensErrors bool `json:"ignore_count_tokens_errors"`
IgnoreContextCanceled bool `json:"ignore_context_canceled"`
IgnoreNoAvailableAccounts bool `json:"ignore_no_available_accounts"`
AutoRefreshEnabled bool `json:"auto_refresh_enabled"` AutoRefreshEnabled bool `json:"auto_refresh_enabled"`
AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"` AutoRefreshIntervalSec int `json:"auto_refresh_interval_seconds"`
} }
......
...@@ -15,6 +15,11 @@ const ( ...@@ -15,6 +15,11 @@ const (
OpsUpstreamErrorMessageKey = "ops_upstream_error_message" OpsUpstreamErrorMessageKey = "ops_upstream_error_message"
OpsUpstreamErrorDetailKey = "ops_upstream_error_detail" OpsUpstreamErrorDetailKey = "ops_upstream_error_detail"
OpsUpstreamErrorsKey = "ops_upstream_errors" OpsUpstreamErrorsKey = "ops_upstream_errors"
// Best-effort capture of the current upstream request body so ops can
// retry the specific upstream attempt (not just the client request).
// This value is sanitized+trimmed before being persisted.
OpsUpstreamRequestBodyKey = "ops_upstream_request_body"
) )
func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) { func setOpsUpstreamError(c *gin.Context, upstreamStatusCode int, upstreamMessage, upstreamDetail string) {
...@@ -40,11 +45,19 @@ type OpsUpstreamErrorEvent struct { ...@@ -40,11 +45,19 @@ type OpsUpstreamErrorEvent struct {
// Context // Context
Platform string `json:"platform,omitempty"` Platform string `json:"platform,omitempty"`
AccountID int64 `json:"account_id,omitempty"` AccountID int64 `json:"account_id,omitempty"`
AccountName string `json:"account_name,omitempty"`
// Outcome // Outcome
UpstreamStatusCode int `json:"upstream_status_code,omitempty"` UpstreamStatusCode int `json:"upstream_status_code,omitempty"`
UpstreamRequestID string `json:"upstream_request_id,omitempty"` UpstreamRequestID string `json:"upstream_request_id,omitempty"`
// Best-effort upstream request capture (sanitized+trimmed).
// Required for retrying a specific upstream attempt.
UpstreamRequestBody string `json:"upstream_request_body,omitempty"`
// Best-effort upstream response capture (sanitized+trimmed).
UpstreamResponseBody string `json:"upstream_response_body,omitempty"`
// Kind: http_error | request_error | retry_exhausted | failover // Kind: http_error | request_error | retry_exhausted | failover
Kind string `json:"kind,omitempty"` Kind string `json:"kind,omitempty"`
...@@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { ...@@ -61,6 +74,8 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
} }
ev.Platform = strings.TrimSpace(ev.Platform) ev.Platform = strings.TrimSpace(ev.Platform)
ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID) ev.UpstreamRequestID = strings.TrimSpace(ev.UpstreamRequestID)
ev.UpstreamRequestBody = strings.TrimSpace(ev.UpstreamRequestBody)
ev.UpstreamResponseBody = strings.TrimSpace(ev.UpstreamResponseBody)
ev.Kind = strings.TrimSpace(ev.Kind) ev.Kind = strings.TrimSpace(ev.Kind)
ev.Message = strings.TrimSpace(ev.Message) ev.Message = strings.TrimSpace(ev.Message)
ev.Detail = strings.TrimSpace(ev.Detail) ev.Detail = strings.TrimSpace(ev.Detail)
...@@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) { ...@@ -68,6 +83,16 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
ev.Message = sanitizeUpstreamErrorMessage(ev.Message) ev.Message = sanitizeUpstreamErrorMessage(ev.Message)
} }
// If the caller didn't explicitly pass upstream request body but the gateway
// stored it on the context, attach it so ops can retry this specific attempt.
if ev.UpstreamRequestBody == "" {
if v, ok := c.Get(OpsUpstreamRequestBodyKey); ok {
if s, ok := v.(string); ok {
ev.UpstreamRequestBody = strings.TrimSpace(s)
}
}
}
var existing []*OpsUpstreamErrorEvent var existing []*OpsUpstreamErrorEvent
if v, ok := c.Get(OpsUpstreamErrorsKey); ok { if v, ok := c.Get(OpsUpstreamErrorsKey); ok {
if arr, ok := v.([]*OpsUpstreamErrorEvent); ok { if arr, ok := v.([]*OpsUpstreamErrorEvent); ok {
...@@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { ...@@ -92,3 +117,15 @@ func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {
s := string(raw) s := string(raw)
return &s return &s
} }
// ParseOpsUpstreamErrors decodes a JSON array of upstream error events.
//
// Blank or whitespace-only input yields an empty, non-nil slice and no error.
// Malformed JSON yields a nil slice together with the decoding error.
// Elements are pointers, so a JSON null inside the array decodes to a nil
// entry that callers must check for.
func ParseOpsUpstreamErrors(raw string) ([]*OpsUpstreamErrorEvent, error) {
	trimmed := strings.TrimSpace(raw)
	if len(trimmed) == 0 {
		return []*OpsUpstreamErrorEvent{}, nil
	}

	var events []*OpsUpstreamErrorEvent
	if decodeErr := json.Unmarshal([]byte(trimmed), &events); decodeErr != nil {
		return nil, decodeErr
	}
	return events, nil
}
-- +goose Up
-- +goose StatementBegin
-- Ops alert silences: scoped (rule_id + platform + group_id + region)
-- rule_id, platform and until are mandatory; group_id, region, reason and
-- created_by are nullable.
-- NOTE(review): a NULL group_id/region presumably means "any group/region",
-- and until presumably is the moment the silence expires — confirm against
-- the query that matches silences.
CREATE TABLE IF NOT EXISTS ops_alert_silences (
id BIGSERIAL PRIMARY KEY,
rule_id BIGINT NOT NULL,
platform VARCHAR(64) NOT NULL,
group_id BIGINT,
region VARCHAR(64),
until TIMESTAMPTZ NOT NULL,
reason TEXT,
created_by BIGINT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Composite index over the four scope columns followed by until.
CREATE INDEX IF NOT EXISTS idx_ops_alert_silences_lookup
ON ops_alert_silences (rule_id, platform, group_id, region, until);
-- +goose StatementEnd
-- +goose Down
-- +goose StatementBegin
-- Rollback: drop the table; IF EXISTS keeps the rollback safe to re-run.
DROP TABLE IF EXISTS ops_alert_silences;
-- +goose StatementEnd
-- Add resolution tracking to ops_error_logs, persist retry results, and standardize error classification enums.
--
-- This migration is intentionally idempotent.
-- Schema changes use IF NOT EXISTS; the data updates below converge to the
-- same values when re-run.
-- NOTE(review): SET LOCAL only takes effect inside a transaction block, so
-- this assumes the migration runner wraps the file in one — confirm.
SET LOCAL lock_timeout = '5s';
SET LOCAL statement_timeout = '10min';
-- ============================================
-- 1) ops_error_logs: resolution fields
-- ============================================
-- resolved defaults to false for existing rows; the other three columns are
-- nullable and stay NULL until a resolution is recorded.
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS resolved BOOLEAN NOT NULL DEFAULT false;
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS resolved_at TIMESTAMPTZ;
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS resolved_by_user_id BIGINT;
ALTER TABLE ops_error_logs
ADD COLUMN IF NOT EXISTS resolved_retry_id BIGINT;
-- Index for listing by resolution state, newest first.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_resolved_time
ON ops_error_logs (resolved, created_at DESC);
-- Partial index that only contains unresolved rows.
CREATE INDEX IF NOT EXISTS idx_ops_error_logs_unresolved_time
ON ops_error_logs (created_at DESC)
WHERE resolved = false;
-- ============================================
-- 2) ops_retry_attempts: persist execution results
-- ============================================
-- All result columns are nullable except response_truncated, which defaults
-- to false.
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS success BOOLEAN;
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS http_status_code INT;
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS upstream_request_id VARCHAR(128);
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS used_account_id BIGINT;
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS response_preview TEXT;
ALTER TABLE ops_retry_attempts
ADD COLUMN IF NOT EXISTS response_truncated BOOLEAN NOT NULL DEFAULT false;
CREATE INDEX IF NOT EXISTS idx_ops_retry_attempts_success_time
ON ops_retry_attempts (success, created_at DESC);
-- Backfill best-effort fields for existing rows.
-- success is derived from status: true only when status equals 'succeeded'
-- (case-insensitive); a NULL status is treated as not succeeded.
UPDATE ops_retry_attempts
SET success = (LOWER(COALESCE(status, '')) = 'succeeded')
WHERE success IS NULL;
-- Copy the legacy result_request_id into the new column where it is unset.
UPDATE ops_retry_attempts
SET upstream_request_id = result_request_id
WHERE upstream_request_id IS NULL AND result_request_id IS NOT NULL;
-- ============================================
-- 3) Standardize classification enums in ops_error_logs
--
-- New enums:
-- error_phase: request|auth|routing|upstream|network|internal
-- error_owner: client|provider|platform
-- error_source: client_request|upstream_http|gateway
-- ============================================
-- Owner: legacy sub2api => platform.
UPDATE ops_error_logs
SET error_owner = 'platform'
WHERE LOWER(COALESCE(error_owner, '')) = 'sub2api';
-- Owner: normalize empty/null to platform (best-effort).
UPDATE ops_error_logs
SET error_owner = 'platform'
WHERE COALESCE(TRIM(error_owner), '') = '';
-- Phase: map legacy phases.
-- This statement has no WHERE clause, so it rewrites every row in the table;
-- values that are already valid are lower-cased, unknown values become
-- 'internal'.
UPDATE ops_error_logs
SET error_phase = CASE
WHEN COALESCE(TRIM(error_phase), '') = '' THEN 'internal'
WHEN LOWER(error_phase) IN ('billing', 'concurrency', 'response') THEN 'request'
WHEN LOWER(error_phase) IN ('scheduling') THEN 'routing'
WHEN LOWER(error_phase) IN ('request', 'auth', 'routing', 'upstream', 'network', 'internal') THEN LOWER(error_phase)
ELSE 'internal'
END;
-- Source: map legacy sources.
-- Like the phase update, this touches every row; unknown values become
-- 'gateway'.
UPDATE ops_error_logs
SET error_source = CASE
WHEN COALESCE(TRIM(error_source), '') = '' THEN 'gateway'
WHEN LOWER(error_source) IN ('billing', 'concurrency') THEN 'client_request'
WHEN LOWER(error_source) IN ('upstream_http') THEN 'upstream_http'
WHEN LOWER(error_source) IN ('upstream_network') THEN 'gateway'
WHEN LOWER(error_source) IN ('internal') THEN 'gateway'
WHEN LOWER(error_source) IN ('client_request', 'upstream_http', 'gateway') THEN LOWER(error_source)
ELSE 'gateway'
END;
-- Auto-resolve recovered upstream errors (client status < 400).
-- Rows with a NULL or zero status_code are left unresolved; an existing
-- resolved_at is kept, otherwise created_at is used as the resolution time.
UPDATE ops_error_logs
SET
resolved = true,
resolved_at = COALESCE(resolved_at, created_at)
WHERE resolved = false AND COALESCE(status_code, 0) > 0 AND COALESCE(status_code, 0) < 400;
...@@ -17,6 +17,47 @@ export interface OpsRequestOptions { ...@@ -17,6 +17,47 @@ export interface OpsRequestOptions {
export interface OpsRetryRequest { export interface OpsRetryRequest {
mode: OpsRetryMode mode: OpsRetryMode
pinned_account_id?: number pinned_account_id?: number
force?: boolean
}
/**
 * One retry attempt for an ops error log, as returned by listRetryAttempts.
 *
 * Fields typed `| null` may be sent as null by the API; fields marked `?`
 * may be absent from the payload altogether.
 */
export interface OpsRetryAttempt {
id: number
created_at: string
requested_by_user_id: number
// ID of the ops error log this attempt retried.
source_error_id: number
// Typed as a plain string rather than OpsRetryMode.
mode: string
pinned_account_id?: number | null
pinned_account_name?: string
status: string
started_at?: string | null
finished_at?: string | null
duration_ms?: number | null
// Execution result of the retry.
success?: boolean | null
http_status_code?: number | null
upstream_request_id?: string | null
used_account_id?: number | null
used_account_name?: string
response_preview?: string | null
response_truncated?: boolean | null
result_request_id?: string | null
result_error_id?: number | null
error_message?: string | null
}
export type OpsUpstreamErrorEvent = {
at_unix_ms?: number
platform?: string
account_id?: number
account_name?: string
upstream_status_code?: number
upstream_request_id?: string
upstream_request_body?: string
kind?: string
message?: string
detail?: string
} }
export interface OpsRetryResult { export interface OpsRetryResult {
...@@ -626,8 +667,6 @@ export type MetricType = ...@@ -626,8 +667,6 @@ export type MetricType =
| 'success_rate' | 'success_rate'
| 'error_rate' | 'error_rate'
| 'upstream_error_rate' | 'upstream_error_rate'
| 'p95_latency_ms'
| 'p99_latency_ms'
| 'cpu_usage_percent' | 'cpu_usage_percent'
| 'memory_usage_percent' | 'memory_usage_percent'
| 'concurrency_queue_depth' | 'concurrency_queue_depth'
...@@ -663,7 +702,7 @@ export interface AlertEvent { ...@@ -663,7 +702,7 @@ export interface AlertEvent {
id: number id: number
rule_id: number rule_id: number
severity: OpsSeverity | string severity: OpsSeverity | string
status: 'firing' | 'resolved' | string status: 'firing' | 'resolved' | 'manual_resolved' | string
title?: string title?: string
description?: string description?: string
metric_value?: number metric_value?: number
...@@ -702,7 +741,6 @@ export interface EmailNotificationConfig { ...@@ -702,7 +741,6 @@ export interface EmailNotificationConfig {
export interface OpsMetricThresholds { export interface OpsMetricThresholds {
sla_percent_min?: number | null // SLA低于此值变红 sla_percent_min?: number | null // SLA低于此值变红
latency_p99_ms_max?: number | null // 延迟P99高于此值变红
ttft_p99_ms_max?: number | null // TTFT P99高于此值变红 ttft_p99_ms_max?: number | null // TTFT P99高于此值变红
request_error_rate_percent_max?: number | null // 请求错误率高于此值变红 request_error_rate_percent_max?: number | null // 请求错误率高于此值变红
upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红 upstream_error_rate_percent_max?: number | null // 上游错误率高于此值变红
...@@ -735,6 +773,8 @@ export interface OpsAdvancedSettings { ...@@ -735,6 +773,8 @@ export interface OpsAdvancedSettings {
data_retention: OpsDataRetentionSettings data_retention: OpsDataRetentionSettings
aggregation: OpsAggregationSettings aggregation: OpsAggregationSettings
ignore_count_tokens_errors: boolean ignore_count_tokens_errors: boolean
ignore_context_canceled: boolean
ignore_no_available_accounts: boolean
auto_refresh_enabled: boolean auto_refresh_enabled: boolean
auto_refresh_interval_seconds: number auto_refresh_interval_seconds: number
} }
...@@ -754,21 +794,37 @@ export interface OpsAggregationSettings { ...@@ -754,21 +794,37 @@ export interface OpsAggregationSettings {
export interface OpsErrorLog { export interface OpsErrorLog {
id: number id: number
created_at: string created_at: string
// Standardized classification
phase: OpsPhase phase: OpsPhase
type: string type: string
error_owner: 'client' | 'provider' | 'platform' | string
error_source: 'client_request' | 'upstream_http' | 'gateway' | string
severity: OpsSeverity severity: OpsSeverity
status_code: number status_code: number
platform: string platform: string
model: string model: string
latency_ms?: number | null
is_retryable: boolean
retry_count: number
resolved: boolean
resolved_at?: string | null
resolved_by_user_id?: number | null
resolved_retry_id?: number | null
client_request_id: string client_request_id: string
request_id: string request_id: string
message: string message: string
user_id?: number | null user_id?: number | null
user_email: string
api_key_id?: number | null api_key_id?: number | null
account_id?: number | null account_id?: number | null
account_name: string
group_id?: number | null group_id?: number | null
group_name: string
client_ip?: string | null client_ip?: string | null
request_path?: string request_path?: string
...@@ -890,7 +946,9 @@ export async function getErrorDistribution( ...@@ -890,7 +946,9 @@ export async function getErrorDistribution(
return data return data
} }
export async function listErrorLogs(params: { export type OpsErrorListView = 'errors' | 'excluded' | 'all'
export type OpsErrorListQueryParams = {
page?: number page?: number
page_size?: number page_size?: number
time_range?: string time_range?: string
...@@ -899,10 +957,20 @@ export async function listErrorLogs(params: { ...@@ -899,10 +957,20 @@ export async function listErrorLogs(params: {
platform?: string platform?: string
group_id?: number | null group_id?: number | null
account_id?: number | null account_id?: number | null
phase?: string phase?: string
error_owner?: string
error_source?: string
resolved?: string
view?: OpsErrorListView
q?: string q?: string
status_codes?: string status_codes?: string
}): Promise<OpsErrorLogsResponse> { status_codes_other?: string
}
// Legacy unified endpoints
/**
 * Lists error logs through the unified endpoint.
 *
 * Issues GET /admin/ops/errors, handing `params` to the client as the
 * request's `params` option, and resolves with the response's `data` payload.
 */
export async function listErrorLogs(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params }) const { data } = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/errors', { params })
return data return data
} }
...@@ -917,6 +985,70 @@ export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promi ...@@ -917,6 +985,70 @@ export async function retryErrorRequest(id: number, req: OpsRetryRequest): Promi
return data return data
} }
/**
 * Lists the retry attempts recorded for one error log.
 *
 * Issues GET /admin/ops/errors/{errorId}/retries with `limit` as a query
 * parameter (defaults to 50) and resolves with the response payload.
 */
export async function listRetryAttempts(errorId: number, limit = 50): Promise<OpsRetryAttempt[]> {
  const url = `/admin/ops/errors/${errorId}/retries`
  const response = await apiClient.get<OpsRetryAttempt[]>(url, { params: { limit } })
  return response.data
}
/**
 * Sets the resolved flag of an error log through the unified endpoint.
 *
 * Issues PUT /admin/ops/errors/{errorId}/resolve with body `{ resolved }`;
 * the response payload is discarded.
 */
export async function updateErrorResolved(errorId: number, resolved: boolean): Promise<void> {
  const url = `/admin/ops/errors/${errorId}/resolve`
  const body = { resolved }
  await apiClient.put(url, body)
}
// New split endpoints
/**
 * Lists request errors through the split endpoint.
 *
 * Issues GET /admin/ops/request-errors with `params` as the request's
 * `params` option and resolves with the response payload.
 */
export async function listRequestErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/request-errors', { params })
  return response.data
}
/**
 * Lists upstream errors through the split endpoint.
 *
 * Issues GET /admin/ops/upstream-errors with `params` as the request's
 * `params` option and resolves with the response payload.
 */
export async function listUpstreamErrors(params: OpsErrorListQueryParams): Promise<OpsErrorLogsResponse> {
  const response = await apiClient.get<OpsErrorLogsResponse>('/admin/ops/upstream-errors', { params })
  return response.data
}
/**
 * Loads the detail record of a single request error.
 *
 * Issues GET /admin/ops/request-errors/{id} and resolves with the response payload.
 */
export async function getRequestErrorDetail(id: number): Promise<OpsErrorDetail> {
  const url = `/admin/ops/request-errors/${id}`
  const response = await apiClient.get<OpsErrorDetail>(url)
  return response.data
}
/**
 * Loads the detail record of a single upstream error.
 *
 * Issues GET /admin/ops/upstream-errors/{id} and resolves with the response payload.
 */
export async function getUpstreamErrorDetail(id: number): Promise<OpsErrorDetail> {
  const url = `/admin/ops/upstream-errors/${id}`
  const response = await apiClient.get<OpsErrorDetail>(url)
  return response.data
}
/**
 * Triggers a client-mode retry for a request error.
 *
 * Issues POST /admin/ops/request-errors/{id}/retry-client with an empty
 * JSON object as the body and resolves with the response payload.
 */
export async function retryRequestErrorClient(id: number): Promise<OpsRetryResult> {
  const url = `/admin/ops/request-errors/${id}/retry-client`
  const response = await apiClient.post<OpsRetryResult>(url, {})
  return response.data
}
/**
 * Triggers a retry for one upstream error entry of a request error.
 *
 * Issues POST /admin/ops/request-errors/{id}/upstream-errors/{idx}/retry with
 * an empty JSON object as the body and resolves with the response payload.
 */
export async function retryRequestErrorUpstreamEvent(id: number, idx: number): Promise<OpsRetryResult> {
  const url = `/admin/ops/request-errors/${id}/upstream-errors/${idx}/retry`
  const response = await apiClient.post<OpsRetryResult>(url, {})
  return response.data
}
/**
 * Triggers a retry for an upstream error.
 *
 * Issues POST /admin/ops/upstream-errors/{id}/retry with an empty JSON object
 * as the body and resolves with the response payload.
 */
export async function retryUpstreamError(id: number): Promise<OpsRetryResult> {
  const url = `/admin/ops/upstream-errors/${id}/retry`
  const response = await apiClient.post<OpsRetryResult>(url, {})
  return response.data
}
/**
 * Sets the resolved flag of a request error.
 *
 * Issues PUT /admin/ops/request-errors/{errorId}/resolve with body
 * `{ resolved }`; the response payload is discarded.
 */
export async function updateRequestErrorResolved(errorId: number, resolved: boolean): Promise<void> {
  const url = `/admin/ops/request-errors/${errorId}/resolve`
  const body = { resolved }
  await apiClient.put(url, body)
}
/**
 * Sets the resolved flag of an upstream error.
 *
 * Issues PUT /admin/ops/upstream-errors/{errorId}/resolve with body
 * `{ resolved }`; the response payload is discarded.
 */
export async function updateUpstreamErrorResolved(errorId: number, resolved: boolean): Promise<void> {
  const url = `/admin/ops/upstream-errors/${errorId}/resolve`
  const body = { resolved }
  await apiClient.put(url, body)
}
/**
 * Lists the upstream errors attached to one request error.
 *
 * Issues GET /admin/ops/request-errors/{id}/upstream-errors and resolves with
 * the response payload.
 *
 * @param id      Identifier interpolated into the request path.
 * @param params  List filters, copied into the query (the caller's object is not mutated).
 * @param options When `include_detail` is truthy, `include_detail=1` is added to the query;
 *                otherwise the key is omitted entirely.
 */
export async function listRequestErrorUpstreamErrors(
  id: number,
  params: OpsErrorListQueryParams = {},
  options: { include_detail?: boolean } = {}
): Promise<PaginatedResponse<OpsErrorDetail>> {
  // Precisely typed query instead of Record<string, any>, so filter keys stay
  // type-checked and only the literal '1' can be assigned to include_detail.
  const query: OpsErrorListQueryParams & { include_detail?: '1' } = { ...params }
  if (options.include_detail) query.include_detail = '1'
  const { data } = await apiClient.get<PaginatedResponse<OpsErrorDetail>>(
    `/admin/ops/request-errors/${id}/upstream-errors`,
    { params: query }
  )
  return data
}
export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> { export async function listRequestDetails(params: OpsRequestDetailsParams): Promise<OpsRequestDetailsResponse> {
const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params }) const { data } = await apiClient.get<OpsRequestDetailsResponse>('/admin/ops/requests', { params })
return data return data
...@@ -942,11 +1074,45 @@ export async function deleteAlertRule(id: number): Promise<void> { ...@@ -942,11 +1074,45 @@ export async function deleteAlertRule(id: number): Promise<void> {
await apiClient.delete(`/admin/ops/alert-rules/${id}`) await apiClient.delete(`/admin/ops/alert-rules/${id}`)
} }
export async function listAlertEvents(limit = 100): Promise<AlertEvent[]> { export interface AlertEventsQuery {
const { data } = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params: { limit } }) limit?: number
status?: string
severity?: string
email_sent?: boolean
time_range?: string
start_time?: string
end_time?: string
before_fired_at?: string
before_id?: number
platform?: string
group_id?: number
}
/**
 * Lists alert events.
 *
 * Issues GET /admin/ops/alert-events with `params` (an empty object when
 * omitted) as the request's `params` option and resolves with the response payload.
 */
export async function listAlertEvents(params: AlertEventsQuery = {}): Promise<AlertEvent[]> {
  const response = await apiClient.get<AlertEvent[]>('/admin/ops/alert-events', { params })
  return response.data
}
/**
 * Loads a single alert event.
 *
 * Issues GET /admin/ops/alert-events/{id} and resolves with the response's
 * `data` payload.
 */
export async function getAlertEvent(id: number): Promise<AlertEvent> {
const { data } = await apiClient.get<AlertEvent>(`/admin/ops/alert-events/${id}`)
return data return data
} }
/**
 * Updates the status of an alert event.
 *
 * Issues PUT /admin/ops/alert-events/{id}/status with body `{ status }`, where
 * `status` is either 'resolved' or 'manual_resolved'; the response payload is discarded.
 */
export async function updateAlertEventStatus(id: number, status: 'resolved' | 'manual_resolved'): Promise<void> {
  const url = `/admin/ops/alert-events/${id}/status`
  const body = { status }
  await apiClient.put(url, body)
}
/**
 * Creates an alert silence.
 *
 * Issues POST /admin/ops/alert-silences with `payload` sent unchanged as the
 * request body; the response payload is discarded.
 */
export async function createAlertSilence(payload: {
  rule_id: number
  platform: string
  group_id?: number | null
  region?: string | null
  until: string
  reason?: string
}): Promise<void> {
  const endpoint = '/admin/ops/alert-silences'
  await apiClient.post(endpoint, payload)
}
// Email notification config // Email notification config
export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> { export async function getEmailNotificationConfig(): Promise<EmailNotificationConfig> {
const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config') const { data } = await apiClient.get<EmailNotificationConfig>('/admin/ops/email-notification/config')
...@@ -1001,15 +1167,35 @@ export const opsAPI = { ...@@ -1001,15 +1167,35 @@ export const opsAPI = {
getAccountAvailabilityStats, getAccountAvailabilityStats,
getRealtimeTrafficSummary, getRealtimeTrafficSummary,
subscribeQPS, subscribeQPS,
// Legacy unified endpoints
listErrorLogs, listErrorLogs,
getErrorLogDetail, getErrorLogDetail,
retryErrorRequest, retryErrorRequest,
listRetryAttempts,
updateErrorResolved,
// New split endpoints
listRequestErrors,
listUpstreamErrors,
getRequestErrorDetail,
getUpstreamErrorDetail,
retryRequestErrorClient,
retryRequestErrorUpstreamEvent,
retryUpstreamError,
updateRequestErrorResolved,
updateUpstreamErrorResolved,
listRequestErrorUpstreamErrors,
listRequestDetails, listRequestDetails,
listAlertRules, listAlertRules,
createAlertRule, createAlertRule,
updateAlertRule, updateAlertRule,
deleteAlertRule, deleteAlertRule,
listAlertEvents, listAlertEvents,
getAlertEvent,
updateAlertEventStatus,
createAlertSilence,
getEmailNotificationConfig, getEmailNotificationConfig,
updateEmailNotificationConfig, updateEmailNotificationConfig,
getAlertRuntimeSettings, getAlertRuntimeSettings,
......
...@@ -129,6 +129,8 @@ export default { ...@@ -129,6 +129,8 @@ export default {
all: 'All', all: 'All',
none: 'None', none: 'None',
noData: 'No data', noData: 'No data',
expand: 'Expand',
collapse: 'Collapse',
success: 'Success', success: 'Success',
error: 'Error', error: 'Error',
critical: 'Critical', critical: 'Critical',
...@@ -155,7 +157,8 @@ export default { ...@@ -155,7 +157,8 @@ export default {
noGroupsAvailable: 'No groups available', noGroupsAvailable: 'No groups available',
unknownError: 'Unknown error occurred', unknownError: 'Unknown error occurred',
saving: 'Saving...', saving: 'Saving...',
selectedCount: '({count} selected)', refresh: 'Refresh', selectedCount: '({count} selected)',
refresh: 'Refresh',
settings: 'Settings', settings: 'Settings',
notAvailable: 'N/A', notAvailable: 'N/A',
now: 'Now', now: 'Now',
...@@ -1882,10 +1885,8 @@ export default { ...@@ -1882,10 +1885,8 @@ export default {
noSystemMetrics: 'No system metrics collected yet.', noSystemMetrics: 'No system metrics collected yet.',
collectedAt: 'Collected at:', collectedAt: 'Collected at:',
window: 'window', window: 'window',
cpu: 'CPU',
memory: 'Memory', memory: 'Memory',
db: 'DB', db: 'DB',
redis: 'Redis',
goroutines: 'Goroutines', goroutines: 'Goroutines',
jobs: 'Jobs', jobs: 'Jobs',
jobsHelp: 'Click “Details” to view job heartbeats and recent errors', jobsHelp: 'Click “Details” to view job heartbeats and recent errors',
...@@ -1911,7 +1912,7 @@ export default { ...@@ -1911,7 +1912,7 @@ export default {
totalRequests: 'Total Requests', totalRequests: 'Total Requests',
avgQps: 'Avg QPS', avgQps: 'Avg QPS',
avgTps: 'Avg TPS', avgTps: 'Avg TPS',
avgLatency: 'Avg Latency', avgLatency: 'Avg Request Duration',
avgTtft: 'Avg TTFT', avgTtft: 'Avg TTFT',
exceptions: 'Exceptions', exceptions: 'Exceptions',
requestErrors: 'Request Errors', requestErrors: 'Request Errors',
...@@ -1923,7 +1924,7 @@ export default { ...@@ -1923,7 +1924,7 @@ export default {
errors: 'Errors', errors: 'Errors',
errorRate: 'error_rate:', errorRate: 'error_rate:',
upstreamRate: 'upstream_rate:', upstreamRate: 'upstream_rate:',
latencyDuration: 'Latency (duration_ms)', latencyDuration: 'Request Duration (ms)',
ttftLabel: 'TTFT (first_token_ms)', ttftLabel: 'TTFT (first_token_ms)',
p50: 'p50:', p50: 'p50:',
p90: 'p90:', p90: 'p90:',
...@@ -1931,7 +1932,6 @@ export default { ...@@ -1931,7 +1932,6 @@ export default {
p99: 'p99:', p99: 'p99:',
avg: 'avg:', avg: 'avg:',
max: 'max:', max: 'max:',
qps: 'QPS',
requests: 'Requests', requests: 'Requests',
requestsTitle: 'Requests', requestsTitle: 'Requests',
upstream: 'Upstream', upstream: 'Upstream',
...@@ -1943,7 +1943,7 @@ export default { ...@@ -1943,7 +1943,7 @@ export default {
failedToLoadData: 'Failed to load ops data.', failedToLoadData: 'Failed to load ops data.',
failedToLoadOverview: 'Failed to load overview', failedToLoadOverview: 'Failed to load overview',
failedToLoadThroughputTrend: 'Failed to load throughput trend', failedToLoadThroughputTrend: 'Failed to load throughput trend',
failedToLoadLatencyHistogram: 'Failed to load latency histogram', failedToLoadLatencyHistogram: 'Failed to load request duration histogram',
failedToLoadErrorTrend: 'Failed to load error trend', failedToLoadErrorTrend: 'Failed to load error trend',
failedToLoadErrorDistribution: 'Failed to load error distribution', failedToLoadErrorDistribution: 'Failed to load error distribution',
failedToLoadErrorDetail: 'Failed to load error detail', failedToLoadErrorDetail: 'Failed to load error detail',
...@@ -1951,7 +1951,7 @@ export default { ...@@ -1951,7 +1951,7 @@ export default {
tpsK: 'TPS (K)', tpsK: 'TPS (K)',
top: 'Top:', top: 'Top:',
throughputTrend: 'Throughput Trend', throughputTrend: 'Throughput Trend',
latencyHistogram: 'Latency Histogram', latencyHistogram: 'Request Duration Histogram',
errorTrend: 'Error Trend', errorTrend: 'Error Trend',
errorDistribution: 'Error Distribution', errorDistribution: 'Error Distribution',
// Health Score & Diagnosis // Health Score & Diagnosis
...@@ -1966,7 +1966,9 @@ export default { ...@@ -1966,7 +1966,9 @@ export default {
'30m': 'Last 30 minutes', '30m': 'Last 30 minutes',
'1h': 'Last 1 hour', '1h': 'Last 1 hour',
'6h': 'Last 6 hours', '6h': 'Last 6 hours',
'24h': 'Last 24 hours' '24h': 'Last 24 hours',
'7d': 'Last 7 days',
'30d': 'Last 30 days'
}, },
fullscreen: { fullscreen: {
enter: 'Enter Fullscreen' enter: 'Enter Fullscreen'
...@@ -1995,14 +1997,7 @@ export default { ...@@ -1995,14 +1997,7 @@ export default {
memoryHigh: 'Memory usage elevated ({usage}%)', memoryHigh: 'Memory usage elevated ({usage}%)',
memoryHighImpact: 'Memory pressure is high, needs attention', memoryHighImpact: 'Memory pressure is high, needs attention',
memoryHighAction: 'Monitor memory trends, check for memory leaks', memoryHighAction: 'Monitor memory trends, check for memory leaks',
// Latency diagnostics ttftHigh: 'Time to first token elevated ({ttft}ms)',
latencyCritical: 'Response latency critically high ({latency}ms)',
latencyCriticalImpact: 'User experience extremely poor, many requests timing out',
latencyCriticalAction: 'Check slow queries, database indexes, network latency, and upstream services',
latencyHigh: 'Response latency elevated ({latency}ms)',
latencyHighImpact: 'User experience degraded, needs optimization',
latencyHighAction: 'Analyze slow request logs, optimize database queries and business logic',
ttftHigh: 'Time to first byte elevated ({ttft}ms)',
ttftHighImpact: 'User perceived latency increased', ttftHighImpact: 'User perceived latency increased',
ttftHighAction: 'Optimize request processing flow, reduce pre-processing time', ttftHighAction: 'Optimize request processing flow, reduce pre-processing time',
// Error rate diagnostics // Error rate diagnostics
...@@ -2038,27 +2033,106 @@ export default { ...@@ -2038,27 +2033,106 @@ export default {
// Error Log // Error Log
errorLog: { errorLog: {
timeId: 'Time / ID', timeId: 'Time / ID',
commonErrors: {
contextDeadlineExceeded: 'context deadline exceeded',
connectionRefused: 'connection refused',
rateLimit: 'rate limit'
},
time: 'Time',
type: 'Type',
context: 'Context', context: 'Context',
platform: 'Platform',
model: 'Model',
group: 'Group',
user: 'User',
userId: 'User ID',
account: 'Account',
accountId: 'Account ID',
status: 'Status', status: 'Status',
message: 'Message', message: 'Message',
latency: 'Latency', latency: 'Request Duration',
action: 'Action', action: 'Action',
noErrors: 'No errors in this window.', noErrors: 'No errors in this window.',
grp: 'GRP:', grp: 'GRP:',
acc: 'ACC:', acc: 'ACC:',
details: 'Details', details: 'Details',
phase: 'Phase' phase: 'Phase',
id: 'ID:',
typeUpstream: 'Upstream',
typeRequest: 'Request',
typeAuth: 'Auth',
typeRouting: 'Routing',
typeInternal: 'Internal'
}, },
// Error Details Modal // Error Details Modal
errorDetails: { errorDetails: {
upstreamErrors: 'Upstream Errors', upstreamErrors: 'Upstream Errors',
requestErrors: 'Request Errors', requestErrors: 'Request Errors',
unresolved: 'Unresolved',
resolved: 'Resolved',
viewErrors: 'Errors',
viewExcluded: 'Excluded',
statusCodeOther: 'Other',
owner: {
provider: 'Provider',
client: 'Client',
platform: 'Platform'
},
phase: {
request: 'Request',
auth: 'Auth',
routing: 'Routing',
upstream: 'Upstream',
network: 'Network',
internal: 'Internal'
},
total: 'Total:', total: 'Total:',
searchPlaceholder: 'Search request_id / client_request_id / message', searchPlaceholder: 'Search request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
}, },
// Error Detail Modal // Error Detail Modal
errorDetail: { errorDetail: {
title: 'Error Detail',
titleWithId: 'Error #{id}',
noErrorSelected: 'No error selected.',
resolution: 'Resolved:',
pinnedToOriginalAccountId: 'Pinned to original account_id',
missingUpstreamRequestBody: 'Missing upstream request body',
failedToLoadRetryHistory: 'Failed to load retry history',
failedToUpdateResolvedStatus: 'Failed to update resolved status',
unsupportedRetryMode: 'Unsupported retry mode',
classificationKeys: {
phase: 'Phase',
owner: 'Owner',
source: 'Source',
retryable: 'Retryable',
resolvedAt: 'Resolved At',
resolvedBy: 'Resolved By',
resolvedRetryId: 'Resolved Retry',
retryCount: 'Retry Count'
},
source: {
upstream_http: 'Upstream HTTP'
},
upstreamKeys: {
status: 'Status',
message: 'Message',
detail: 'Detail',
upstreamErrors: 'Upstream Errors'
},
upstreamEvent: {
account: 'Account',
status: 'Status',
requestId: 'Request ID'
},
responsePreview: {
expand: 'Response (click to expand)',
collapse: 'Response (click to collapse)'
},
retryMeta: {
used: 'Used',
success: 'Success',
pinned: 'Pinned'
},
loading: 'Loading…', loading: 'Loading…',
requestId: 'Request ID', requestId: 'Request ID',
time: 'Time', time: 'Time',
...@@ -2068,8 +2142,10 @@ export default { ...@@ -2068,8 +2142,10 @@ export default {
basicInfo: 'Basic Info', basicInfo: 'Basic Info',
platform: 'Platform', platform: 'Platform',
model: 'Model', model: 'Model',
latency: 'Latency', group: 'Group',
ttft: 'TTFT', user: 'User',
account: 'Account',
latency: 'Request Duration',
businessLimited: 'Business Limited', businessLimited: 'Business Limited',
requestPath: 'Request Path', requestPath: 'Request Path',
timings: 'Timings', timings: 'Timings',
...@@ -2077,6 +2153,8 @@ export default { ...@@ -2077,6 +2153,8 @@ export default {
routing: 'Routing', routing: 'Routing',
upstream: 'Upstream', upstream: 'Upstream',
response: 'Response', response: 'Response',
classification: 'Classification',
notRetryable: 'Not recommended to retry',
retry: 'Retry', retry: 'Retry',
retryClient: 'Retry (Client)', retryClient: 'Retry (Client)',
retryUpstream: 'Retry (Upstream pinned)', retryUpstream: 'Retry (Upstream pinned)',
...@@ -2088,7 +2166,6 @@ export default { ...@@ -2088,7 +2166,6 @@ export default {
confirmRetry: 'Confirm Retry', confirmRetry: 'Confirm Retry',
retrySuccess: 'Retry succeeded', retrySuccess: 'Retry succeeded',
retryFailed: 'Retry failed', retryFailed: 'Retry failed',
na: 'N/A',
retryHint: 'Retry will resend the request with the same parameters', retryHint: 'Retry will resend the request with the same parameters',
retryClientHint: 'Use client retry (no account pinning)', retryClientHint: 'Use client retry (no account pinning)',
retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)', retryUpstreamHint: 'Use upstream pinned retry (pin to the error account)',
...@@ -2096,8 +2173,33 @@ export default { ...@@ -2096,8 +2173,33 @@ export default {
retryNote1: 'Retry will use the same request body and parameters', retryNote1: 'Retry will use the same request body and parameters',
retryNote2: 'If the original request failed due to account issues, pinned retry may still fail', retryNote2: 'If the original request failed due to account issues, pinned retry may still fail',
retryNote3: 'Client retry will reselect an account', retryNote3: 'Client retry will reselect an account',
retryNote4: 'You can force retry for non-retryable errors, but it is not recommended',
confirmRetryMessage: 'Confirm retry this request?', confirmRetryMessage: 'Confirm retry this request?',
confirmRetryHint: 'Will resend with the same request parameters' confirmRetryHint: 'Will resend with the same request parameters',
forceRetry: 'I understand and want to force retry',
forceRetryHint: 'This error usually cannot be fixed by retry; check to proceed',
forceRetryNeedAck: 'Please check to force retry',
markResolved: 'Mark resolved',
markUnresolved: 'Mark unresolved',
viewRetries: 'Retry history',
retryHistory: 'Retry History',
tabOverview: 'Overview',
tabRetries: 'Retries',
tabRequest: 'Request',
tabResponse: 'Response',
responseBody: 'Response',
compareA: 'Compare A',
compareB: 'Compare B',
retrySummary: 'Retry Summary',
responseHintSucceeded: 'Showing succeeded retry response_preview (#{id})',
responseHintFallback: 'No succeeded retry found; showing stored error_body',
suggestion: 'Suggestion',
suggestUpstreamResolved: '✓ Upstream error resolved by retry; no action needed',
suggestUpstream: 'Upstream instability: check account status, consider switching accounts, or retry',
suggestRequest: 'Client request error: ask customer to fix request parameters',
suggestAuth: 'Auth failed: verify API key/credentials',
suggestPlatform: 'Platform error: prioritize investigation and fix',
suggestGeneric: 'See details for more context'
}, },
requestDetails: { requestDetails: {
title: 'Request Details', title: 'Request Details',
...@@ -2133,13 +2235,46 @@ export default { ...@@ -2133,13 +2235,46 @@ export default {
loading: 'Loading...', loading: 'Loading...',
empty: 'No alert events', empty: 'No alert events',
loadFailed: 'Failed to load alert events', loadFailed: 'Failed to load alert events',
status: {
firing: 'FIRING',
resolved: 'RESOLVED',
manualResolved: 'MANUAL RESOLVED'
},
detail: {
title: 'Alert Detail',
loading: 'Loading detail...',
empty: 'No detail',
loadFailed: 'Failed to load alert detail',
manualResolve: 'Mark as Resolved',
manualResolvedSuccess: 'Marked as manually resolved',
manualResolvedFailed: 'Failed to mark as manually resolved',
silence: 'Ignore Alert',
silenceSuccess: 'Alert silenced',
silenceFailed: 'Failed to silence alert',
viewRule: 'View Rule',
viewLogs: 'View Logs',
firedAt: 'Fired At',
resolvedAt: 'Resolved At',
ruleId: 'Rule ID',
dimensions: 'Dimensions',
historyTitle: 'History',
historyHint: 'Recent events with same rule + dimensions',
historyLoading: 'Loading history...',
historyEmpty: 'No history'
},
table: { table: {
time: 'Time', time: 'Time',
status: 'Status', status: 'Status',
severity: 'Severity', severity: 'Severity',
platform: 'Platform',
ruleId: 'Rule ID',
title: 'Title', title: 'Title',
duration: 'Duration',
metric: 'Metric / Threshold', metric: 'Metric / Threshold',
email: 'Email Sent' dimensions: 'Dimensions',
email: 'Email Sent',
emailSent: 'Sent',
emailIgnored: 'Ignored'
} }
}, },
alertRules: { alertRules: {
...@@ -2253,7 +2388,6 @@ export default { ...@@ -2253,7 +2388,6 @@ export default {
title: 'Alert Silencing (Maintenance Mode)', title: 'Alert Silencing (Maintenance Mode)',
enabled: 'Enable silencing', enabled: 'Enable silencing',
globalUntil: 'Silence until (RFC3339)', globalUntil: 'Silence until (RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).', untilHint: 'Leave empty to only toggle silencing without an expiry (not recommended).',
reason: 'Reason', reason: 'Reason',
reasonPlaceholder: 'e.g., planned maintenance', reasonPlaceholder: 'e.g., planned maintenance',
...@@ -2293,7 +2427,11 @@ export default { ...@@ -2293,7 +2427,11 @@ export default {
lockKeyRequired: 'Distributed lock key is required when lock is enabled', lockKeyRequired: 'Distributed lock key is required when lock is enabled',
lockKeyPrefix: 'Distributed lock key must start with "{prefix}"', lockKeyPrefix: 'Distributed lock key must start with "{prefix}"',
lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts', lockKeyHint: 'Recommended: start with "{prefix}" to avoid conflicts',
lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds' lockTtlRange: 'Distributed lock TTL must be between 1 and 86400 seconds',
slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
} }
}, },
email: { email: {
...@@ -2358,8 +2496,6 @@ export default { ...@@ -2358,8 +2496,6 @@ export default {
metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red', metricThresholdsHint: 'Configure alert thresholds for metrics, values exceeding thresholds will be displayed in red',
slaMinPercent: 'SLA Minimum Percentage', slaMinPercent: 'SLA Minimum Percentage',
slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)', slaMinPercentHint: 'SLA below this value will be displayed in red (default: 99.5%)',
latencyP99MaxMs: 'Latency P99 Maximum (ms)',
latencyP99MaxMsHint: 'Latency P99 above this value will be displayed in red (default: 2000ms)',
ttftP99MaxMs: 'TTFT P99 Maximum (ms)', ttftP99MaxMs: 'TTFT P99 Maximum (ms)',
ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)', ttftP99MaxMsHint: 'TTFT P99 above this value will be displayed in red (default: 500ms)',
requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)', requestErrorRateMaxPercent: 'Request Error Rate Maximum (%)',
...@@ -2378,9 +2514,28 @@ export default { ...@@ -2378,9 +2514,28 @@ export default {
aggregation: 'Pre-aggregation Tasks', aggregation: 'Pre-aggregation Tasks',
enableAggregation: 'Enable Pre-aggregation', enableAggregation: 'Enable Pre-aggregation',
aggregationHint: 'Pre-aggregation improves query performance for long time windows', aggregationHint: 'Pre-aggregation improves query performance for long time windows',
errorFiltering: 'Error Filtering',
ignoreCountTokensErrors: 'Ignore count_tokens errors',
ignoreCountTokensErrorsHint: 'When enabled, errors from count_tokens requests will not be written to the error log.',
ignoreContextCanceled: 'Ignore client disconnect errors',
ignoreContextCanceledHint: 'When enabled, client disconnect (context canceled) errors will not be written to the error log.',
ignoreNoAvailableAccounts: 'Ignore no available accounts errors',
ignoreNoAvailableAccountsHint: 'When enabled, "No available accounts" errors will not be written to the error log (not recommended; usually a config issue).',
autoRefresh: 'Auto Refresh',
enableAutoRefresh: 'Enable auto refresh',
enableAutoRefreshHint: 'Automatically refresh dashboard data at a fixed interval.',
refreshInterval: 'Refresh Interval',
refreshInterval15s: '15 seconds',
refreshInterval30s: '30 seconds',
refreshInterval60s: '60 seconds',
autoRefreshCountdown: 'Auto refresh: {seconds}s',
validation: { validation: {
title: 'Please fix the following issues', title: 'Please fix the following issues',
retentionDaysRange: 'Retention days must be between 1-365 days' retentionDaysRange: 'Retention days must be between 1-365 days',
slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
upstreamErrorRateMaxRange: 'Upstream error rate maximum must be between 0 and 100'
} }
}, },
concurrency: { concurrency: {
...@@ -2418,7 +2573,7 @@ export default { ...@@ -2418,7 +2573,7 @@ export default {
tooltips: { tooltips: {
totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.', totalRequests: 'Total number of requests (including both successful and failed requests) in the selected time window.',
throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.', throughputTrend: 'Requests/QPS + Tokens/TPS in the selected window.',
latencyHistogram: 'Latency distribution (duration_ms) for successful requests.', latencyHistogram: 'Request duration distribution (ms) for successful requests.',
errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).', errorTrend: 'Error counts over time (SLA scope excludes business limits; upstream excludes 429/529).',
errorDistribution: 'Error distribution by status code.', errorDistribution: 'Error distribution by status code.',
goroutines: goroutines:
...@@ -2433,7 +2588,7 @@ export default { ...@@ -2433,7 +2588,7 @@ export default {
sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).', sla: 'Service Level Agreement success rate, excluding business limits (e.g., insufficient balance, quota exceeded).',
errors: 'Error statistics, including total errors, error rate, and upstream error rate.', errors: 'Error statistics, including total errors, error rate, and upstream error rate.',
upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).', upstreamErrors: 'Upstream error statistics, excluding rate limit errors (429/529).',
latency: 'Request latency statistics, including p50, p90, p95, p99 percentiles.', latency: 'Request duration statistics, including p50, p90, p95, p99 percentiles.',
ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.', ttft: 'Time To First Token, measuring the speed of first byte return in streaming responses.',
health: 'System health score (0-100), considering SLA, error rate, and resource usage.' health: 'System health score (0-100), considering SLA, error rate, and resource usage.'
}, },
......
...@@ -126,6 +126,8 @@ export default { ...@@ -126,6 +126,8 @@ export default {
all: '全部', all: '全部',
none: '', none: '',
noData: '暂无数据', noData: '暂无数据',
expand: '展开',
collapse: '收起',
success: '成功', success: '成功',
error: '错误', error: '错误',
critical: '严重', critical: '严重',
...@@ -2031,10 +2033,8 @@ export default { ...@@ -2031,10 +2033,8 @@ export default {
noSystemMetrics: '尚未收集系统指标。', noSystemMetrics: '尚未收集系统指标。',
collectedAt: '采集时间:', collectedAt: '采集时间:',
window: '窗口', window: '窗口',
cpu: 'CPU',
memory: '内存', memory: '内存',
db: '数据库', db: '数据库',
redis: 'Redis',
goroutines: '协程', goroutines: '协程',
jobs: '后台任务', jobs: '后台任务',
jobsHelp: '点击“明细”查看任务心跳与报错信息', jobsHelp: '点击“明细”查看任务心跳与报错信息',
...@@ -2060,7 +2060,7 @@ export default { ...@@ -2060,7 +2060,7 @@ export default {
totalRequests: '总请求', totalRequests: '总请求',
avgQps: '平均 QPS', avgQps: '平均 QPS',
avgTps: '平均 TPS', avgTps: '平均 TPS',
avgLatency: '平均延迟', avgLatency: '平均请求时长',
avgTtft: '平均首字延迟', avgTtft: '平均首字延迟',
exceptions: '异常数', exceptions: '异常数',
requestErrors: '请求错误', requestErrors: '请求错误',
...@@ -2072,7 +2072,7 @@ export default { ...@@ -2072,7 +2072,7 @@ export default {
errors: '错误', errors: '错误',
errorRate: '错误率:', errorRate: '错误率:',
upstreamRate: '上游错误率:', upstreamRate: '上游错误率:',
latencyDuration: '延迟(毫秒)', latencyDuration: '请求时长(毫秒)',
ttftLabel: '首字延迟(毫秒)', ttftLabel: '首字延迟(毫秒)',
p50: 'p50', p50: 'p50',
p90: 'p90', p90: 'p90',
...@@ -2080,7 +2080,6 @@ export default { ...@@ -2080,7 +2080,6 @@ export default {
p99: 'p99', p99: 'p99',
avg: 'avg', avg: 'avg',
max: 'max', max: 'max',
qps: 'QPS',
requests: '请求数', requests: '请求数',
requestsTitle: '请求', requestsTitle: '请求',
upstream: '上游', upstream: '上游',
...@@ -2092,7 +2091,7 @@ export default { ...@@ -2092,7 +2091,7 @@ export default {
failedToLoadData: '加载运维数据失败', failedToLoadData: '加载运维数据失败',
failedToLoadOverview: '加载概览数据失败', failedToLoadOverview: '加载概览数据失败',
failedToLoadThroughputTrend: '加载吞吐趋势失败', failedToLoadThroughputTrend: '加载吞吐趋势失败',
failedToLoadLatencyHistogram: '加载延迟分布失败', failedToLoadLatencyHistogram: '加载请求时长分布失败',
failedToLoadErrorTrend: '加载错误趋势失败', failedToLoadErrorTrend: '加载错误趋势失败',
failedToLoadErrorDistribution: '加载错误分布失败', failedToLoadErrorDistribution: '加载错误分布失败',
failedToLoadErrorDetail: '加载错误详情失败', failedToLoadErrorDetail: '加载错误详情失败',
...@@ -2100,7 +2099,7 @@ export default { ...@@ -2100,7 +2099,7 @@ export default {
tpsK: 'TPS(千)', tpsK: 'TPS(千)',
top: '最高:', top: '最高:',
throughputTrend: '吞吐趋势', throughputTrend: '吞吐趋势',
latencyHistogram: '延迟分布', latencyHistogram: '请求时长分布',
errorTrend: '错误趋势', errorTrend: '错误趋势',
errorDistribution: '错误分布', errorDistribution: '错误分布',
// Health Score & Diagnosis // Health Score & Diagnosis
...@@ -2115,7 +2114,9 @@ export default { ...@@ -2115,7 +2114,9 @@ export default {
'30m': '近30分钟', '30m': '近30分钟',
'1h': '近1小时', '1h': '近1小时',
'6h': '近6小时', '6h': '近6小时',
'24h': '近24小时' '24h': '近24小时',
'7d': '近7天',
'30d': '近30天'
}, },
fullscreen: { fullscreen: {
enter: '进入全屏' enter: '进入全屏'
...@@ -2144,15 +2145,8 @@ export default { ...@@ -2144,15 +2145,8 @@ export default {
memoryHigh: '内存使用率偏高 ({usage}%)', memoryHigh: '内存使用率偏高 ({usage}%)',
memoryHighImpact: '内存压力较大,需要关注', memoryHighImpact: '内存压力较大,需要关注',
memoryHighAction: '监控内存趋势,检查是否有内存泄漏', memoryHighAction: '监控内存趋势,检查是否有内存泄漏',
// Latency diagnostics
latencyCritical: '响应延迟严重过高 ({latency}ms)',
latencyCriticalImpact: '用户体验极差,大量请求超时',
latencyCriticalAction: '检查慢查询、数据库索引、网络延迟和上游服务',
latencyHigh: '响应延迟偏高 ({latency}ms)',
latencyHighImpact: '用户体验下降,需要优化',
latencyHighAction: '分析慢请求日志,优化数据库查询和业务逻辑',
ttftHigh: '首字节时间偏高 ({ttft}ms)', ttftHigh: '首字节时间偏高 ({ttft}ms)',
ttftHighImpact: '用户感知延迟增加', ttftHighImpact: '用户感知时长增加',
ttftHighAction: '优化请求处理流程,减少前置逻辑耗时', ttftHighAction: '优化请求处理流程,减少前置逻辑耗时',
// Error rate diagnostics // Error rate diagnostics
upstreamCritical: '上游错误率严重偏高 ({rate}%)', upstreamCritical: '上游错误率严重偏高 ({rate}%)',
...@@ -2170,13 +2164,13 @@ export default { ...@@ -2170,13 +2164,13 @@ export default {
// SLA diagnostics // SLA diagnostics
slaCritical: 'SLA 严重低于目标 ({sla}%)', slaCritical: 'SLA 严重低于目标 ({sla}%)',
slaCriticalImpact: '用户体验严重受损', slaCriticalImpact: '用户体验严重受损',
slaCriticalAction: '紧急排查错误和延迟问题,考虑限流保护', slaCriticalAction: '紧急排查错误原因,必要时采取限流保护',
slaLow: 'SLA 低于目标 ({sla}%)', slaLow: 'SLA 低于目标 ({sla}%)',
slaLowImpact: '需要关注服务质量', slaLowImpact: '需要关注服务质量',
slaLowAction: '分析SLA下降原因,优化系统性能', slaLowAction: '分析SLA下降原因,优化系统性能',
// Health score diagnostics // Health score diagnostics
healthCritical: '综合健康评分过低 ({score})', healthCritical: '综合健康评分过低 ({score})',
healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与延迟', healthCriticalImpact: '多个指标可能同时异常,建议优先排查错误与资源使用情况',
healthCriticalAction: '全面检查系统状态,优先处理critical级别问题', healthCriticalAction: '全面检查系统状态,优先处理critical级别问题',
healthLow: '综合健康评分偏低 ({score})', healthLow: '综合健康评分偏低 ({score})',
healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率', healthLowImpact: '可能存在轻度波动,建议关注 SLA 与错误率',
...@@ -2187,27 +2181,106 @@ export default { ...@@ -2187,27 +2181,106 @@ export default {
// Error Log // Error Log
errorLog: { errorLog: {
timeId: '时间 / ID', timeId: '时间 / ID',
commonErrors: {
contextDeadlineExceeded: '请求超时',
connectionRefused: '连接被拒绝',
rateLimit: '触发限流'
},
time: '时间',
type: '类型',
context: '上下文', context: '上下文',
platform: '平台',
model: '模型',
group: '分组',
user: '用户',
userId: '用户 ID',
account: '账号',
accountId: '账号 ID',
status: '状态码', status: '状态码',
message: '消息', message: '响应内容',
latency: '延迟', latency: '请求时长',
action: '操作', action: '操作',
noErrors: '该窗口内暂无错误。', noErrors: '该窗口内暂无错误。',
grp: 'GRP:', grp: 'GRP:',
acc: 'ACC:', acc: 'ACC:',
details: '详情', details: '详情',
phase: '阶段' phase: '阶段',
id: 'ID:',
typeUpstream: '上游',
typeRequest: '请求',
typeAuth: '认证',
typeRouting: '路由',
typeInternal: '内部'
}, },
// Error Details Modal // Error Details Modal
errorDetails: { errorDetails: {
upstreamErrors: '上游错误', upstreamErrors: '上游错误',
requestErrors: '请求错误', requestErrors: '请求错误',
unresolved: '未解决',
resolved: '已解决',
viewErrors: '错误',
viewExcluded: '排除项',
statusCodeOther: '其他',
owner: {
provider: '服务商',
client: '客户端',
platform: '平台'
},
phase: {
request: '请求',
auth: '认证',
routing: '路由',
upstream: '上游',
network: '网络',
internal: '内部'
},
total: '总计:', total: '总计:',
searchPlaceholder: '搜索 request_id / client_request_id / message', searchPlaceholder: '搜索 request_id / client_request_id / message',
accountIdPlaceholder: 'account_id'
}, },
// Error Detail Modal // Error Detail Modal
errorDetail: { errorDetail: {
title: '错误详情',
titleWithId: '错误 #{id}',
noErrorSelected: '未选择错误。',
resolution: '已解决:',
pinnedToOriginalAccountId: '固定到原 account_id',
missingUpstreamRequestBody: '缺少上游请求体',
failedToLoadRetryHistory: '加载重试历史失败',
failedToUpdateResolvedStatus: '更新解决状态失败',
unsupportedRetryMode: '不支持的重试模式',
classificationKeys: {
phase: '阶段',
owner: '归属方',
source: '来源',
retryable: '可重试',
resolvedAt: '解决时间',
resolvedBy: '解决人',
resolvedRetryId: '解决重试ID',
retryCount: '重试次数'
},
source: {
upstream_http: '上游 HTTP'
},
upstreamKeys: {
status: '状态码',
message: '消息',
detail: '详情',
upstreamErrors: '上游错误列表'
},
upstreamEvent: {
account: '账号',
status: '状态码',
requestId: '请求ID'
},
responsePreview: {
expand: '响应内容(点击展开)',
collapse: '响应内容(点击收起)'
},
retryMeta: {
used: '使用账号',
success: '成功',
pinned: '固定账号'
},
loading: '加载中…', loading: '加载中…',
requestId: '请求 ID', requestId: '请求 ID',
time: '时间', time: '时间',
...@@ -2217,8 +2290,10 @@ export default { ...@@ -2217,8 +2290,10 @@ export default {
basicInfo: '基本信息', basicInfo: '基本信息',
platform: '平台', platform: '平台',
model: '模型', model: '模型',
latency: '延迟', group: '分组',
ttft: 'TTFT', user: '用户',
account: '账号',
latency: '请求时长',
businessLimited: '业务限制', businessLimited: '业务限制',
requestPath: '请求路径', requestPath: '请求路径',
timings: '时序信息', timings: '时序信息',
...@@ -2226,6 +2301,8 @@ export default { ...@@ -2226,6 +2301,8 @@ export default {
routing: '路由', routing: '路由',
upstream: '上游', upstream: '上游',
response: '响应', response: '响应',
classification: '错误分类',
notRetryable: '此错误不建议重试',
retry: '重试', retry: '重试',
retryClient: '重试(客户端)', retryClient: '重试(客户端)',
retryUpstream: '重试(上游固定)', retryUpstream: '重试(上游固定)',
...@@ -2237,7 +2314,6 @@ export default { ...@@ -2237,7 +2314,6 @@ export default {
confirmRetry: '确认重试', confirmRetry: '确认重试',
retrySuccess: '重试成功', retrySuccess: '重试成功',
retryFailed: '重试失败', retryFailed: '重试失败',
na: 'N/A',
retryHint: '重试将使用相同的请求参数重新发送请求', retryHint: '重试将使用相同的请求参数重新发送请求',
retryClientHint: '使用客户端重试(不固定账号)', retryClientHint: '使用客户端重试(不固定账号)',
retryUpstreamHint: '使用上游固定重试(固定到错误的账号)', retryUpstreamHint: '使用上游固定重试(固定到错误的账号)',
...@@ -2245,8 +2321,33 @@ export default { ...@@ -2245,8 +2321,33 @@ export default {
retryNote1: '重试会使用相同的请求体和参数', retryNote1: '重试会使用相同的请求体和参数',
retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败', retryNote2: '如果原请求失败是因为账号问题,固定重试可能仍会失败',
retryNote3: '客户端重试会重新选择账号', retryNote3: '客户端重试会重新选择账号',
retryNote4: '对不可重试的错误可以强制重试,但不推荐',
confirmRetryMessage: '确认要重试该请求吗?', confirmRetryMessage: '确认要重试该请求吗?',
confirmRetryHint: '将使用相同的请求参数重新发送' confirmRetryHint: '将使用相同的请求参数重新发送',
forceRetry: '我已确认并理解强制重试风险',
forceRetryHint: '此错误类型通常不可通过重试解决;如仍需重试请勾选确认',
forceRetryNeedAck: '请先勾选确认再强制重试',
markResolved: '标记已解决',
markUnresolved: '标记未解决',
viewRetries: '重试历史',
retryHistory: '重试历史',
tabOverview: '概览',
tabRetries: '重试历史',
tabRequest: '请求详情',
tabResponse: '响应详情',
responseBody: '响应详情',
compareA: '对比 A',
compareB: '对比 B',
retrySummary: '重试摘要',
responseHintSucceeded: '展示重试成功的 response_preview(#{id})',
responseHintFallback: '没有成功的重试结果,展示存储的 error_body',
suggestion: '处理建议',
suggestUpstreamResolved: '✓ 上游错误已通过重试解决,无需人工介入',
suggestUpstream: '⚠️ 上游服务不稳定,建议:检查上游账号状态 / 考虑切换账号 / 再次重试',
suggestRequest: '⚠️ 客户端请求错误,建议:联系客户修正请求参数 / 手动标记已解决',
suggestAuth: '⚠️ 认证失败,建议:检查 API Key 是否有效 / 联系客户更新凭证',
suggestPlatform: '🚨 平台错误,建议立即排查修复',
suggestGeneric: '查看详情了解更多信息'
}, },
requestDetails: { requestDetails: {
title: '请求明细', title: '请求明细',
...@@ -2282,13 +2383,46 @@ export default { ...@@ -2282,13 +2383,46 @@ export default {
loading: '加载中...', loading: '加载中...',
empty: '暂无告警事件', empty: '暂无告警事件',
loadFailed: '加载告警事件失败', loadFailed: '加载告警事件失败',
status: {
firing: '告警中',
resolved: '已恢复',
manualResolved: '手动已解决'
},
detail: {
title: '告警详情',
loading: '加载详情中...',
empty: '暂无详情',
loadFailed: '加载告警详情失败',
manualResolve: '标记为已解决',
manualResolvedSuccess: '已标记为手动解决',
manualResolvedFailed: '标记为手动解决失败',
silence: '忽略此告警',
silenceSuccess: '已静默该告警',
silenceFailed: '静默失败',
viewRule: '查看规则',
viewLogs: '查看相关日志',
firedAt: '触发时间',
resolvedAt: '解决时间',
ruleId: '规则 ID',
dimensions: '维度信息',
historyTitle: '历史记录',
historyHint: '同一规则 + 相同维度的最近事件',
historyLoading: '加载历史中...',
historyEmpty: '暂无历史记录'
},
table: { table: {
time: '时间', time: '时间',
status: '状态', status: '状态',
severity: '级别', severity: '级别',
platform: '平台',
ruleId: '规则ID',
title: '标题', title: '标题',
duration: '持续时间',
metric: '指标 / 阈值', metric: '指标 / 阈值',
email: '邮件已发送' dimensions: '维度',
email: '邮件已发送',
emailSent: '已发送',
emailIgnored: '已忽略'
} }
}, },
alertRules: { alertRules: {
...@@ -2316,8 +2450,8 @@ export default { ...@@ -2316,8 +2450,8 @@ export default {
successRate: '成功率 (%)', successRate: '成功率 (%)',
errorRate: '错误率 (%)', errorRate: '错误率 (%)',
upstreamErrorRate: '上游错误率 (%)', upstreamErrorRate: '上游错误率 (%)',
p95: 'P95 延迟 (ms)', p95: 'P95 请求时长 (ms)',
p99: 'P99 延迟 (ms)', p99: 'P99 请求时长 (ms)',
cpu: 'CPU 使用率 (%)', cpu: 'CPU 使用率 (%)',
memory: '内存使用率 (%)', memory: '内存使用率 (%)',
queueDepth: '并发排队深度', queueDepth: '并发排队深度',
...@@ -2402,7 +2536,6 @@ export default { ...@@ -2402,7 +2536,6 @@ export default {
title: '告警静默(维护模式)', title: '告警静默(维护模式)',
enabled: '启用静默', enabled: '启用静默',
globalUntil: '静默截止时间(RFC3339)', globalUntil: '静默截止时间(RFC3339)',
untilPlaceholder: '2026-01-05T00:00:00Z',
untilHint: '建议填写截止时间,避免忘记关闭静默。', untilHint: '建议填写截止时间,避免忘记关闭静默。',
reason: '原因', reason: '原因',
reasonPlaceholder: '例如:计划维护', reasonPlaceholder: '例如:计划维护',
...@@ -2442,7 +2575,11 @@ export default { ...@@ -2442,7 +2575,11 @@ export default {
lockKeyRequired: '启用分布式锁时必须填写 Lock Key', lockKeyRequired: '启用分布式锁时必须填写 Lock Key',
lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头', lockKeyPrefix: '分布式锁 Key 必须以「{prefix}」开头',
lockKeyHint: '建议以「{prefix}」开头以避免冲突', lockKeyHint: '建议以「{prefix}」开头以避免冲突',
lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间' lockTtlRange: '分布式锁 TTL 必须在 1 到 86400 秒之间',
slaMinPercentRange: 'SLA 最低值必须在 0-100 之间',
ttftP99MaxRange: 'TTFT P99 最大值必须大于或等于 0',
requestErrorRateMaxRange: '请求错误率最大值必须在 0-100 之间',
upstreamErrorRateMaxRange: '上游错误率最大值必须在 0-100 之间'
} }
}, },
email: { email: {
...@@ -2507,8 +2644,6 @@ export default { ...@@ -2507,8 +2644,6 @@ export default {
metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示', metricThresholdsHint: '配置各项指标的告警阈值,超出阈值时将以红色显示',
slaMinPercent: 'SLA最低百分比', slaMinPercent: 'SLA最低百分比',
slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)', slaMinPercentHint: 'SLA低于此值时显示为红色(默认:99.5%)',
latencyP99MaxMs: '延迟P99最大值(毫秒)',
latencyP99MaxMsHint: '延迟P99高于此值时显示为红色(默认:2000ms)',
ttftP99MaxMs: 'TTFT P99最大值(毫秒)', ttftP99MaxMs: 'TTFT P99最大值(毫秒)',
ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色(默认:500ms)', ttftP99MaxMsHint: 'TTFT P99高于此值时显示为红色(默认:500ms)',
requestErrorRateMaxPercent: '请求错误率最大值(%)', requestErrorRateMaxPercent: '请求错误率最大值(%)',
...@@ -2527,9 +2662,28 @@ export default { ...@@ -2527,9 +2662,28 @@ export default {
aggregation: '预聚合任务', aggregation: '预聚合任务',
enableAggregation: '启用预聚合任务', enableAggregation: '启用预聚合任务',
aggregationHint: '预聚合可提升长时间窗口查询性能', aggregationHint: '预聚合可提升长时间窗口查询性能',
errorFiltering: '错误过滤',
ignoreCountTokensErrors: '忽略 count_tokens 错误',
ignoreCountTokensErrorsHint: '启用后,count_tokens 请求的错误将不会写入错误日志。',
ignoreContextCanceled: '忽略客户端断连错误',
ignoreContextCanceledHint: '启用后,客户端主动断开连接(context canceled)的错误将不会写入错误日志。',
ignoreNoAvailableAccounts: '忽略无可用账号错误',
ignoreNoAvailableAccountsHint: '启用后,“No available accounts” 错误将不会写入错误日志(不推荐,这通常是配置问题)。',
autoRefresh: '自动刷新',
enableAutoRefresh: '启用自动刷新',
enableAutoRefreshHint: '自动刷新仪表板数据,启用后会定期拉取最新数据。',
refreshInterval: '刷新间隔',
refreshInterval15s: '15 秒',
refreshInterval30s: '30 秒',
refreshInterval60s: '60 秒',
autoRefreshCountdown: '自动刷新:{seconds}s',
validation: { validation: {
title: '请先修正以下问题', title: '请先修正以下问题',
retentionDaysRange: '保留天数必须在1-365天之间' retentionDaysRange: '保留天数必须在1-365天之间',
slaMinPercentRange: 'SLA最低百分比必须在0-100之间',
ttftP99MaxRange: 'TTFT P99最大值必须大于等于0',
requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间',
upstreamErrorRateMaxRange: '上游错误率最大值必须在0-100之间'
} }
}, },
concurrency: { concurrency: {
...@@ -2567,12 +2721,12 @@ export default { ...@@ -2567,12 +2721,12 @@ export default {
tooltips: { tooltips: {
totalRequests: '当前时间窗口内的总请求数和Token消耗量。', totalRequests: '当前时间窗口内的总请求数和Token消耗量。',
throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。', throughputTrend: '当前窗口内的请求/QPS 与 token/TPS 趋势。',
latencyHistogram: '成功请求的延迟分布(毫秒)。', latencyHistogram: '成功请求的请求时长分布(毫秒)。',
errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。', errorTrend: '错误趋势(SLA 口径排除业务限制;上游错误率排除 429/529)。',
errorDistribution: '按状态码统计的错误分布。', errorDistribution: '按状态码统计的错误分布。',
upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。', upstreamErrors: '上游服务返回的错误,包括API提供商的错误响应(排除429/529限流错误)。',
goroutines: goroutines:
'Go 运行时的协程数量(轻量级线程)。没有绝对安全值,建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列/延迟上升时,优先排查阻塞/泄漏。', 'Go 运行时的协程数量(轻量级线程)。没有绝对"安全值",建议以历史基线为准。经验参考:<2000 常见;2000-8000 需关注;>8000 且伴随队列上升时,优先排查阻塞/泄漏。',
cpu: 'CPU 使用率,显示系统处理器的负载情况。', cpu: 'CPU 使用率,显示系统处理器的负载情况。',
memory: '内存使用率,包括已使用和总可用内存。', memory: '内存使用率,包括已使用和总可用内存。',
db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。', db: '数据库连接池状态,包括活跃连接、空闲连接和等待连接数。',
...@@ -2582,7 +2736,7 @@ export default { ...@@ -2582,7 +2736,7 @@ export default {
tokens: '当前时间窗口内处理的总Token数量。', tokens: '当前时间窗口内处理的总Token数量。',
sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。', sla: '服务等级协议达成率,排除业务限制(如余额不足、配额超限)的成功请求占比。',
errors: '错误统计,包括总错误数、错误率和上游错误率。', errors: '错误统计,包括总错误数、错误率和上游错误率。',
latency: '请求延迟统计,包括 p50、p90、p95、p99 等百分位数。', latency: '请求时长统计,包括 p50、p90、p95、p99 等百分位数。',
ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。', ttft: '首Token延迟(Time To First Token),衡量流式响应的首字节返回速度。',
health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。' health: '系统健康评分(0-100),综合考虑 SLA、错误率和资源使用情况。'
}, },
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
{{ errorMessage }} {{ errorMessage }}
</div> </div>
<OpsDashboardSkeleton v-if="loading && !hasLoadedOnce" /> <OpsDashboardSkeleton v-if="loading && !hasLoadedOnce" :fullscreen="isFullscreen" />
<OpsDashboardHeader <OpsDashboardHeader
v-else-if="opsEnabled" v-else-if="opsEnabled"
...@@ -94,7 +94,7 @@ ...@@ -94,7 +94,7 @@
@openErrorDetail="openError" @openErrorDetail="openError"
/> />
<OpsErrorDetailModal v-model:show="showErrorModal" :error-id="selectedErrorId" /> <OpsErrorDetailModal v-model:show="showErrorModal" :error-id="selectedErrorId" :error-type="errorDetailsType" />
<OpsRequestDetailsModal <OpsRequestDetailsModal
v-model="showRequestDetails" v-model="showRequestDetails"
...@@ -169,7 +169,13 @@ const QUERY_KEYS = { ...@@ -169,7 +169,13 @@ const QUERY_KEYS = {
platform: 'platform', platform: 'platform',
groupId: 'group_id', groupId: 'group_id',
queryMode: 'mode', queryMode: 'mode',
fullscreen: 'fullscreen' fullscreen: 'fullscreen',
// Deep links
openErrorDetails: 'open_error_details',
errorType: 'error_type',
alertRuleId: 'alert_rule_id',
openAlertRules: 'open_alert_rules'
} as const } as const
const isApplyingRouteQuery = ref(false) const isApplyingRouteQuery = ref(false)
...@@ -249,6 +255,24 @@ const applyRouteQueryToState = () => { ...@@ -249,6 +255,24 @@ const applyRouteQueryToState = () => {
const fallback = adminSettingsStore.opsQueryModeDefault || 'auto' const fallback = adminSettingsStore.opsQueryModeDefault || 'auto'
queryMode.value = allowedQueryModes.has(fallback as QueryMode) ? (fallback as QueryMode) : 'auto' queryMode.value = allowedQueryModes.has(fallback as QueryMode) ? (fallback as QueryMode) : 'auto'
} }
// Deep links
const openRules = readQueryString(QUERY_KEYS.openAlertRules)
if (openRules === '1' || openRules === 'true') {
showAlertRulesCard.value = true
}
const ruleID = readQueryNumber(QUERY_KEYS.alertRuleId)
if (typeof ruleID === 'number' && ruleID > 0) {
showAlertRulesCard.value = true
}
const openErr = readQueryString(QUERY_KEYS.openErrorDetails)
if (openErr === '1' || openErr === 'true') {
const typ = readQueryString(QUERY_KEYS.errorType)
errorDetailsType.value = typ === 'upstream' ? 'upstream' : 'request'
showErrorDetails.value = true
}
} }
applyRouteQueryToState() applyRouteQueryToState()
...@@ -376,11 +400,17 @@ function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) { ...@@ -376,11 +400,17 @@ function handleOpenRequestDetails(preset?: OpsRequestDetailsPreset) {
requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) } requestDetailsPreset.value = { ...basePreset, ...(preset ?? {}) }
if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title if (!requestDetailsPreset.value.title) requestDetailsPreset.value.title = basePreset.title
// Ensure only one modal visible at a time.
showErrorDetails.value = false
showErrorModal.value = false
showRequestDetails.value = true showRequestDetails.value = true
} }
function openErrorDetails(kind: 'request' | 'upstream') { function openErrorDetails(kind: 'request' | 'upstream') {
errorDetailsType.value = kind errorDetailsType.value = kind
// Ensure only one modal visible at a time.
showRequestDetails.value = false
showErrorModal.value = false
showErrorDetails.value = true showErrorDetails.value = true
} }
...@@ -422,6 +452,9 @@ function onQueryModeChange(v: string | number | boolean | null) { ...@@ -422,6 +452,9 @@ function onQueryModeChange(v: string | number | boolean | null) {
function openError(id: number) { function openError(id: number) {
selectedErrorId.value = id selectedErrorId.value = id
// Ensure only one modal visible at a time.
showErrorDetails.value = false
showRequestDetails.value = false
showErrorModal.value = true showErrorModal.value = true
} }
......
...@@ -140,24 +140,6 @@ const metricDefinitions = computed(() => { ...@@ -140,24 +140,6 @@ const metricDefinitions = computed(() => {
recommendedThreshold: 1, recommendedThreshold: 1,
unit: '%' unit: '%'
}, },
{
type: 'p95_latency_ms',
group: 'system',
label: t('admin.ops.alertRules.metrics.p95'),
description: t('admin.ops.alertRules.metricDescriptions.p95'),
recommendedOperator: '>',
recommendedThreshold: 1000,
unit: 'ms'
},
{
type: 'p99_latency_ms',
group: 'system',
label: t('admin.ops.alertRules.metrics.p99'),
description: t('admin.ops.alertRules.metricDescriptions.p99'),
recommendedOperator: '>',
recommendedThreshold: 2000,
unit: 'ms'
},
{ {
type: 'cpu_usage_percent', type: 'cpu_usage_percent',
group: 'system', group: 'system',
......
...@@ -169,8 +169,8 @@ const updatedAtLabel = computed(() => { ...@@ -169,8 +169,8 @@ const updatedAtLabel = computed(() => {
return props.lastUpdated.toLocaleTimeString() return props.lastUpdated.toLocaleTimeString()
}) })
// --- Color coding for latency/TTFT --- // --- Color coding for TTFT ---
function getLatencyColor(ms: number | null | undefined): string { function getTTFTColor(ms: number | null | undefined): string {
if (ms == null) return 'text-gray-900 dark:text-white' if (ms == null) return 'text-gray-900 dark:text-white'
if (ms < 500) return 'text-green-600 dark:text-green-400' if (ms < 500) return 'text-green-600 dark:text-green-400'
if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400' if (ms < 1000) return 'text-yellow-600 dark:text-yellow-400'
...@@ -186,13 +186,6 @@ function isSLABelowThreshold(slaPercent: number | null): boolean { ...@@ -186,13 +186,6 @@ function isSLABelowThreshold(slaPercent: number | null): boolean {
return slaPercent < threshold return slaPercent < threshold
} }
function isLatencyAboveThreshold(latencyP99Ms: number | null): boolean {
if (latencyP99Ms == null) return false
const threshold = props.thresholds?.latency_p99_ms_max
if (threshold == null) return false
return latencyP99Ms > threshold
}
function isTTFTAboveThreshold(ttftP99Ms: number | null): boolean { function isTTFTAboveThreshold(ttftP99Ms: number | null): boolean {
if (ttftP99Ms == null) return false if (ttftP99Ms == null) return false
const threshold = props.thresholds?.ttft_p99_ms_max const threshold = props.thresholds?.ttft_p99_ms_max
...@@ -482,24 +475,6 @@ const diagnosisReport = computed<DiagnosisItem[]>(() => { ...@@ -482,24 +475,6 @@ const diagnosisReport = computed<DiagnosisItem[]>(() => {
} }
} }
// Latency diagnostics
const durationP99 = ov.duration?.p99_ms ?? 0
if (durationP99 > 2000) {
report.push({
type: 'critical',
message: t('admin.ops.diagnosis.latencyCritical', { latency: durationP99.toFixed(0) }),
impact: t('admin.ops.diagnosis.latencyCriticalImpact'),
action: t('admin.ops.diagnosis.latencyCriticalAction')
})
} else if (durationP99 > 1000) {
report.push({
type: 'warning',
message: t('admin.ops.diagnosis.latencyHigh', { latency: durationP99.toFixed(0) }),
impact: t('admin.ops.diagnosis.latencyHighImpact'),
action: t('admin.ops.diagnosis.latencyHighAction')
})
}
const ttftP99 = ov.ttft?.p99_ms ?? 0 const ttftP99 = ov.ttft?.p99_ms ?? 0
if (ttftP99 > 500) { if (ttftP99 > 500) {
report.push({ report.push({
...@@ -851,7 +826,7 @@ function handleToolbarRefresh() { ...@@ -851,7 +826,7 @@ function handleToolbarRefresh() {
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle> <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path> <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg> </svg>
<span>自动刷新: {{ props.autoRefreshCountdown }}s</span> <span>{{ t('admin.ops.settings.autoRefreshCountdown', { seconds: props.autoRefreshCountdown }) }}</span>
</span> </span>
</template> </template>
...@@ -1113,7 +1088,7 @@ function handleToolbarRefresh() { ...@@ -1113,7 +1088,7 @@ function handleToolbarRefresh() {
</div> </div>
<div class="flex items-baseline gap-1.5"> <div class="flex items-baseline gap-1.5">
<span :class="[props.fullscreen ? 'text-4xl' : 'text-xl sm:text-2xl', 'font-black text-gray-900 dark:text-white']">{{ displayRealTimeTps.toFixed(1) }}</span> <span :class="[props.fullscreen ? 'text-4xl' : 'text-xl sm:text-2xl', 'font-black text-gray-900 dark:text-white']">{{ displayRealTimeTps.toFixed(1) }}</span>
<span :class="[props.fullscreen ? 'text-sm' : 'text-xs', 'font-bold text-gray-500']">TPS</span> <span :class="[props.fullscreen ? 'text-sm' : 'text-xs', 'font-bold text-gray-500']">{{ t('admin.ops.tps') }}</span>
</div> </div>
</div> </div>
</div> </div>
...@@ -1130,7 +1105,7 @@ function handleToolbarRefresh() { ...@@ -1130,7 +1105,7 @@ function handleToolbarRefresh() {
</div> </div>
<div class="flex items-baseline gap-1.5"> <div class="flex items-baseline gap-1.5">
<span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsPeakLabel }}</span> <span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsPeakLabel }}</span>
<span class="text-xs">TPS</span> <span class="text-xs">{{ t('admin.ops.tps') }}</span>
</div> </div>
</div> </div>
</div> </div>
...@@ -1145,7 +1120,7 @@ function handleToolbarRefresh() { ...@@ -1145,7 +1120,7 @@ function handleToolbarRefresh() {
</div> </div>
<div class="flex items-baseline gap-1.5"> <div class="flex items-baseline gap-1.5">
<span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsAvgLabel }}</span> <span class="font-black text-gray-900 dark:text-white">{{ realtimeTpsAvgLabel }}</span>
<span class="text-xs">TPS</span> <span class="text-xs">{{ t('admin.ops.tps') }}</span>
</div> </div>
</div> </div>
</div> </div>
...@@ -1181,7 +1156,7 @@ function handleToolbarRefresh() { ...@@ -1181,7 +1156,7 @@ function handleToolbarRefresh() {
<!-- Right: 6 cards (3 cols x 2 rows) --> <!-- Right: 6 cards (3 cols x 2 rows) -->
<div class="grid h-full grid-cols-1 content-center gap-4 sm:grid-cols-2 lg:col-span-7 lg:grid-cols-3"> <div class="grid h-full grid-cols-1 content-center gap-4 sm:grid-cols-2 lg:col-span-7 lg:grid-cols-3">
<!-- Card 1: Requests --> <!-- Card 1: Requests -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 1;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestsTitle') }}</span> <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestsTitle') }}</span>
...@@ -1217,10 +1192,10 @@ function handleToolbarRefresh() { ...@@ -1217,10 +1192,10 @@ function handleToolbarRefresh() {
</div> </div>
<!-- Card 2: SLA --> <!-- Card 2: SLA -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 2;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-2"> <div class="flex items-center gap-2">
<span class="text-[10px] font-bold uppercase text-gray-400">SLA</span> <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.sla') }}</span>
<HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.sla')" /> <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.sla')" />
<span class="h-1.5 w-1.5 rounded-full" :class="isSLABelowThreshold(slaPercent) ? 'bg-red-500' : (slaPercent ?? 0) >= 99.5 ? 'bg-green-500' : 'bg-yellow-500'"></span> <span class="h-1.5 w-1.5 rounded-full" :class="isSLABelowThreshold(slaPercent) ? 'bg-red-500' : (slaPercent ?? 0) >= 99.5 ? 'bg-green-500' : 'bg-yellow-500'"></span>
</div> </div>
...@@ -1247,8 +1222,8 @@ function handleToolbarRefresh() { ...@@ -1247,8 +1222,8 @@ function handleToolbarRefresh() {
</div> </div>
</div> </div>
<!-- Card 3: Latency (Duration) --> <!-- Card 4: Request Duration -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 4;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.latencyDuration') }}</span> <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.latencyDuration') }}</span>
...@@ -1264,42 +1239,42 @@ function handleToolbarRefresh() { ...@@ -1264,42 +1239,42 @@ function handleToolbarRefresh() {
</button> </button>
</div> </div>
<div class="mt-2 flex items-baseline gap-2"> <div class="mt-2 flex items-baseline gap-2">
<div class="text-3xl font-black" :class="isLatencyAboveThreshold(durationP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(durationP99Ms)"> <div class="text-3xl font-black text-gray-900 dark:text-white">
{{ durationP99Ms ?? '-' }} {{ durationP99Ms ?? '-' }}
</div> </div>
<span class="text-xs font-bold text-gray-400">ms (P99)</span> <span class="text-xs font-bold text-gray-400">ms (P99)</span>
</div> </div>
<div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs"> <div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P95:</span> <span class="text-gray-500">{{ t('admin.ops.p95') }}</span>
<span class="font-bold" :class="getLatencyColor(durationP95Ms)">{{ durationP95Ms ?? '-' }}</span> <span class="font-bold text-gray-900 dark:text-white">{{ durationP95Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P90:</span> <span class="text-gray-500">{{ t('admin.ops.p90') }}</span>
<span class="font-bold" :class="getLatencyColor(durationP90Ms)">{{ durationP90Ms ?? '-' }}</span> <span class="font-bold text-gray-900 dark:text-white">{{ durationP90Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P50:</span> <span class="text-gray-500">{{ t('admin.ops.p50') }}</span>
<span class="font-bold" :class="getLatencyColor(durationP50Ms)">{{ durationP50Ms ?? '-' }}</span> <span class="font-bold text-gray-900 dark:text-white">{{ durationP50Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">Avg:</span> <span class="text-gray-500">Avg:</span>
<span class="font-bold" :class="getLatencyColor(durationAvgMs)">{{ durationAvgMs ?? '-' }}</span> <span class="font-bold text-gray-900 dark:text-white">{{ durationAvgMs ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">Max:</span> <span class="text-gray-500">Max:</span>
<span class="font-bold" :class="getLatencyColor(durationMaxMs)">{{ durationMaxMs ?? '-' }}</span> <span class="font-bold text-gray-900 dark:text-white">{{ durationMaxMs ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
</div> </div>
</div> </div>
<!-- Card 4: TTFT --> <!-- Card 5: TTFT -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 5;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<span class="text-[10px] font-bold uppercase text-gray-400">TTFT</span> <span class="text-[10px] font-bold uppercase text-gray-400">TTFT</span>
...@@ -1309,48 +1284,48 @@ function handleToolbarRefresh() { ...@@ -1309,48 +1284,48 @@ function handleToolbarRefresh() {
v-if="!props.fullscreen" v-if="!props.fullscreen"
class="text-[10px] font-bold text-blue-500 hover:underline" class="text-[10px] font-bold text-blue-500 hover:underline"
type="button" type="button"
@click="openDetails({ title: 'TTFT', sort: 'duration_desc' })" @click="openDetails({ title: t('admin.ops.ttftLabel'), sort: 'duration_desc' })"
> >
{{ t('admin.ops.requestDetails.details') }} {{ t('admin.ops.requestDetails.details') }}
</button> </button>
</div> </div>
<div class="mt-2 flex items-baseline gap-2"> <div class="mt-2 flex items-baseline gap-2">
<div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getLatencyColor(ttftP99Ms)"> <div class="text-3xl font-black" :class="isTTFTAboveThreshold(ttftP99Ms) ? 'text-red-600 dark:text-red-400' : getTTFTColor(ttftP99Ms)">
{{ ttftP99Ms ?? '-' }} {{ ttftP99Ms ?? '-' }}
</div> </div>
<span class="text-xs font-bold text-gray-400">ms (P99)</span> <span class="text-xs font-bold text-gray-400">ms (P99)</span>
</div> </div>
<div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs"> <div class="mt-3 flex flex-wrap gap-x-3 gap-y-1 text-xs">
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P95:</span> <span class="text-gray-500">{{ t('admin.ops.p95') }}</span>
<span class="font-bold" :class="getLatencyColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span> <span class="font-bold" :class="getTTFTColor(ttftP95Ms)">{{ ttftP95Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P90:</span> <span class="text-gray-500">{{ t('admin.ops.p90') }}</span>
<span class="font-bold" :class="getLatencyColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span> <span class="font-bold" :class="getTTFTColor(ttftP90Ms)">{{ ttftP90Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">P50:</span> <span class="text-gray-500">{{ t('admin.ops.p50') }}</span>
<span class="font-bold" :class="getLatencyColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span> <span class="font-bold" :class="getTTFTColor(ttftP50Ms)">{{ ttftP50Ms ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">Avg:</span> <span class="text-gray-500">Avg:</span>
<span class="font-bold" :class="getLatencyColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span> <span class="font-bold" :class="getTTFTColor(ttftAvgMs)">{{ ttftAvgMs ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
<div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap"> <div class="flex min-w-[60px] items-baseline gap-1 whitespace-nowrap">
<span class="text-gray-500">Max:</span> <span class="text-gray-500">Max:</span>
<span class="font-bold" :class="getLatencyColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span> <span class="font-bold" :class="getTTFTColor(ttftMaxMs)">{{ ttftMaxMs ?? '-' }}</span>
<span class="text-gray-400">ms</span> <span class="text-gray-400">ms</span>
</div> </div>
</div> </div>
</div> </div>
<!-- Card 5: Request Errors --> <!-- Card 3: Request Errors -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 3;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestErrors') }}</span> <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.requestErrors') }}</span>
...@@ -1376,7 +1351,7 @@ function handleToolbarRefresh() { ...@@ -1376,7 +1351,7 @@ function handleToolbarRefresh() {
</div> </div>
<!-- Card 6: Upstream Errors --> <!-- Card 6: Upstream Errors -->
<div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900"> <div class="rounded-2xl bg-gray-50 p-4 dark:bg-dark-900" style="order: 6;">
<div class="flex items-center justify-between"> <div class="flex items-center justify-between">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.upstreamErrors') }}</span> <span class="text-[10px] font-bold uppercase text-gray-400">{{ t('admin.ops.upstreamErrors') }}</span>
...@@ -1423,7 +1398,7 @@ function handleToolbarRefresh() { ...@@ -1423,7 +1398,7 @@ function handleToolbarRefresh() {
<!-- MEM --> <!-- MEM -->
<div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900"> <div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">MEM</div> <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">{{ t('admin.ops.mem') }}</div>
<HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.memory')" /> <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.memory')" />
</div> </div>
<div class="mt-1 text-lg font-black" :class="memPercentClass"> <div class="mt-1 text-lg font-black" :class="memPercentClass">
...@@ -1441,7 +1416,7 @@ function handleToolbarRefresh() { ...@@ -1441,7 +1416,7 @@ function handleToolbarRefresh() {
<!-- DB --> <!-- DB -->
<div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900"> <div class="rounded-xl bg-gray-50 p-3 dark:bg-dark-900">
<div class="flex items-center gap-1"> <div class="flex items-center gap-1">
<div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">DB</div> <div class="text-[10px] font-bold uppercase tracking-wider text-gray-400">{{ t('admin.ops.db') }}</div>
<HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.db')" /> <HelpTooltip v-if="!props.fullscreen" :content="t('admin.ops.tooltips.db')" />
</div> </div>
<div class="mt-1 text-lg font-black" :class="dbMiddleClass"> <div class="mt-1 text-lg font-black" :class="dbMiddleClass">
......
...@@ -38,7 +38,7 @@ const loading = ref(false) ...@@ -38,7 +38,7 @@ const loading = ref(false)
const items = ref<OpsRequestDetail[]>([]) const items = ref<OpsRequestDetail[]>([])
const total = ref(0) const total = ref(0)
const page = ref(1) const page = ref(1)
const pageSize = ref(20) const pageSize = ref(10)
const close = () => emit('update:modelValue', false) const close = () => emit('update:modelValue', false)
...@@ -95,7 +95,7 @@ watch( ...@@ -95,7 +95,7 @@ watch(
(open) => { (open) => {
if (open) { if (open) {
page.value = 1 page.value = 1
pageSize.value = 20 pageSize.value = 10
fetchData() fetchData()
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment