# Prometheus alerting rules for the kaireon service-level objectives (SLOs).
groups:
# Single rule group holding every SLO alert defined in this file.
- name: kaireon-slo-alerts
# NOTE(review): `rules` belongs to the group entry above and the alert entries
# below belong to `rules`; indentation appears to have been lost in this copy --
# confirm nesting against the deployed file.
rules:
# --- Availability ---
# Availability SLO alert: raised when 5xx responses make up more than 0.1% of
# all HTTP responses (5m rate) and the condition holds for 5 minutes.
- alert: HighErrorRate
  # Ratio of the 5xx response rate to the total response rate.
  expr: >-
    (sum(rate(http_responses_total{status=~"5.."}[5m]))
    / sum(rate(http_responses_total[5m])))
    > 0.001
  for: 5m
  labels:
    team: platform
    severity: critical
  annotations:
    runbook: 'docs/ops/runbooks/high-error-rate.md'
    summary: 'API error rate exceeds 0.1% SLO'
    # $value is the error ratio computed by expr (left-hand side of the comparison).
    description: 'Current error rate is {{ $value | humanizePercentage }}. SLO target is < 0.1%.'
# Fast-burn availability alert: fires when the 5xx share of all HTTP responses
# (5m rate) stays above 1% for 2 minutes. Against the 0.1% error budget stated
# by HighErrorRate this is a burn rate above 10x, i.e. a 30-day budget would be
# used up in under 3 days.
- alert: AvailabilityBurnRateFast
  # The error ratio is computed directly from 5xx responses (same form as
  # HighErrorRate). A "1 - success/total" formulation yields no data, and so
  # never fires, when no non-5xx series exist (e.g. a total outage).
  expr: |
    (
      sum(rate(http_responses_total{status=~"5.."}[5m]))
      /
      sum(rate(http_responses_total[5m]))
    ) > 0.01
  for: 2m
  labels:
    severity: critical
    team: platform
  annotations:
    summary: "Fast burn: error budget consumption rate is critical"
    # $value is the error ratio computed by expr.
    description: >-
      Error rate over the last 5m is {{ $value | humanizePercentage }},
      more than 10x the rate the 0.1% SLO error budget allows.
      At this rate a 30-day error budget will be exhausted in less than 3 days.
    # Same symptom as HighErrorRate, so it points at the same runbook.
    runbook: "docs/ops/runbooks/high-error-rate.md"
# Slow-burn availability alert: fires when the 5xx share of all HTTP responses
# (1h rate) stays above 0.2% for 30 minutes, i.e. more than twice the 0.1%
# error budget rate stated by HighErrorRate.
- alert: AvailabilityBurnRateSlow
  # The error ratio is computed directly from 5xx responses (same form as
  # HighErrorRate). A "1 - success/total" formulation yields no data, and so
  # never fires, when no non-5xx series exist.
  expr: |
    (
      sum(rate(http_responses_total{status=~"5.."}[1h]))
      /
      sum(rate(http_responses_total[1h]))
    ) > 0.002
  for: 30m
  labels:
    severity: warning
    team: platform
  annotations:
    summary: "Slow burn: error budget is being consumed above normal rate"
    # $value is the error ratio computed by expr.
    description: >-
      Error rate over the last 1h is {{ $value | humanizePercentage }},
      more than 2x the rate the 0.1% SLO error budget allows.
    # Same symptom as HighErrorRate, so it points at the same runbook.
    runbook: "docs/ops/runbooks/high-error-rate.md"
# --- Decision Latency ---
# Decision latency SLO alert (P99): raised when the 99th-percentile request
# duration for routes matching /api/v1/decisions.* stays above 0.200 (200ms
# per the summary) for 5 minutes.
- alert: DecisionLatencyP99High
  # Quantile is estimated from the per-`le` bucket rates over a 5m window.
  expr: >-
    histogram_quantile(
    0.99,
    sum by (le) (rate(http_request_duration_seconds_bucket{route=~"/api/v1/decisions.*"}[5m]))
    ) > 0.200
  for: 5m
  labels:
    team: platform
    severity: critical
  annotations:
    runbook: 'docs/ops/runbooks/high-latency.md'
    summary: 'Decision P99 latency exceeds 200ms SLO'
    # $value is the computed quantile (left-hand side of the comparison).
    description: 'Current P99 latency is {{ $value | humanizeDuration }}.'
# Decision latency SLO alert (P95): raised when the 95th-percentile request
# duration for routes matching /api/v1/decisions.* stays above 0.100 (100ms
# per the summary) for 5 minutes.
- alert: DecisionLatencyP95High
  # Quantile is estimated from the per-`le` bucket rates over a 5m window.
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{route=~"/api/v1/decisions.*"}[5m])) by (le)
    ) > 0.100
  for: 5m
  labels:
    severity: warning
    team: platform
  annotations:
    summary: "Decision P95 latency exceeds 100ms SLO"
    # $value is the computed quantile (left-hand side of the comparison).
    description: "Current P95 latency is {{ $value | humanizeDuration }}."
    # Same latency runbook as DecisionLatencyP99High, so responders to the
    # warning-level alert have the same starting point.
    runbook: "docs/ops/runbooks/high-latency.md"
# --- Pipeline Success ---
# Pipeline success SLO alert: raised when runs with status="failed" make up
# more than 1% of all pipeline runs (1h rate) and the condition holds for
# 15 minutes.
- alert: PipelineFailureRateHigh
  # Ratio of the failed-run rate to the total run rate.
  expr: >-
    (sum(rate(pipeline_runs_total{status="failed"}[1h]))
    / sum(rate(pipeline_runs_total[1h])))
    > 0.01
  for: 15m
  labels:
    team: data-engineering
    severity: warning
  annotations:
    runbook: 'docs/ops/runbooks/pipeline-failure.md'
    summary: 'Pipeline failure rate exceeds 1% SLO'
    # $value is the failure ratio computed by expr.
    description: 'Current failure rate is {{ $value | humanizePercentage }}.'