# ========================================
# Prometheus alerting rules (U8a)
# Covers service availability, JVM, HTTP, database connection pool,
# and circuit breakers
# ========================================
groups:
# ====== Service availability ======
# Both rules watch the `up` series of every job whose name matches `pms-.*`;
# they differ only in how long the value must stay at 0 before firing.
- name: service-availability
  rules:
  # Fires once a matching target has reported up == 0 for 1 minute.
  - alert: ServiceDown
    expr: 'up{job=~"pms-.*"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '服务 {{ $labels.job }} 宕机'
      description: '{{ $labels.job }}(实例 {{ $labels.instance }})已离线超过 1 分钟'

  # Fires once the same condition has held for 5 minutes.
  # NOTE(review): expression and severity are identical to ServiceDown, so
  # both alerts are active together after 5 minutes - confirm that the
  # second critical notification is intended.
  - alert: ServiceUnreachable
    expr: 'up{job=~"pms-.*"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: '服务 {{ $labels.job }} 持续不可达'
      description: '{{ $labels.job }} 已离线 5 分钟,请立即检查'

# ====== JVM runtime ======
# CPU, heap and live-thread thresholds. Every rule must hold for 5 minutes
# before it fires, and all of them are warnings.
- name: jvm-runtime
  rules:
  # process_cpu_usage above 0.8 (reported as 80% in the description).
  # NOTE(review): the selector only matches `application` values ending in
  # "-service", whereas the availability rules match job names starting
  # with "pms-" - confirm the application labels really carry that suffix.
  - alert: HighCpuUsage
    expr: process_cpu_usage{application=~".*-service"} > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "CPU 使用率过高:{{ $labels.application }}"
      description: "{{ $labels.application }} CPU 使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"

  # Whole-heap utilisation above 85%.
  # The heap series carry one time series per memory pool (label `id`), so
  # they are summed across pools before dividing; otherwise the rule would
  # evaluate each pool separately and could raise several alerts per
  # instance. `sum without (id)` keeps every other label, so
  # `application` stays available to the annotations.
  # Pools whose max is not positive (the JVM reports -1 when a pool has no
  # defined maximum) are excluded from the denominator.
  - alert: HighMemoryUsage
    expr: |
      sum without (id) (jvm_memory_used_bytes{area="heap"})
        / sum without (id) (jvm_memory_max_bytes{area="heap"} > 0)
        > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "JVM 堆内存使用率过高:{{ $labels.application }}"
      description: "{{ $labels.application }} 堆内存使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 85%"

  # More than 500 live threads.
  - alert: HighThreadCount
    expr: jvm_threads_live_threads > 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "JVM 线程数过高:{{ $labels.application }}"
      description: "{{ $labels.application }} 活跃线程数 {{ $value }} 超过 500"

# ====== HTTP requests ======
# Error-ratio and latency rules built on the http_server_requests_seconds
# series, aggregated per `application` over a 5-minute rate window.
- name: http-requests
  rules:
  # Share of requests with a 5xx status above 5% for 5 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
      / sum(rate(http_server_requests_seconds_count[5m])) by (application)
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'HTTP 5xx 错误率过高:{{ $labels.application }}'
      description: '{{ $labels.application }} 5xx 错误率 {{ $value | humanizePercentage }} 持续 5 分钟超过 5%'

  # 95th-percentile latency, estimated from the histogram buckets, above
  # 2 seconds for 5 minutes.
  - alert: SlowResponseTime
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le)
      ) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'HTTP P95 响应时间过长:{{ $labels.application }}'
      description: '{{ $labels.application }} P95 响应时间 {{ $value }}s 持续 5 分钟超过 2s'

# ====== Database connection pool (HikariCP) ======
- name: db-pool
  rules:
  # Active connections above 80% of the pool's configured maximum for
  # 5 minutes.
  # The denominator is `hikaricp_connections_max`: HikariCP's Micrometer
  # tracker publishes the pool limit as `hikaricp.connections.max`. The
  # previous name `hikaricp_connections_maximum` matches no exported
  # series, so the division returned nothing and the alert could not fire.
  - alert: HighDbPoolUsage
    expr: |
      hikaricp_connections_active / hikaricp_connections_max > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "数据库连接池使用率过高:{{ $labels.application }}"
      description: "{{ $labels.application }} HikariCP 活跃连接占比 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"

# ====== Resilience4j circuit breaker (pms-operation barrier-gate calls) ======
- name: circuit-breaker
  rules:
  # Fires when a breaker's `state="open"` series has been 1 for 1 minute.
  # The annotations identify the breaker through its `application` and
  # `name` labels.
  - alert: CircuitBreakerOpen
    expr: 'resilience4j_circuitbreaker_state{state="open"} == 1'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '熔断器已开启:{{ $labels.application }} / {{ $labels.name }}'
      description: '熔断器 {{ $labels.name }} 处于 OPEN 状态,下游服务不可用'