ether/deploy/monitoring/prometheus/rules.yml

108 lines
4.0 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#========================================
# Prometheus alerting rules (U8a)
# Covers service availability, JVM, HTTP, database connection pool, and circuit breakers
#========================================
groups:
# ====== Service availability ======
- name: service-availability
  rules:
  # Early signal: a scrape target whose job name matches pms-.* has
  # reported up == 0 continuously for 1 minute.
  - alert: ServiceDown
    expr: 'up{job=~"pms-.*"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '服务 {{ $labels.job }} 宕机'
      description: '{{ $labels.job }}(实例 {{ $labels.instance }})已离线超过 1 分钟'
  # Same expression as ServiceDown with a longer hold time (5 minutes);
  # fires in addition to ServiceDown once the outage persists.
  - alert: ServiceUnreachable
    expr: 'up{job=~"pms-.*"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: '服务 {{ $labels.job }} 持续不可达'
      description: '{{ $labels.job }} 已离线 5 分钟,请立即检查'
# ====== JVM runtime ======
- name: jvm-runtime
  rules:
  # process_cpu_usage above 0.8 for 5 minutes, limited to series whose
  # application label ends in "-service".
  - alert: HighCpuUsage
    expr: 'process_cpu_usage{application=~".*-service"} > 0.8'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'CPU 使用率过高:{{ $labels.application }}'
      description: '{{ $labels.application }} CPU 使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%'
  # Whole-heap utilisation above 85% for 5 minutes.
  # The heap series are summed per (application, instance) before dividing
  # so the ratio describes the entire heap. Dividing the raw series would
  # evaluate every memory pool (one series per pool id) on its own and could
  # raise one alert per pool instead of one per JVM.
  - alert: HighMemoryUsage
    expr: |
      sum by (application, instance) (jvm_memory_used_bytes{area="heap"})
        / sum by (application, instance) (jvm_memory_max_bytes{area="heap"})
        > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'JVM 堆内存使用率过高:{{ $labels.application }}'
      description: '{{ $labels.application }} 堆内存使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 85%'
  # More than 500 live JVM threads for 5 minutes.
  - alert: HighThreadCount
    expr: 'jvm_threads_live_threads > 500'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'JVM 线程数过高:{{ $labels.application }}'
      description: '{{ $labels.application }} 活跃线程数 {{ $value }} 超过 500'
# ====== HTTP requests ======
- name: http-requests
  rules:
  # Per-application share of requests with a 5xx status, computed from
  # 5-minute request rates; fires when it stays above 5% for 5 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
      / sum(rate(http_server_requests_seconds_count[5m])) by (application)
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'HTTP 5xx 错误率过高:{{ $labels.application }}'
      description: '{{ $labels.application }} 5xx 错误率 {{ $value | humanizePercentage }} 持续 5 分钟超过 5%'
  # Per-application 95th-percentile latency estimated from the request
  # duration histogram buckets; fires when it stays above 2 seconds for
  # 5 minutes.
  - alert: SlowResponseTime
    expr: |
      histogram_quantile(0.95,
      sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le)
      ) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'HTTP P95 响应时间过长:{{ $labels.application }}'
      description: '{{ $labels.application }} P95 响应时间 {{ $value }}s 持续 5 分钟超过 2s'
# ====== Database connection pool (HikariCP) ======
- name: db-pool
  rules:
  # Share of the pool's maximum size that is currently checked out; fires
  # when it stays above 80% for 5 minutes.
  # The denominator is hikaricp_connections_max, the gauge name exported by
  # HikariCP's Micrometer tracker. The previous name
  # hikaricp_connections_maximum matches no exported series, so the ratio
  # was always empty and the alert could never fire.
  - alert: HighDbPoolUsage
    expr: |
      hikaricp_connections_active / hikaricp_connections_max > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '数据库连接池使用率过高:{{ $labels.application }}'
      description: '{{ $labels.application }} HikariCP 活跃连接占比 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%'
# ====== Resilience4j circuit breaker (pms-operation barrier-gate calls) ======
- name: circuit-breaker
  rules:
  # Fires when the state="open" series of a circuit breaker has been equal
  # to 1 for 1 minute.
  - alert: CircuitBreakerOpen
    expr: 'resilience4j_circuitbreaker_state{state="open"} == 1'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '熔断器已开启:{{ $labels.application }} / {{ $labels.name }}'
      description: '熔断器 {{ $labels.name }} 处于 OPEN 状态,下游服务不可用'