feat(u8a): 添加 Prometheus + Grafana + Alertmanager 监控指标链路

- prometheus.yml: 抓取 8 个后端微服务 /actuator/prometheus 端点 + 自身 + alertmanager
- rules.yml: 7 组告警规则(服务可用性/JVM/HTTP/HikariCP/熔断器)
- alertmanager.yml: 邮件 + Webhook 通知,critical/warning 分级路由,critical 抑制 warning
- grafana provisioning: Prometheus 数据源自动配置 + 仪表盘自动加载
- jvm-http-overview.json: 6 panel 仪表盘(服务状态/堆内存/CPU/HTTP 请求/P95/连接池)
- docker-compose.prod.yml: 3 个监控服务 + 3 个数据卷,Prometheus/Alertmanager 仅 expose,Grafana ports 3001
- .env.example: 新增 Grafana 管理员密码 + SMTP 文档说明
- 验证: docker compose config 通过 + 6 个 YAML/JSON 文件语法检查通过
This commit is contained in:
ether 2026-07-04 18:58:23 +08:00
parent 7fdba3a512
commit d62c1063f7
8 changed files with 690 additions and 15 deletions

View File

@ -29,6 +29,15 @@ RABBITMQ_PASSWORD=change_me_rabbitmq_password
MINIO_ACCESS_KEY=etherpms
# MINIO_SECRET_KEY 通过 Docker secrets 挂载,见 deploy/secrets/minio_secret_key.txt
# ─── Monitoring (U8a: Grafana + Alertmanager) ───
# Grafana administrator account (changing it right after the first start is recommended)
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=change_me_grafana_admin
# Alertmanager SMTP / alert-notification settings are edited directly in deploy/monitoring/alertmanager/alertmanager.yml
# The variables below are listed as deployment documentation only; Alertmanager does not read environment variables
# SMTP_HOST / SMTP_PORT / SMTP_USER / SMTP_PASSWORD
# ALERT_EMAIL_TO / ALERT_WEBHOOK_URL
# ─── Docker Secrets 文件路径 ───
# 以下文件需在部署前手动生成,存放于 deploy/secrets/ 目录
# - jwt_private_key.pem JWT 签名私钥pms-gateway / pms-auth 使用)

View File

@ -0,0 +1,74 @@
#========================================
# Alertmanager 配置(U8a)
# 告警路由与通知渠道
# 通知渠道:邮件(默认)+ Webhook(可对接飞书/钉钉/企业微信)
#
# 注意:Alertmanager 不支持环境变量替换,本文件被直接挂载到容器内。
# 修改 SMTP/Webhook 配置后,执行 `docker compose restart alertmanager` 生效,
# 或向 http://localhost:9093/-/reload 发送 POST 请求热重载。
#========================================
global:
  # SMTP mail server settings (replace with the real values at deployment time)
  smtp_smarthost: "localhost:587"
  smtp_from: "alertmanager@etherpms.com"
  smtp_auth_username: ""
  smtp_auth_password: ""
# Notification templates (optional, reserved for later extension)
templates:
  - "/etc/alertmanager/templates/*.tmpl"
# Alert routing tree
route:
  receiver: "default"  # fallback receiver
  group_by:
    - "alertname"
    - "application"
    - "cluster"
  group_wait: 30s  # wait before the first notification so alerts of one group are batched
  group_interval: 5m  # interval between notifications for the same group
  repeat_interval: 4h  # interval before a still-firing alert is sent again
  # Child routes: split by severity
  routes:
    # critical: shorter initial wait and shorter repeat interval
    - receiver: "critical"
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      repeat_interval: 1h
      continue: true  # keep evaluating the sibling routes that follow
    # warning: delivered through the default channel
    - receiver: "default"
      matchers:
        - 'severity="warning"'
# Inhibition: while a critical alert is firing for an application, mute the
# warning alerts of that same application.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    # Only `application` has to be equal. `alertname` must not be listed here:
    # it would restrict inhibition to a critical and a warning alert that share
    # the same alert name, so a critical alert could never mute a differently
    # named warning of the same application.
    equal: ['application']
# ====== Receivers ======
receivers:
  # Default receiver (warning + everything else)
  - name: "default"
    email_configs:
      - to: "ops@etherpms.com"
        headers:
          Subject: "[EtherPMS 告警] {{ .CommonLabels.alertname }}"
        send_resolved: true  # also notify when the alert recovers
  # Critical receiver (e-mail + webhook)
  - name: "critical"
    email_configs:
      - to: "ops@etherpms.com"
        headers:
          Subject: "[EtherPMS 严重告警] {{ .CommonLabels.alertname }}"
        send_resolved: true
    # Webhook: can be pointed at a Feishu / DingTalk / WeCom bot.
    # Replace with the real bot URL at deployment time; delete this section if
    # no webhook notification is needed.
    webhook_configs:
      - url: "http://127.0.0.1:9999/webhook"
        max_alerts: 0
        send_resolved: true

View File

@ -0,0 +1,285 @@
{
"annotations": {
"list": []
},
"description": "EtherPMS JVM 与 HTTP 概览(U8a 自动加载)",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"title": "服务状态",
"type": "stat",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 0
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"text": "DOWN",
"color": "red"
},
"1": {
"text": "UP",
"color": "green"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
}
},
"options": {
"reduceOptions": {
"values": false,
"calcs": ["lastNotNull"],
"fields": ""
},
"orientation": "horizontal",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center"
},
"targets": [
{
"expr": "up{job=~\"pms-.*\"}",
"legendFormat": "{{job}}",
"refId": "A"
}
]
},
{
"title": "JVM 堆内存使用率",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"min": 0,
"max": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.7 },
{ "color": "red", "value": 0.85 }
]
}
}
},
"options": {
"legend": { "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "none" }
},
"targets": [
{
"expr": "jvm_memory_used_bytes{area=\"heap\"} / jvm_memory_max_bytes{area=\"heap\"}",
"legendFormat": "{{application}}",
"refId": "A"
}
]
},
{
"title": "CPU 使用率",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 6
},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"min": 0,
"max": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.7 },
{ "color": "red", "value": 0.85 }
]
}
}
},
"options": {
"legend": { "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "none" }
},
"targets": [
{
"expr": "process_cpu_usage",
"legendFormat": "{{application}}",
"refId": "A"
}
]
},
{
"title": "HTTP 请求速率(按状态码)",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 20,
"stacking": { "mode": "normal", "group": "A" }
}
}
},
"options": {
"legend": { "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "none" }
},
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count[1m])) by (application, status)",
"legendFormat": "{{application}} {{status}}",
"refId": "A"
}
]
},
{
"title": "HTTP P95 响应时间",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
}
}
},
"options": {
"legend": { "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "none" }
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le))",
"legendFormat": "{{application}} P95",
"refId": "A"
}
]
},
{
"title": "HikariCP 连接池使用率",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 22
},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"min": 0,
"max": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.7 },
{ "color": "red", "value": 0.85 }
]
}
}
},
"options": {
"legend": { "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "none" }
},
"targets": [
{
"expr": "hikaricp_connections_active / hikaricp_connections_maximum",
"legendFormat": "{{application}}",
"refId": "A"
}
]
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["etherpms", "jvm", "http"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "JVM 与 HTTP 概览",
"uid": "jvm-http-overview",
"version": 1,
"weekStart": ""
}

View File

@ -0,0 +1,19 @@
#========================================
# Grafana 仪表盘自动加载配置(U8a)
# 从 /var/lib/grafana/dashboards/ 目录加载 JSON 仪表盘
# 仪表盘 JSON 文件放置于 deploy/monitoring/grafana/dashboards/
#========================================
apiVersion: 1
providers:
  # File-based provider: loads the dashboard JSON files found under
  # `options.path` into the Grafana folder named by `folder`.
  - name: 'EtherPMS 默认仪表盘'
    orgId: 1
    folder: 'EtherPMS'
    folderUid: etherpms-default
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      # Kept false because `folder` / `folderUid` are set above: Grafana
      # requires both to be empty for foldersFromFilesStructure to work, so
      # the two settings cannot be combined.
      foldersFromFilesStructure: false

View File

@ -0,0 +1,17 @@
#========================================
# Grafana 数据源自动配置(U8a)
# 启动时自动加载 Prometheus 数据源,无需手动 UI 配置
#========================================
apiVersion: 1
datasources:
  # Prometheus data source, provisioned at startup so no manual UI setup is needed.
  - name: Prometheus
    # Fixed uid: the provisioned dashboard panels reference this data source
    # as {"type": "prometheus", "uid": "prometheus"}. Without an explicit uid
    # Grafana generates one and those references do not resolve.
    uid: prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      httpMethod: POST
      manageAlerts: false
      timeInterval: '15s'

View File

@ -0,0 +1,93 @@
#========================================
# Prometheus 主配置(U8a)
# 抓取 8 个后端微服务的 /actuator/prometheus 端点
# 所有服务通过 pms-net 网络以服务名访问
#========================================
global:
  scrape_interval: "15s"  # default scrape interval
  evaluation_interval: "15s"  # alert rule evaluation interval
  external_labels:
    environment: "production"
    cluster: "etherpms-prod"
# Alert rule files
rule_files:
  - "/etc/prometheus/rules.yml"
# Alertmanager endpoints that receive fired alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
# Scrape targets
scrape_configs:
  # Prometheus' own metrics
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Alertmanager metrics
  - job_name: "alertmanager"
    static_configs:
      - targets:
          - "alertmanager:9093"

  # ====== EtherPMS backend microservices (8) ======
  # Each service exposes its Micrometer metrics at /actuator/prometheus.
  # The `application` label comes from metrics.tags.application in application.yml.
  - job_name: "pms-gateway"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "gateway-service"
        targets:
          - "pms-gateway:8080"

  - job_name: "pms-auth"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "auth-service"
        targets:
          - "pms-auth:8081"

  - job_name: "pms-base"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "base-service"
        targets:
          - "pms-base:8082"

  - job_name: "pms-operation"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "operation-service"
        targets:
          - "pms-operation:8083"

  - job_name: "pms-charge"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "charge-service"
        targets:
          - "pms-charge:8084"

  - job_name: "pms-notify"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "notify-service"
        targets:
          - "pms-notify:8085"

  - job_name: "pms-file"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "file-service"
        targets:
          - "pms-file:8086"

  - job_name: "pms-audit"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - labels:
          service: "audit-service"
        targets:
          - "pms-audit:8087"

View File

@ -0,0 +1,107 @@
#========================================
# Prometheus 告警规则(U8a)
# 覆盖服务可用性、JVM、HTTP、数据库连接池、熔断器
#========================================
groups:
# ====== Service availability ======
- name: service-availability
  rules:
    # A pms-* scrape target has reported up == 0 for 1 minute.
    - alert: ServiceDown
      for: 1m
      expr: 'up{job=~"pms-.*"} == 0'
      labels:
        severity: critical
      annotations:
        summary: '服务 {{ $labels.job }} 宕机'
        description: '{{ $labels.job }}(实例 {{ $labels.instance }})已离线超过 1 分钟'
    # Same condition as ServiceDown, held for 5 minutes.
    - alert: ServiceUnreachable
      for: 5m
      expr: 'up{job=~"pms-.*"} == 0'
      labels:
        severity: critical
      annotations:
        summary: '服务 {{ $labels.job }} 持续不可达'
        description: '{{ $labels.job }} 已离线 5 分钟,请立即检查'
# ====== JVM runtime ======
- name: jvm-runtime
  rules:
    # Process CPU usage above 80% for 5 minutes.
    # NOTE(review): the selector only matches series whose `application` label
    # ends in "-service" — confirm this matches the services' application tag.
    - alert: HighCpuUsage
      for: 5m
      expr: 'process_cpu_usage{application=~".*-service"} > 0.8'
      labels:
        severity: warning
      annotations:
        summary: 'CPU 使用率过高:{{ $labels.application }}'
        description: '{{ $labels.application }} CPU 使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%'
    # Used heap / max heap above 85% for 5 minutes.
    - alert: HighMemoryUsage
      for: 5m
      expr: 'jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"} > 0.85'
      labels:
        severity: warning
      annotations:
        summary: 'JVM 堆内存使用率过高:{{ $labels.application }}'
        description: '{{ $labels.application }} 堆内存使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 85%'
    # More than 500 live JVM threads for 5 minutes.
    - alert: HighThreadCount
      for: 5m
      expr: 'jvm_threads_live_threads > 500'
      labels:
        severity: warning
      annotations:
        summary: 'JVM 线程数过高:{{ $labels.application }}'
        description: '{{ $labels.application }} 活跃线程数 {{ $value }} 超过 500'
# ====== HTTP requests ======
- name: http-requests
  rules:
    # Share of 5xx responses per application above 5% for 5 minutes.
    - alert: HighErrorRate
      for: 5m
      labels:
        severity: critical
      expr: |
        sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
        / sum(rate(http_server_requests_seconds_count[5m])) by (application)
        > 0.05
      annotations:
        summary: 'HTTP 5xx 错误率过高:{{ $labels.application }}'
        description: '{{ $labels.application }} 5xx 错误率 {{ $value | humanizePercentage }} 持续 5 分钟超过 5%'
    # P95 latency per application above 2 seconds for 5 minutes.
    - alert: SlowResponseTime
      for: 5m
      labels:
        severity: warning
      expr: |
        histogram_quantile(0.95,
        sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le)
        ) > 2
      annotations:
        summary: 'HTTP P95 响应时间过长:{{ $labels.application }}'
        description: '{{ $labels.application }} P95 响应时间 {{ $value }}s 持续 5 分钟超过 2s'
# ====== Database connection pool (HikariCP) ======
- name: db-pool
  rules:
    # Active connections / maximum pool size above 80% for 5 minutes.
    - alert: HighDbPoolUsage
      for: 5m
      labels:
        severity: warning
      expr: |
        hikaricp_connections_active / hikaricp_connections_maximum > 0.8
      annotations:
        summary: '数据库连接池使用率过高:{{ $labels.application }}'
        description: '{{ $labels.application }} HikariCP 活跃连接占比 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%'
# ====== Resilience4j circuit breaker (pms-operation barrier-gate calls) ======
- name: circuit-breaker
  rules:
    # A circuit breaker has reported state="open" for 1 minute.
    - alert: CircuitBreakerOpen
      for: 1m
      expr: 'resilience4j_circuitbreaker_state{state="open"} == 1'
      labels:
        severity: critical
      annotations:
        summary: '熔断器已开启:{{ $labels.application }} / {{ $labels.name }}'
        description: '熔断器 {{ $labels.name }} 处于 OPEN 状态,下游服务不可用'

View File

@ -456,25 +456,92 @@ services:
networks:
- pms-net
# ====== 监控组件占位U8 4a/4b 补充详细配置)======
# prometheus:
# image: prom/prometheus:latest
# container_name: prod-prometheus
# ...
# grafana:
# image: grafana/grafana:latest
# container_name: prod-grafana
# ...
# alertmanager:
# image: prom/alertmanager:latest
# container_name: prod-alertmanager
# ...
# ====== 监控组件U8a指标链路======
prometheus:
  container_name: prod-prometheus
  image: "prom/prometheus:v2.51.0"
  restart: unless-stopped
  networks:
    - pms-net
  # F5: expose only, no published ports; Grafana reaches it over pms-net
  expose:
    - "9090"
  volumes:
    - "./deploy/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "./deploy/monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro"
    - "prometheus-data:/prometheus"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--storage.tsdb.retention.time=30d"
    - "--web.console.libraries=/usr/share/prometheus/console_libraries"
    - "--web.console.templates=/usr/share/prometheus/consoles"
    - "--web.enable-lifecycle"  # enables hot reload via POST /-/reload
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "--spider"
      - "http://localhost:9090/-/healthy"
    interval: 30s
    timeout: 10s
    retries: 3
alertmanager:
  container_name: prod-alertmanager
  image: "prom/alertmanager:v0.27.0"
  restart: unless-stopped
  networks:
    - pms-net
  expose:
    - "9093"
  # SMTP / webhook settings are edited directly in alertmanager.yml
  # (Alertmanager does not support environment-variable substitution)
  volumes:
    - "./deploy/monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
    - "alertmanager-data:/alertmanager"
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
    - "--web.external-url=http://localhost:9093"
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "--spider"
      - "http://localhost:9093/-/healthy"
    interval: 30s
    timeout: 10s
    retries: 3
grafana:
  image: grafana/grafana:10.4.0
  container_name: prod-grafana
  restart: unless-stopped
  environment:
    - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
    # No default on purpose: the UI is published on a host port, so Compose
    # must fail fast rather than start Grafana with a well-known password.
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set (see .env.example)}"
    - GF_USERS_ALLOW_SIGN_UP=false
    - GF_USERS_ALLOW_ORG_CREATE=false
    - GF_AUTH_ANONYMOUS_ENABLED=false
    - GF_SERVER_HTTP_PORT=3000
    - GF_INSTALL_PLUGINS=grafana-piechart-panel
  volumes:
    # Provisioning (data sources + dashboard providers) and dashboard JSON are
    # mounted read-only; runtime state lives in the grafana-data volume.
    - ./deploy/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./deploy/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    - grafana-data:/var/lib/grafana
  ports:
    # Publishes the Grafana UI (in production, fronting it with an nginx
    # reverse proxy plus an IP allow-list is recommended)
    - "3001:3000"
  networks:
    - pms-net
  # Start only after the Prometheus health check passes.
  depends_on:
    prometheus:
      condition: service_healthy
  healthcheck:
    test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 30s
# ====== 日志聚合U8bLoki + Promtail可延后 2 周落地)======
# loki:
# image: grafana/loki:latest
# image: grafana/loki:2.9.0
# container_name: prod-loki
# ...
# promtail:
# image: grafana/promtail:latest
# image: grafana/promtail:2.9.0
# container_name: prod-promtail
# ...
@ -495,6 +562,10 @@ volumes:
nacos-data:
nacos-logs:
es-data:
# U8a: 监控数据卷
prometheus-data:
alertmanager-data:
grafana-data:
# ====== 网络 ======
networks: