feat(u8a): 添加 Prometheus + Grafana + Alertmanager 监控指标链路
- prometheus.yml: 抓取 8 个后端微服务 /actuator/prometheus 端点 + 自身 + alertmanager - rules.yml: 5 组共 9 条告警规则(服务可用性/JVM/HTTP/HikariCP/熔断器) - alertmanager.yml: 邮件 + Webhook 通知,critical/warning 分级路由,critical 抑制 warning - grafana provisioning: Prometheus 数据源自动配置 + 仪表盘自动加载 - jvm-http-overview.json: 6 panel 仪表盘(服务状态/堆内存/CPU/HTTP 请求/P95/连接池) - docker-compose.prod.yml: 3 个监控服务 + 3 个数据卷,Prometheus/Alertmanager 仅 expose,Grafana ports 3001 - .env.example: 新增 Grafana 管理员密码 + SMTP 文档说明 - 验证: docker compose config 通过 + 6 个 YAML/JSON 文件语法检查通过
This commit is contained in:
parent
7fdba3a512
commit
d62c1063f7
|
|
@ -29,6 +29,15 @@ RABBITMQ_PASSWORD=change_me_rabbitmq_password
|
|||
MINIO_ACCESS_KEY=etherpms
|
||||
# MINIO_SECRET_KEY 通过 Docker secrets 挂载,见 deploy/secrets/minio_secret_key.txt
|
||||
|
||||
# ─── 监控(U8a:Grafana + Alertmanager)───
|
||||
# Grafana 管理员账号(首次启动后建议立即修改)
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=change_me_grafana_admin
|
||||
# Alertmanager SMTP/告警通知配置:在 deploy/monitoring/alertmanager/alertmanager.yml 中直接修改
|
||||
# 以下变量仅作为部署文档参考,alertmanager 不读取环境变量
|
||||
# SMTP_HOST / SMTP_PORT / SMTP_USER / SMTP_PASSWORD
|
||||
# ALERT_EMAIL_TO / ALERT_WEBHOOK_URL
|
||||
|
||||
# ─── Docker Secrets 文件路径 ───
|
||||
# 以下文件需在部署前手动生成,存放于 deploy/secrets/ 目录
|
||||
# - jwt_private_key.pem JWT 签名私钥(pms-gateway / pms-auth 使用)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
#========================================
|
||||
# Alertmanager 配置(U8a)
|
||||
# 告警路由与通知渠道
|
||||
# 通知渠道:邮件(默认)+ Webhook(可对接飞书/钉钉/企业微信)
|
||||
#
|
||||
# 注意:Alertmanager 不支持环境变量替换,本文件被直接挂载到容器内。
|
||||
# 修改 SMTP/Webhook 配置后,执行 `docker compose restart alertmanager` 生效,
|
||||
# 或向 http://localhost:9093/-/reload 发送 POST 请求热重载。
|
||||
#========================================
|
||||
global:
|
||||
# SMTP 邮件服务器配置(部署时修改为实际值)
|
||||
smtp_smarthost: 'localhost:587'
|
||||
smtp_from: 'alertmanager@etherpms.com'
|
||||
smtp_auth_username: ''
|
||||
smtp_auth_password: ''
|
||||
|
||||
# 告警模板(可选,后续扩展)
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# 告警路由规则
|
||||
route:
|
||||
group_by: ['alertname', 'application', 'cluster']
|
||||
group_wait: 30s # 首次告警等待时间(聚合同组告警)
|
||||
group_interval: 5m # 同组告警发送间隔
|
||||
repeat_interval: 4h # 重复告警间隔
|
||||
receiver: 'default' # 默认接收者
|
||||
|
||||
# 子路由:按严重级别分流
|
||||
routes:
|
||||
# critical 级别立即发送,重复间隔缩短
|
||||
- matchers:
|
||||
- severity="critical"
|
||||
receiver: 'critical'
|
||||
group_wait: 10s
|
||||
repeat_interval: 1h
|
||||
continue: true # 继续匹配其他规则
|
||||
|
||||
# warning 级别走默认渠道
|
||||
- matchers:
|
||||
- severity="warning"
|
||||
receiver: 'default'
|
||||
|
||||
# 抑制规则:critical 触发时抑制同服务的 warning
|
||||
inhibit_rules:
|
||||
- source_matchers:
|
||||
- severity="critical"
|
||||
target_matchers:
|
||||
- severity="warning"
|
||||
# Match on application only: the critical and warning rules use different
# alert names, so also requiring an equal alertname would keep this
# inhibition from ever firing.
equal: ['application']
|
||||
|
||||
# ====== 接收者 ======
|
||||
receivers:
|
||||
# 默认接收者(warning + 通用)
|
||||
- name: 'default'
|
||||
email_configs:
|
||||
- to: 'ops@etherpms.com'
|
||||
send_resolved: true # 恢复时发送通知
|
||||
headers:
|
||||
Subject: '[EtherPMS 告警] {{ .CommonLabels.alertname }}'
|
||||
|
||||
# critical 接收者(邮件 + Webhook)
|
||||
- name: 'critical'
|
||||
email_configs:
|
||||
- to: 'ops@etherpms.com'
|
||||
send_resolved: true
|
||||
headers:
|
||||
Subject: '[EtherPMS 严重告警] {{ .CommonLabels.alertname }}'
|
||||
# Webhook:可对接飞书/钉钉/企业微信机器人
|
||||
# 部署时修改为实际机器人 URL,无需通知则删除此段
|
||||
webhook_configs:
|
||||
- url: 'http://127.0.0.1:9999/webhook'
|
||||
send_resolved: true
|
||||
max_alerts: 0
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "EtherPMS JVM 与 HTTP 概览(U8a 自动加载)",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"title": "服务状态",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": ""
|
||||
},
|
||||
"orientation": "horizontal",
|
||||
"textMode": "auto",
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"pms-.*\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "JVM 堆内存使用率",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "jvm_memory_used_bytes{area=\"heap\"} / jvm_memory_max_bytes{area=\"heap\"}",
|
||||
"legendFormat": "{{application}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "CPU 使用率",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_cpu_usage",
|
||||
"legendFormat": "{{application}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "HTTP 请求速率(按状态码)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 20,
|
||||
"stacking": { "mode": "normal", "group": "A" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_server_requests_seconds_count[1m])) by (application, status)",
|
||||
"legendFormat": "{{application}} {{status}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "HTTP P95 响应时间",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le))",
|
||||
"legendFormat": "{{application}} P95",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "HikariCP 连接池使用率",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 22
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "hikaricp_connections_active / hikaricp_connections_maximum",
|
||||
"legendFormat": "{{application}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["etherpms", "jvm", "http"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "JVM 与 HTTP 概览",
|
||||
"uid": "jvm-http-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
#========================================
|
||||
# Grafana 仪表盘自动加载配置(U8a)
|
||||
# 从 /var/lib/grafana/dashboards/ 目录加载 JSON 仪表盘
|
||||
# 仪表盘 JSON 文件放置于 deploy/monitoring/grafana/dashboards/
|
||||
#========================================
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'EtherPMS 默认仪表盘'
|
||||
orgId: 1
|
||||
folder: 'EtherPMS'
|
||||
folderUid: etherpms-default
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
# Must stay false while folder/folderUid are set on this provider: Grafana only
# honours foldersFromFilesStructure when both of those options are blank.
foldersFromFilesStructure: false
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#========================================
|
||||
# Grafana 数据源自动配置(U8a)
|
||||
# 启动时自动加载 Prometheus 数据源,无需手动 UI 配置
|
||||
#========================================
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
# Fixed uid so provisioned dashboards that reference datasource uid
# "prometheus" resolve; without it Grafana generates a random uid.
uid: prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
manageAlerts: false
|
||||
timeInterval: '15s'
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
#========================================
|
||||
# Prometheus 主配置(U8a)
|
||||
# 抓取 8 个后端微服务的 /actuator/prometheus 端点
|
||||
# 所有服务通过 pms-net 网络以服务名访问
|
||||
#========================================
|
||||
global:
|
||||
scrape_interval: 15s # 默认抓取间隔
|
||||
evaluation_interval: 15s # 告警规则评估间隔
|
||||
external_labels:
|
||||
cluster: etherpms-prod
|
||||
environment: production
|
||||
|
||||
# 告警规则文件
|
||||
rule_files:
|
||||
- /etc/prometheus/rules.yml
|
||||
|
||||
# Alertmanager 配置
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
# 抓取目标
|
||||
scrape_configs:
|
||||
# Prometheus 自身指标
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Alertmanager 指标
|
||||
- job_name: 'alertmanager'
|
||||
static_configs:
|
||||
- targets: ['alertmanager:9093']
|
||||
|
||||
# ====== EtherPMS 后端微服务(8 个)======
|
||||
# 每个服务通过 /actuator/prometheus 暴露 Micrometer 指标
|
||||
# metrics.tags.application 标签由 application.yml 注入
|
||||
- job_name: 'pms-gateway'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-gateway:8080']
|
||||
labels:
|
||||
service: 'gateway-service'
|
||||
|
||||
- job_name: 'pms-auth'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-auth:8081']
|
||||
labels:
|
||||
service: 'auth-service'
|
||||
|
||||
- job_name: 'pms-base'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-base:8082']
|
||||
labels:
|
||||
service: 'base-service'
|
||||
|
||||
- job_name: 'pms-operation'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-operation:8083']
|
||||
labels:
|
||||
service: 'operation-service'
|
||||
|
||||
- job_name: 'pms-charge'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-charge:8084']
|
||||
labels:
|
||||
service: 'charge-service'
|
||||
|
||||
- job_name: 'pms-notify'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-notify:8085']
|
||||
labels:
|
||||
service: 'notify-service'
|
||||
|
||||
- job_name: 'pms-file'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-file:8086']
|
||||
labels:
|
||||
service: 'file-service'
|
||||
|
||||
- job_name: 'pms-audit'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['pms-audit:8087']
|
||||
labels:
|
||||
service: 'audit-service'
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
#========================================
|
||||
# Prometheus 告警规则(U8a)
|
||||
# 覆盖服务可用性、JVM、HTTP、数据库连接池、熔断器
|
||||
#========================================
|
||||
groups:
|
||||
# ====== 服务可用性 ======
|
||||
- name: service-availability
|
||||
rules:
|
||||
- alert: ServiceDown
|
||||
expr: up{job=~"pms-.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "服务 {{ $labels.job }} 宕机"
|
||||
description: "{{ $labels.job }}(实例 {{ $labels.instance }})已离线超过 1 分钟"
|
||||
|
||||
- alert: ServiceUnreachable
|
||||
expr: up{job=~"pms-.*"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "服务 {{ $labels.job }} 持续不可达"
|
||||
description: "{{ $labels.job }} 已离线 5 分钟,请立即检查"
|
||||
|
||||
# ====== JVM 运行时 ======
|
||||
- name: jvm-runtime
|
||||
rules:
|
||||
- alert: HighCpuUsage
|
||||
expr: process_cpu_usage{application=~".*-service"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "CPU 使用率过高:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} CPU 使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"} > 0.85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "JVM 堆内存使用率过高:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} 堆内存使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 85%"
|
||||
|
||||
- alert: HighThreadCount
|
||||
expr: jvm_threads_live_threads > 500
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "JVM 线程数过高:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} 活跃线程数 {{ $value }} 超过 500"
|
||||
|
||||
# ====== HTTP 请求 ======
|
||||
- name: http-requests
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: |
|
||||
sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
|
||||
/ sum(rate(http_server_requests_seconds_count[5m])) by (application)
|
||||
> 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "HTTP 5xx 错误率过高:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} 5xx 错误率 {{ $value | humanizePercentage }} 持续 5 分钟超过 5%"
|
||||
|
||||
- alert: SlowResponseTime
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le)
|
||||
) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "HTTP P95 响应时间过长:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} P95 响应时间 {{ $value }}s 持续 5 分钟超过 2s"
|
||||
|
||||
# ====== 数据库连接池(HikariCP)======
|
||||
- name: db-pool
|
||||
rules:
|
||||
- alert: HighDbPoolUsage
|
||||
expr: |
|
||||
hikaricp_connections_active / hikaricp_connections_maximum > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "数据库连接池使用率过高:{{ $labels.application }}"
|
||||
description: "{{ $labels.application }} HikariCP 活跃连接占比 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"
|
||||
|
||||
# ====== Resilience4j 熔断器(pms-operation 道闸调用)======
|
||||
- name: circuit-breaker
|
||||
rules:
|
||||
- alert: CircuitBreakerOpen
|
||||
expr: resilience4j_circuitbreaker_state{state="open"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "熔断器已开启:{{ $labels.application }} / {{ $labels.name }}"
|
||||
description: "熔断器 {{ $labels.name }} 处于 OPEN 状态,下游服务不可用"
|
||||
|
|
@ -456,25 +456,92 @@ services:
|
|||
networks:
|
||||
- pms-net
|
||||
|
||||
# ====== 监控组件占位(U8 4a/4b 补充详细配置)======
|
||||
# prometheus:
|
||||
# image: prom/prometheus:latest
|
||||
# container_name: prod-prometheus
|
||||
# ...
|
||||
# grafana:
|
||||
# image: grafana/grafana:latest
|
||||
# container_name: prod-grafana
|
||||
# ...
|
||||
# alertmanager:
|
||||
# image: prom/alertmanager:latest
|
||||
# container_name: prod-alertmanager
|
||||
# ...
|
||||
# ====== 监控组件(U8a:指标链路)======
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.51.0
|
||||
container_name: prod-prometheus
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./deploy/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./deploy/monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||
- '--web.enable-lifecycle' # 支持 POST /-/reload 热重载
|
||||
# F5: 仅 expose 不 ports,Grafana 通过 pms-net 访问
|
||||
expose:
|
||||
- "9090"
|
||||
networks:
|
||||
- pms-net
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:v0.27.0
|
||||
container_name: prod-alertmanager
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./deploy/monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
- alertmanager-data:/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.external-url=http://localhost:9093'
|
||||
# SMTP/Webhook 配置在 alertmanager.yml 中直接修改(alertmanager 不支持环境变量替换)
|
||||
expose:
|
||||
- "9093"
|
||||
networks:
|
||||
- pms-net
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:9093/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.0
|
||||
container_name: prod-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
|
||||
# Fail fast when the password is missing instead of falling back to "admin":
# this service publishes the Grafana UI on a host port.
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
- GF_USERS_ALLOW_ORG_CREATE=false
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=false
|
||||
- GF_SERVER_HTTP_PORT=3000
|
||||
- GF_INSTALL_PLUGINS=grafana-piechart-panel
|
||||
volumes:
|
||||
- ./deploy/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./deploy/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
- grafana-data:/var/lib/grafana
|
||||
ports:
|
||||
- "3001:3000" # 对外暴露 Grafana UI(生产环境建议通过 nginx 反代 + IP 白名单限制)
|
||||
networks:
|
||||
- pms-net
|
||||
depends_on:
|
||||
prometheus:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
# ====== 日志聚合(U8b:Loki + Promtail,可延后 2 周落地)======
|
||||
# loki:
|
||||
# image: grafana/loki:latest
|
||||
# image: grafana/loki:2.9.0
|
||||
# container_name: prod-loki
|
||||
# ...
|
||||
# promtail:
|
||||
# image: grafana/promtail:latest
|
||||
# image: grafana/promtail:2.9.0
|
||||
# container_name: prod-promtail
|
||||
# ...
|
||||
|
||||
|
|
@ -495,6 +562,10 @@ volumes:
|
|||
nacos-data:
|
||||
nacos-logs:
|
||||
es-data:
|
||||
# U8a: 监控数据卷
|
||||
prometheus-data:
|
||||
alertmanager-data:
|
||||
grafana-data:
|
||||
|
||||
# ====== 网络 ======
|
||||
networks:
|
||||
|
|
|
|||
Loading…
Reference in New Issue