feat(u8a): 添加 Prometheus + Grafana + Alertmanager 监控指标链路

- prometheus.yml: 抓取 8 个后端微服务 /actuator/prometheus 端点 + 自身 + alertmanager - rules.yml: 5 组共 9 条告警规则（服务可用性/JVM/HTTP/HikariCP/熔断器） - alertmanager.yml: 邮件 + Webhook 通知，critical/warning 分级路由，critical 抑制 warning - grafana provisioning: Prometheus 数据源自动配置 + 仪表盘自动加载 - jvm-http-overview.json: 6 panel 仪表盘（服务状态/堆内存/CPU/HTTP 请求/P95/连接池） - docker-compose.prod.yml: 3 个监控服务 + 3 个数据卷，Prometheus/Alertmanager 仅 expose，Grafana ports 3001 - .env.example: 新增 Grafana 管理员密码 + SMTP 文档说明 - 验证: docker compose config 通过 + 6 个 YAML/JSON 文件语法检查通过
2026-07-04 18:58:23 +08:00 · 2026-07-04 18:58:23 +08:00 · d62c1063f7
parent 7fdba3a512
commit d62c1063f7
8 changed files with 690 additions and 15 deletions
--- a/.env.example
+++ b/.env.example
@ -29,6 +29,15 @@ RABBITMQ_PASSWORD=change_me_rabbitmq_password
 MINIO_ACCESS_KEY=etherpms
 # MINIO_SECRET_KEY 通过 Docker secrets 挂载，见 deploy/secrets/minio_secret_key.txt

+# ─── Monitoring (U8a: Grafana + Alertmanager) ───
+# Grafana admin account (change it right after the first start)
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=change_me_grafana_admin
+# Alertmanager SMTP/notification settings are edited directly in deploy/monitoring/alertmanager/alertmanager.yml
+# The names below are deployment documentation only; Alertmanager does not read environment variables
+# SMTP_HOST / SMTP_PORT / SMTP_USER / SMTP_PASSWORD
+# ALERT_EMAIL_TO / ALERT_WEBHOOK_URL
+
 # ─── Docker Secrets 文件路径 ───
 # 以下文件需在部署前手动生成，存放于 deploy/secrets/ 目录
 # - jwt_private_key.pem    JWT 签名私钥（pms-gateway / pms-auth 使用）
--- a/deploy/monitoring/alertmanager/alertmanager.yml
+++ b/deploy/monitoring/alertmanager/alertmanager.yml
@ -0,0 +1,74 @@
+#========================================
+# Alertmanager configuration (U8a)
+# Alert routing and notification channels
+# Channels: email (default) + webhook (can front Feishu/DingTalk/WeCom bots)
+#
+# Note: Alertmanager does not expand environment variables; this file is mounted into the container as-is.
+#       After editing SMTP/webhook settings run `docker compose restart alertmanager`,
+#       or POST to http://localhost:9093/-/reload from inside the container (9093 is exposed, not published).
+#========================================
+global:
+  # SMTP server settings (placeholders; replace with real values at deploy time)
+  smtp_smarthost: 'localhost:587'
+  smtp_from: 'alertmanager@etherpms.com'
+  smtp_auth_username: ''
+  smtp_auth_password: ''
+
+# Notification templates (optional, for later extension)
+templates:
+  - '/etc/alertmanager/templates/*.tmpl'
+
+# Alert routing tree
+route:
+  group_by: ['alertname', 'application', 'cluster']
+  group_wait: 30s         # wait before the first notification so alerts of one group are batched
+  group_interval: 5m      # interval between notifications for the same group
+  repeat_interval: 4h     # interval for re-sending an alert that is still firing
+  receiver: 'default'     # fallback receiver
+
+  # Child routes: split by severity
+  routes:
+    # critical: shorter initial wait and shorter repeat interval
+    - matchers:
+        - severity="critical"
+      receiver: 'critical'
+      group_wait: 10s
+      repeat_interval: 1h
+      continue: true       # keep evaluating the sibling routes below
+
+    # warning: delivered through the default receiver
+    - matchers:
+        - severity="warning"
+      receiver: 'default'
+
+# Inhibition: a firing critical alert mutes warning alerts of the same application
+inhibit_rules:
+  - source_matchers:
+      - severity="critical"
+    target_matchers:
+      - severity="warning"
+    equal: ['application']  # 'alertname' omitted: critical and warning rules never share an alert name
+
+# ====== Receivers ======
+receivers:
+  # Default receiver (warning + everything unmatched)
+  - name: 'default'
+    email_configs:
+      - to: 'ops@etherpms.com'
+        send_resolved: true   # also notify when the alert resolves
+        headers:
+          Subject: '[EtherPMS 告警] {{ .CommonLabels.alertname }}'
+
+  # critical receiver (email + webhook)
+  - name: 'critical'
+    email_configs:
+      - to: 'ops@etherpms.com'
+        send_resolved: true
+        headers:
+          Subject: '[EtherPMS 严重告警] {{ .CommonLabels.alertname }}'
+    # Webhook: can front Feishu/DingTalk/WeCom bots
+    # Replace with the real bot URL at deploy time; delete this section if no webhook is needed
+    webhook_configs:
+      - url: 'http://127.0.0.1:9999/webhook'
+        send_resolved: true
+        max_alerts: 0
--- a/deploy/monitoring/grafana/dashboards/jvm-http-overview.json
+++ b/deploy/monitoring/grafana/dashboards/jvm-http-overview.json
@ -0,0 +1,285 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "description": "EtherPMS JVM 与 HTTP 概览（U8a 自动加载）",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "title": "服务状态",
+      "type": "stat",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [
+            {
+              "options": {
+                "0": {
+                  "text": "DOWN",
+                  "color": "red"
+                },
+                "1": {
+                  "text": "UP",
+                  "color": "green"
+                }
+              },
+              "type": "value"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": {
+          "values": false,
+          "calcs": ["lastNotNull"],
+          "fields": ""
+        },
+        "orientation": "horizontal",
+        "textMode": "auto",
+        "colorMode": "background",
+        "graphMode": "none",
+        "justifyMode": "center"
+      },
+      "targets": [
+        {
+          "expr": "up{job=~\"pms-.*\"}",
+          "legendFormat": "{{job}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "title": "JVM 堆内存使用率",
+      "type": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 6
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.7 },
+              { "color": "red", "value": 0.85 }
+            ]
+          }
+        }
+      },
+      "options": {
+        "legend": { "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "none" }
+      },
+      "targets": [
+        {
+          "expr": "sum by (application, instance) (jvm_memory_used_bytes{area=\"heap\"}) / sum by (application, instance) (jvm_memory_max_bytes{area=\"heap\"})",
+          "legendFormat": "{{application}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "title": "CPU 使用率",
+      "type": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 6
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.7 },
+              { "color": "red", "value": 0.85 }
+            ]
+          }
+        }
+      },
+      "options": {
+        "legend": { "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "none" }
+      },
+      "targets": [
+        {
+          "expr": "process_cpu_usage",
+          "legendFormat": "{{application}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "title": "HTTP 请求速率（按状态码）",
+      "type": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "fillOpacity": 20,
+            "stacking": { "mode": "normal", "group": "A" }
+          }
+        }
+      },
+      "options": {
+        "legend": { "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "none" }
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(http_server_requests_seconds_count[1m])) by (application, status)",
+          "legendFormat": "{{application}} {{status}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "title": "HTTP P95 响应时间",
+      "type": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 1 },
+              { "color": "red", "value": 2 }
+            ]
+          }
+        }
+      },
+      "options": {
+        "legend": { "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "none" }
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le))",
+          "legendFormat": "{{application}} P95",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "title": "HikariCP 连接池使用率",
+      "type": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 22
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.7 },
+              { "color": "red", "value": 0.85 }
+            ]
+          }
+        }
+      },
+      "options": {
+        "legend": { "displayMode": "table", "placement": "bottom" },
+        "tooltip": { "mode": "multi", "sort": "none" }
+      },
+      "targets": [
+        {
+          "expr": "hikaricp_connections_active / hikaricp_connections_max",
+          "legendFormat": "{{application}}",
+          "refId": "A"
+        }
+      ]
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["etherpms", "jvm", "http"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "JVM 与 HTTP 概览",
+  "uid": "jvm-http-overview",
+  "version": 1,
+  "weekStart": ""
+}
--- a/deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml
+++ b/deploy/monitoring/grafana/provisioning/dashboards/dashboards.yml
@ -0,0 +1,19 @@
+#========================================
+# Grafana dashboard provisioning (U8a)
+# Loads JSON dashboards from /var/lib/grafana/dashboards/ inside the container
+# Dashboard JSON files live in deploy/monitoring/grafana/dashboards/
+#========================================
+apiVersion: 1
+
+providers:
+  - name: 'EtherPMS 默认仪表盘'
+    orgId: 1
+    folder: 'EtherPMS'
+    folderUid: etherpms-default
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 30
+    allowUiUpdates: true
+    options:
+      path: /var/lib/grafana/dashboards
+      foldersFromFilesStructure: false  # keep false while folder/folderUid are set; the two modes are mutually exclusive
--- a/deploy/monitoring/grafana/provisioning/datasources/prometheus.yml
+++ b/deploy/monitoring/grafana/provisioning/datasources/prometheus.yml
@ -0,0 +1,17 @@
+#========================================
+# Grafana datasource provisioning (U8a): registers Prometheus at startup, no manual UI setup
+#========================================
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    uid: prometheus  # fixed UID: the provisioned dashboard panels reference datasource uid "prometheus"
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+    jsonData:
+      httpMethod: POST
+      manageAlerts: false
+      timeInterval: '15s'
--- a/deploy/monitoring/prometheus/prometheus.yml
+++ b/deploy/monitoring/prometheus/prometheus.yml
@ -0,0 +1,93 @@
+#========================================
+# Prometheus main configuration (U8a)
+# Scrapes the /actuator/prometheus endpoint of the 8 backend microservices
+# All services are reached by service name over the pms-net network
+#========================================
+global:
+  scrape_interval: 15s        # default scrape interval
+  evaluation_interval: 15s    # alerting rule evaluation interval
+  external_labels:
+    cluster: etherpms-prod
+    environment: production
+
+# Alerting rule files
+rule_files:
+  - /etc/prometheus/rules.yml
+
+# Alertmanager endpoints
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
+# Scrape targets
+scrape_configs:
+  # Prometheus' own metrics
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # Alertmanager metrics
+  - job_name: 'alertmanager'
+    static_configs:
+      - targets: ['alertmanager:9093']
+
+  # ====== EtherPMS backend microservices (8) ======
+  # Each service exposes Micrometer metrics at /actuator/prometheus
+  # The metrics.tags.application tag is injected by application.yml
+  - job_name: 'pms-gateway'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-gateway:8080']
+        labels:
+          service: 'gateway-service'
+
+  - job_name: 'pms-auth'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-auth:8081']
+        labels:
+          service: 'auth-service'
+
+  - job_name: 'pms-base'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-base:8082']
+        labels:
+          service: 'base-service'
+
+  - job_name: 'pms-operation'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-operation:8083']
+        labels:
+          service: 'operation-service'
+
+  - job_name: 'pms-charge'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-charge:8084']
+        labels:
+          service: 'charge-service'
+
+  - job_name: 'pms-notify'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-notify:8085']
+        labels:
+          service: 'notify-service'
+
+  - job_name: 'pms-file'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-file:8086']
+        labels:
+          service: 'file-service'
+
+  - job_name: 'pms-audit'
+    metrics_path: '/actuator/prometheus'
+    static_configs:
+      - targets: ['pms-audit:8087']
+        labels:
+          service: 'audit-service'
--- a/deploy/monitoring/prometheus/rules.yml
+++ b/deploy/monitoring/prometheus/rules.yml
@ -0,0 +1,107 @@
+#========================================
+# Prometheus alerting rules (U8a)
+# Covers service availability, JVM, HTTP, database connection pool, circuit breaker
+#========================================
+groups:
+  # ====== Service availability ======
+  - name: service-availability
+    rules:
+      - alert: ServiceDown
+        expr: up{job=~"pms-.*"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "服务 {{ $labels.job }} 宕机"
+          description: "{{ $labels.job }}（实例 {{ $labels.instance }}）已离线超过 1 分钟"
+
+      - alert: ServiceUnreachable  # NOTE(review): same expr and severity as ServiceDown; only `for` differs
+        expr: up{job=~"pms-.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "服务 {{ $labels.job }} 持续不可达"
+          description: "{{ $labels.job }} 已离线 5 分钟，请立即检查"
+
+  # ====== JVM runtime ======
+  - name: jvm-runtime
+    rules:
+      - alert: HighCpuUsage
+        expr: process_cpu_usage{application=~".*-service"} > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU 使用率过高：{{ $labels.application }}"
+          description: "{{ $labels.application }} CPU 使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"
+
+      - alert: HighMemoryUsage  # summed over heap pools: the metric has one series per pool, and a single pool can sit near its max
+        expr: sum by (application, instance) (jvm_memory_used_bytes{area="heap"}) / sum by (application, instance) (jvm_memory_max_bytes{area="heap"}) > 0.85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "JVM 堆内存使用率过高：{{ $labels.application }}"
+          description: "{{ $labels.application }} 堆内存使用率 {{ $value | humanizePercentage }} 持续 5 分钟超过 85%"
+
+      - alert: HighThreadCount
+        expr: jvm_threads_live_threads > 500
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "JVM 线程数过高：{{ $labels.application }}"
+          description: "{{ $labels.application }} 活跃线程数 {{ $value }} 超过 500"
+
+  # ====== HTTP requests ======
+  - name: http-requests
+    rules:
+      - alert: HighErrorRate  # share of 5xx responses among all requests, per application
+        expr: |
+          sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
+          / sum(rate(http_server_requests_seconds_count[5m])) by (application)
+          > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "HTTP 5xx 错误率过高：{{ $labels.application }}"
+          description: "{{ $labels.application }} 5xx 错误率 {{ $value | humanizePercentage }} 持续 5 分钟超过 5%"
+
+      - alert: SlowResponseTime  # NOTE(review): needs the services to export histogram buckets — confirm
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(http_server_requests_seconds_bucket[5m])) by (application, le)
+          ) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "HTTP P95 响应时间过长：{{ $labels.application }}"
+          description: "{{ $labels.application }} P95 响应时间 {{ $value }}s 持续 5 分钟超过 2s"
+
+  # ====== Database connection pool (HikariCP) ======
+  - name: db-pool
+    rules:
+      - alert: HighDbPoolUsage  # pool size gauge is hikaricp_connections_max (no "_maximum" series exists)
+        expr: |
+          hikaricp_connections_active / hikaricp_connections_max > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "数据库连接池使用率过高：{{ $labels.application }}"
+          description: "{{ $labels.application }} HikariCP 活跃连接占比 {{ $value | humanizePercentage }} 持续 5 分钟超过 80%"
+
+  # ====== Resilience4j circuit breaker (pms-operation barrier-gate calls) ======
+  - name: circuit-breaker
+    rules:
+      - alert: CircuitBreakerOpen
+        expr: resilience4j_circuitbreaker_state{state="open"} == 1
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "熔断器已开启：{{ $labels.application }} / {{ $labels.name }}"
+          description: "熔断器 {{ $labels.name }} 处于 OPEN 状态，下游服务不可用"
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@ -456,25 +456,92 @@ services:
    networks:
      - pms-net

-  # ====== 监控组件占位（U8 4a/4b 补充详细配置）======
-  # prometheus:
-  #   image: prom/prometheus:latest
-  #   container_name: prod-prometheus
-  #   ...
-  # grafana:
-  #   image: grafana/grafana:latest
-  #   container_name: prod-grafana
-  #   ...
-  # alertmanager:
-  #   image: prom/alertmanager:latest
-  #   container_name: prod-alertmanager
-  #   ...
+  # ====== Monitoring stack (U8a: metrics pipeline) ======
+  prometheus:
+    image: prom/prometheus:v2.51.0
+    container_name: prod-prometheus
+    restart: unless-stopped
+    volumes:
+      - ./deploy/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./deploy/monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro
+      - prometheus-data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+      - '--web.enable-lifecycle'  # allows hot reload via POST /-/reload
+    # F5: expose only, no published ports; Grafana reaches Prometheus over pms-net
+    expose:
+      - "9090"
+    networks:
+      - pms-net
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    container_name: prod-alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./deploy/monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager-data:/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+      - '--web.external-url=http://localhost:9093'
+    # SMTP/webhook settings are edited directly in alertmanager.yml (Alertmanager does not expand environment variables)
+    expose:
+      - "9093"
+    networks:
+      - pms-net
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:9093/-/healthy"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  grafana:
+    image: grafana/grafana:10.4.0
+    container_name: prod-grafana
+    restart: unless-stopped
+    environment:
+      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}  # no admin/admin fallback: the UI is published on the host
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_USERS_ALLOW_ORG_CREATE=false
+      - GF_AUTH_ANONYMOUS_ENABLED=false
+      - GF_SERVER_HTTP_PORT=3000
+      - GF_INSTALL_PLUGINS=grafana-piechart-panel
+    volumes:
+      - ./deploy/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./deploy/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
+      - grafana-data:/var/lib/grafana
+    ports:
+      - "3001:3000"  # publishes the Grafana UI (in production, front it with an nginx reverse proxy + IP allowlist)
+    networks:
+      - pms-net
+    depends_on:
+      prometheus:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
+
+  # ====== 日志聚合（U8b：Loki + Promtail，可延后 2 周落地）======
  # loki:
-  #   image: grafana/loki:latest
+  #   image: grafana/loki:2.9.0
  #   container_name: prod-loki
  #   ...
  # promtail:
-  #   image: grafana/promtail:latest
+  #   image: grafana/promtail:2.9.0
  #   container_name: prod-promtail
  #   ...

@ -495,6 +562,10 @@ volumes:
  nacos-data:
  nacos-logs:
  es-data:
+  # U8a: monitoring data volumes
+  prometheus-data:
+  alertmanager-data:
+  grafana-data:

 # ====== 网络 ======
 networks: