260322:1648 Correct Correspondence / Doing RFA / Correct CI

2026-03-22 16:48:12 +07:00
parent e5deedb42e
commit 11984bfa29
683 changed files with 105251 additions and 29068 deletions
@@ -1,4 +1,5 @@
 # 04.3 Monitoring & Alerting
+
 **Project:** LCBP3-DMS
 **Version:** 1.8.0
 **Status:** Active
@@ -78,12 +79,7 @@ This document describes monitoring setup, health checks, and alerting rules for
 ```typescript
 // File: backend/src/health/health.controller.ts
 import { Controller, Get } from '@nestjs/common';
-import {
-  HealthCheck,
-  HealthCheckService,
-  TypeOrmHealthIndicator,
-  DiskHealthIndicator,
-} from '@nestjs/terminus';
+import { HealthCheck, HealthCheckService, TypeOrmHealthIndicator, DiskHealthIndicator } from '@nestjs/terminus';

@Controller('health')
 export class HealthController {
@@ -208,12 +204,7 @@ done

 ```typescript
 // File: backend/src/common/interceptors/performance.interceptor.ts
-import {
-  Injectable,
-  NestInterceptor,
-  ExecutionContext,
-  CallHandler,
-} from '@nestjs/common';
+import { Injectable, NestInterceptor, ExecutionContext, CallHandler } from '@nestjs/common';
 import { Observable } from 'rxjs';
 import { tap } from 'rxjs/operators';
 import { logger } from 'src/config/logger.config';
@@ -460,7 +451,6 @@ ab -n 1000 -c 10 \
 **Last Review:** 2025-12-01
 **Next Review:** 2026-03-01

-
 ---

 # การติดตั้ง Monitoring Stack บน ASUSTOR
@@ -472,15 +462,15 @@ ab -n 1000 -c 10 \

 Stack สำหรับ Monitoring ประกอบด้วย:

-| Service           | Port                         | Purpose                           | Host    |
-| :---------------- | :--------------------------- | :-------------------------------- | :------ |
+| Service           | Port                         | Purpose                            | Host    |
+| :---------------- | :--------------------------- | :--------------------------------- | :------ |
 | **Prometheus**    | 9090                         | เก็บ Metrics และ Time-series data  | ASUSTOR |
-| **Grafana**       | 3000                         | Dashboard สำหรับแสดงผล Metrics      | ASUSTOR |
+| **Grafana**       | 3000                         | Dashboard สำหรับแสดงผล Metrics     | ASUSTOR |
 | **Node Exporter** | 9100                         | เก็บ Metrics ของ Host system       | Both    |
 | **cAdvisor**      | 8080 (ASUSTOR) / 8088 (QNAP) | เก็บ Metrics ของ Docker containers | Both    |
-| **Uptime Kuma**   | 3001                         | Service Availability Monitoring   | ASUSTOR |
-| **Loki**          | 3100                         | Log aggregation                   | ASUSTOR |
-| **Promtail**      | -                            | Log shipper (Sender)              | ASUSTOR |
+| **Uptime Kuma**   | 3001                         | Service Availability Monitoring    | ASUSTOR |
+| **Loki**          | 3100                         | Log aggregation                    | ASUSTOR |
+| **Promtail**      | -                            | Log shipper (Sender)               | ASUSTOR |

 ---

@@ -613,10 +603,10 @@ x-restart: &restart_policy

 x-logging: &default_logging
  logging:
-    driver: "json-file"
+    driver: 'json-file'
    options:
-      max-size: "10m"
-      max-file: "5"
+      max-size: '10m'
+      max-file: '5'

 networks:
  lcbp3:
@@ -635,27 +625,27 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "1.0"
+          cpus: '1.0'
          memory: 1G
        reservations:
-          cpus: "0.25"
+          cpus: '0.25'
          memory: 256M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
-      - "9090:9090"
+      - '9090:9090'
    networks:
      - lcbp3
    volumes:
-      - "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
-      - "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
+      - '/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro'
+      - '/volume1/np-dms/monitoring/prometheus/data:/prometheus'
    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
+      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -672,27 +662,27 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "1.0"
+          cpus: '1.0'
          memory: 512M
        reservations:
-          cpus: "0.25"
+          cpus: '0.25'
          memory: 128M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
      GF_SECURITY_ADMIN_USER: admin
-      GF_SECURITY_ADMIN_PASSWORD: "Center#2025"
-      GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
+      GF_SECURITY_ADMIN_PASSWORD: 'Center#2025'
+      GF_SERVER_ROOT_URL: 'https://grafana.np-dms.work'
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
    ports:
-      - "3000:3000"
+      - '3000:3000'
    networks:
      - lcbp3
    volumes:
-      - "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
+      - '/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana'
    depends_on:
      - prometheus
    healthcheck:
-      test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
+      test: ['CMD-SHELL', 'wget --spider -q http://localhost:3000/api/health || exit 1']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -707,18 +697,18 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "0.5"
+          cpus: '0.5'
          memory: 256M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    ports:
-      - "3001:3001"
+      - '3001:3001'
    networks:
      - lcbp3
    volumes:
-      - "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
+      - '/volume1/np-dms/monitoring/uptime-kuma/data:/app/data'
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
+      test: ['CMD-SHELL', 'curl -f http://localhost:3001/api/entry-page || exit 1']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -733,16 +723,16 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "0.5"
+          cpus: '0.5'
          memory: 128M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
-      - "9100:9100"
+      - '9100:9100'
    networks:
      - lcbp3
    volumes:
@@ -750,7 +740,7 @@ services:
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
+      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9100/metrics']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -768,12 +758,12 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "0.5"
+          cpus: '0.5'
          memory: 256M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    ports:
-      - "8088:8088"
+      - '8080:8080'
    networks:
      - lcbp3
    volumes:
@@ -783,7 +773,7 @@ services:
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
+      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:8080/healthz']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -798,19 +788,19 @@ services:
    deploy:
      resources:
        limits:
-          cpus: "0.5"
+          cpus: '0.5'
          memory: 512M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    command: -config.file=/etc/loki/local-config.yaml
    ports:
-      - "3100:3100"
+      - '3100:3100'
    networks:
      - lcbp3
    volumes:
-      - "/volume1/np-dms/monitoring/loki/data:/loki"
+      - '/volume1/np-dms/monitoring/loki/data:/loki'
    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
+      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3100/ready']
      interval: 30s
      timeout: 10s
      retries: 3
@@ -822,21 +812,21 @@ services:
    <<: [*restart_policy, *default_logging]
    image: grafana/promtail:2.9.0
    container_name: promtail
-    user: "0:0"
+    user: '0:0'
    deploy:
      resources:
        limits:
-          cpus: "0.5"
+          cpus: '0.5'
          memory: 256M
    environment:
-      TZ: "Asia/Bangkok"
+      TZ: 'Asia/Bangkok'
    command: -config.file=/etc/promtail/promtail-config.yml
    networks:
      - lcbp3
    volumes:
-      - "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
-      - "/var/run/docker.sock:/var/run/docker.sock:ro"
-      - "/var/lib/docker/containers:/var/lib/docker/containers:ro"
+      - '/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro'
+      - '/var/run/docker.sock:/var/run/docker.sock:ro'
+      - '/var/lib/docker/containers:/var/lib/docker/containers:ro'
    depends_on:
      - loki
 ```
@@ -867,7 +857,7 @@ services:
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
-      - "9100:9100"
+      - '9100:9100'
    networks:
      - lcbp3
    volumes:
@@ -881,7 +871,7 @@ services:
    restart: unless-stopped
    privileged: true
    ports:
-      - "8088:8080"
+      - '8088:8080'
    networks:
      - lcbp3
    volumes:
@@ -899,11 +889,11 @@ services:
    command:
      - '--config.my-cnf=/etc/mysql/my.cnf'
    ports:
-      - "9104:9104"
+      - '9104:9104'
    networks:
      - lcbp3
    volumes:
-      - "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
+      - '/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro'
 ```

 ---
@@ -1012,7 +1002,6 @@ scrape_configs:
 | 14204        | Elasticsearch                | Elasticsearch view             |
 | 13106        | MySQL/MariaDB Overview       | Detailed MySQL/MariaDB metrics |

-
 ### Import Dashboard via Grafana UI

 1. Go to **Dashboards → Import**
@@ -1026,13 +1015,13 @@ scrape_configs:

 ### 📋 Prerequisites Checklist

-| #    | ขั้นตอน                                                                                              | Status |
-| :--- | :------------------------------------------------------------------------------------------------- | :----- |
-| 1    | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`)                                                      | ✅      |
-| 2    | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅      |
-| 3    | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor))                              | ✅      |
-| 4    | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration))            | ✅      |
-| 5    | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml))                | ✅      |
+| #   | ขั้นตอน                                                                                                         | Status |
+| :-- | :-------------------------------------------------------------------------------------------------------------- | :----- |
+| 1   | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`)                                                                 | ✅     |
+| 2   | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅     |
+| 3   | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor))                            | ✅     |
+| 4   | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration))                    | ✅     |
+| 5   | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml))                       | ✅     |

 ---

@@ -1093,7 +1082,7 @@ cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml

 ต้องสร้าง Config ให้ Promtail อ่าน logs จาก Docker containers และส่งไป Loki:

-```bash
+````bash
 # สร้างไฟล์ promtail-config.yml
 cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
 server:
@@ -1127,9 +1116,10 @@ EOF
 CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
 GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
 FLUSH PRIVILEGES;
-```
+````

 ### 2. สร้างไฟล์คอนฟิก .my.cnf บน QNAP
+
 เพื่อให้ `mysqld-exporter` อ่านรหัสผ่านที่มีตัวอักษรพิเศษได้ถูกต้อง:

 1. **SSH เข้า QNAP** (หรือใช้ File Station สร้าง Folder):
@@ -1143,11 +1133,11 @@ FLUSH PRIVILEGES;
 3. **สร้างไฟล์ .my.cnf**:
   ```bash
   cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
-[client]
-user=exporter
-password=Center2025
-host=mariadb
-EOF
+   [client]
+   user=exporter
+   password=Center2025
+   host=mariadb
+   EOF
   ```
 4. **กำหนดสิทธิ์ไฟล์** (เพื่อให้ Container อ่านไฟล์ได้):
   ```bash
@@ -1155,8 +1145,10 @@ EOF
   ```

 # ตรวจสอบ
+
 cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
-```
+
+````

 ---

@@ -1187,7 +1179,7 @@ docker compose up -d

 # ตรวจสอบ container status
 docker compose ps
-```
+````

 ---

@@ -1200,15 +1192,15 @@ docker ps --filter "name=prometheus" --filter "name=grafana" \
  --filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
 ```

-| Service           | วิธีตรวจสอบ                                                          | Expected Result                       |
-| :---------------- | :----------------------------------------------------------------- | :------------------------------------ |
-| ✅ **Prometheus**  | `curl http://192.168.10.9:9090/-/healthy`                          | `Prometheus Server is Healthy`        |
-| ✅ **Grafana**     | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login                             |
-| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`)  | หน้า Setup                             |
-| ✅ **Node Exp.**   | `curl http://192.168.10.9:9100/metrics \| head`                    | Metrics output                        |
-| ✅ **cAdvisor**    | `curl http://192.168.10.9:8080/healthz`                            | `ok`                                  |
-| ✅ **Loki**        | `curl http://192.168.10.9:3100/ready`                              | `ready`                               |
-| ✅ **Promtail**    | เช็ค Logs: `docker logs promtail`                                   | ไม่ควรมี Error + เห็น connection success |
+| Service            | วิธีตรวจสอบ                                                          | Expected Result                          |
+| :----------------- | :------------------------------------------------------------------- | :--------------------------------------- |
+| ✅ **Prometheus**  | `curl http://192.168.10.9:9090/-/healthy`                            | `Prometheus Server is Healthy`           |
+| ✅ **Grafana**     | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login                               |
+| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`)  | หน้า Setup                               |
+| ✅ **Node Exp.**   | `curl http://192.168.10.9:9100/metrics \| head`                      | Metrics output                           |
+| ✅ **cAdvisor**    | `curl http://192.168.10.9:8080/healthz`                              | `ok`                                     |
+| ✅ **Loki**        | `curl http://192.168.10.9:3100/ready`                                | `ready`                                  |
+| ✅ **Promtail**    | เช็ค Logs: `docker logs promtail`                                    | ไม่ควรมี Error + เห็น connection success |

 ---

@@ -1262,30 +1254,33 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
 เพื่อการ Monitor ที่สมบูรณ์ แนะนำให้ Import Dashboards ต่อไปนี้:

 #### 6.1 Host Monitoring (Node Exporter)
-*   **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
-*   **Dashboard ID:** `1860` (Node Exporter Full)
-*   **วิธี Import:**
-    1. ไปที่ **Dashboards** → **New** → **Import**
-    2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
-    3. เลือก Data source: **Prometheus**
-    4. กด **Import**
+
+- **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
+- **Dashboard ID:** `1860` (Node Exporter Full)
+- **วิธี Import:**
+  1. ไปที่ **Dashboards** → **New** → **Import**
+  2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
+  3. เลือก Data source: **Prometheus**
+  4. กด **Import**

 #### 6.2 Container Monitoring (cAdvisor)
-*   **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
-*   **Dashboard ID:** `14282` (Cadvisor exporter)
-*   **วิธี Import:**
-    1. ใส่เลข `14282` กด **Load**
-    2. เลือก Data source: **Prometheus**
-    3. กด **Import**
+
+- **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
+- **Dashboard ID:** `14282` (Cadvisor exporter)
+- **วิธี Import:**
+  1. ใส่เลข `14282` กด **Load**
+  2. เลือก Data source: **Prometheus**
+  3. กด **Import**

 #### 6.3 Logs Monitoring (Loki Integration)
+
 เพื่อให้ Dashboard ของ Container แสดง Logs จาก Loki ได้ด้วย:

 1. เปิด Dashboard **Cadvisor exporter** ที่เพิ่ง Import มา
 2. กดปุ่ม **Add visualization** (หรือ Edit dashboard)
 3. เลือก Data source: **Loki**
 4. ในช่อง Query ใส่: `{container="$name"}`
-    *   *(Note: `$name` มาจาก Variable ของ Dashboard 14282)*
+   - _(Note: `$name` มาจาก Variable ของ Dashboard 14282)_
 5. ปรับ Visualization type เป็น **Logs**
 6. ตั้งชื่อ Panel ว่า **"Container Logs"**
 7. กด **Apply** และ **Save Dashboard**
@@ -1316,8 +1311,6 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'

 > 📝 **หมายเหตุ**: เอกสารนี้อ้างอิงจาก Architecture Document **v1.8.0** - Monitoring Stack deploy บน ASUSTOR AS5403T

-
-
 ---

 ## 📈 Document Numbering Specific Monitoring
@@ -1389,9 +1382,9 @@ groups:
          severity: critical
          component: document-numbering
        annotations:
-          summary: "Redis is unavailable for document numbering"
-          description: "System is falling back to DB-only locking. Performance degraded by 30-50%."
-          runbook_url: "https://wiki.lcbp3/runbooks/redis-unavailable"
+          summary: 'Redis is unavailable for document numbering'
+          description: 'System is falling back to DB-only locking. Performance degraded by 30-50%.'
+          runbook_url: 'https://wiki.lcbp3/runbooks/redis-unavailable'

      # CRITICAL: High lock failure rate
      - alert: HighLockFailureRate
@@ -1402,9 +1395,9 @@ groups:
          severity: critical
          component: document-numbering
        annotations:
-          summary: "Lock acquisition failure rate > 10%"
-          description: "Check Redis and database performance immediately"
-          runbook_url: "https://wiki.lcbp3/runbooks/high-lock-failure"
+          summary: 'Lock acquisition failure rate > 10%'
+          description: 'Check Redis and database performance immediately'
+          runbook_url: 'https://wiki.lcbp3/runbooks/high-lock-failure'

      # WARNING: Elevated lock failure rate
      - alert: ElevatedLockFailureRate
@@ -1415,8 +1408,8 @@ groups:
          severity: warning
          component: document-numbering
        annotations:
-          summary: "Lock acquisition failure rate > 5%"
-          description: "Monitor closely. May escalate to critical soon."
+          summary: 'Lock acquisition failure rate > 5%'
+          description: 'Monitor closely. May escalate to critical soon.'

      # WARNING: Slow lock acquisition
      - alert: SlowLockAcquisition
@@ -1429,8 +1422,8 @@ groups:
          severity: warning
          component: document-numbering
        annotations:
-          summary: "P95 lock acquisition time > 1 second"
-          description: "Lock acquisition is slower than expected. Check Redis latency."
+          summary: 'P95 lock acquisition time > 1 second'
+          description: 'Lock acquisition is slower than expected. Check Redis latency.'

      # WARNING: High retry count
      - alert: HighRetryCount
@@ -1443,8 +1436,8 @@ groups:
          severity: warning
          component: document-numbering
        annotations:
-          summary: "Retry count > 100 per hour in project {{ $labels.project }}"
-          description: "High contention detected. Consider scaling."
+          summary: 'Retry count > 100 per hour in project {{ $labels.project }}'
+          description: 'High contention detected. Consider scaling.'

      # WARNING: Slow generation
      - alert: SlowDocumentNumberGeneration
@@ -1457,8 +1450,8 @@ groups:
          severity: warning
          component: document-numbering
        annotations:
-          summary: "P95 generation time > 2 seconds"
-          description: "Document number generation is slower than SLA target"
+          summary: 'P95 generation time > 2 seconds'
+          description: 'Document number generation is slower than SLA target'
 ```

 ### 3.3. AlertManager Configuration
@@ -1553,4 +1546,3 @@ Dashboard panels ที่สำคัญ:
 6. **DB Connection Pool Usage** (Gauge)
   - Query: `docnum_db_connection_pool_usage`
   - Alert threshold: > 80%
-