260322:1648 Correct Correspondence / Doing RFA / Correct CI
CI Pipeline / build (push) Failing after 12m41s
Build and Deploy / deploy (push) Failing after 2m44s

This commit is contained in:
admin
2026-03-22 16:48:12 +07:00
parent e5deedb42e
commit 11984bfa29
683 changed files with 105251 additions and 29068 deletions
+114 -122
View File
@@ -1,4 +1,5 @@
# 04.3 Monitoring & Alerting
**Project:** LCBP3-DMS
**Version:** 1.8.0
**Status:** Active
@@ -78,12 +79,7 @@ This document describes monitoring setup, health checks, and alerting rules for
```typescript
// File: backend/src/health/health.controller.ts
import { Controller, Get } from '@nestjs/common';
import {
HealthCheck,
HealthCheckService,
TypeOrmHealthIndicator,
DiskHealthIndicator,
} from '@nestjs/terminus';
import { HealthCheck, HealthCheckService, TypeOrmHealthIndicator, DiskHealthIndicator } from '@nestjs/terminus';
@Controller('health')
export class HealthController {
@@ -208,12 +204,7 @@ done
```typescript
// File: backend/src/common/interceptors/performance.interceptor.ts
import {
Injectable,
NestInterceptor,
ExecutionContext,
CallHandler,
} from '@nestjs/common';
import { Injectable, NestInterceptor, ExecutionContext, CallHandler } from '@nestjs/common';
import { Observable } from 'rxjs';
import { tap } from 'rxjs/operators';
import { logger } from 'src/config/logger.config';
@@ -460,7 +451,6 @@ ab -n 1000 -c 10 \
**Last Review:** 2025-12-01
**Next Review:** 2026-03-01
---
# การติดตั้ง Monitoring Stack บน ASUSTOR
@@ -472,15 +462,15 @@ ab -n 1000 -c 10 \
Stack สำหรับ Monitoring ประกอบด้วย:
| Service | Port | Purpose | Host |
| :---------------- | :--------------------------- | :-------------------------------- | :------ |
| Service | Port | Purpose | Host |
| :---------------- | :--------------------------- | :--------------------------------- | :------ |
| **Prometheus** | 9090 | เก็บ Metrics และ Time-series data | ASUSTOR |
| **Grafana** | 3000 | Dashboard สำหรับแสดงผล Metrics | ASUSTOR |
| **Grafana** | 3000 | Dashboard สำหรับแสดงผล Metrics | ASUSTOR |
| **Node Exporter** | 9100 | เก็บ Metrics ของ Host system | Both |
| **cAdvisor** | 8080 (ASUSTOR) / 8088 (QNAP) | เก็บ Metrics ของ Docker containers | Both |
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
| **Loki** | 3100 | Log aggregation | ASUSTOR |
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
| **Loki** | 3100 | Log aggregation | ASUSTOR |
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
---
@@ -613,10 +603,10 @@ x-restart: &restart_policy
x-logging: &default_logging
logging:
driver: "json-file"
driver: 'json-file'
options:
max-size: "10m"
max-file: "5"
max-size: '10m'
max-file: '5'
networks:
lcbp3:
@@ -635,27 +625,27 @@ services:
deploy:
resources:
limits:
cpus: "1.0"
cpus: '1.0'
memory: 1G
reservations:
cpus: "0.25"
cpus: '0.25'
memory: 256M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports:
- "9090:9090"
- '9090:9090'
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
- "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
- '/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro'
- '/volume1/np-dms/monitoring/prometheus/data:/prometheus'
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
interval: 30s
timeout: 10s
retries: 3
@@ -672,27 +662,27 @@ services:
deploy:
resources:
limits:
cpus: "1.0"
cpus: '1.0'
memory: 512M
reservations:
cpus: "0.25"
cpus: '0.25'
memory: 128M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
GF_SECURITY_ADMIN_USER: admin
GF_SECURITY_ADMIN_PASSWORD: "Center#2025"
GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
GF_SECURITY_ADMIN_PASSWORD: 'Center#2025'
GF_SERVER_ROOT_URL: 'https://grafana.np-dms.work'
GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
ports:
- "3000:3000"
- '3000:3000'
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
- '/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana'
depends_on:
- prometheus
healthcheck:
test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
test: ['CMD-SHELL', 'wget --spider -q http://localhost:3000/api/health || exit 1']
interval: 30s
timeout: 10s
retries: 3
@@ -707,18 +697,18 @@ services:
deploy:
resources:
limits:
cpus: "0.5"
cpus: '0.5'
memory: 256M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
ports:
- "3001:3001"
- '3001:3001'
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
- '/volume1/np-dms/monitoring/uptime-kuma/data:/app/data'
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
test: ['CMD-SHELL', 'curl -f http://localhost:3001/api/entry-page || exit 1']
interval: 30s
timeout: 10s
retries: 3
@@ -733,16 +723,16 @@ services:
deploy:
resources:
limits:
cpus: "0.5"
cpus: '0.5'
memory: 128M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports:
- "9100:9100"
- '9100:9100'
networks:
- lcbp3
volumes:
@@ -750,7 +740,7 @@ services:
- /sys:/host/sys:ro
- /:/rootfs:ro
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9100/metrics']
interval: 30s
timeout: 10s
retries: 3
@@ -768,12 +758,12 @@ services:
deploy:
resources:
limits:
cpus: "0.5"
cpus: '0.5'
memory: 256M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
ports:
- "8088:8088"
- '8088:8088'
networks:
- lcbp3
volumes:
@@ -783,7 +773,7 @@ services:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:8080/healthz']
interval: 30s
timeout: 10s
retries: 3
@@ -798,19 +788,19 @@ services:
deploy:
resources:
limits:
cpus: "0.5"
cpus: '0.5'
memory: 512M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
command: -config.file=/etc/loki/local-config.yaml
ports:
- "3100:3100"
- '3100:3100'
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/loki/data:/loki"
- '/volume1/np-dms/monitoring/loki/data:/loki'
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3100/ready']
interval: 30s
timeout: 10s
retries: 3
@@ -822,21 +812,21 @@ services:
<<: [*restart_policy, *default_logging]
image: grafana/promtail:2.9.0
container_name: promtail
user: "0:0"
user: '0:0'
deploy:
resources:
limits:
cpus: "0.5"
cpus: '0.5'
memory: 256M
environment:
TZ: "Asia/Bangkok"
TZ: 'Asia/Bangkok'
command: -config.file=/etc/promtail/promtail-config.yml
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
- "/var/run/docker.sock:/var/run/docker.sock:ro"
- "/var/lib/docker/containers:/var/lib/docker/containers:ro"
- '/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro'
- '/var/run/docker.sock:/var/run/docker.sock:ro'
- '/var/lib/docker/containers:/var/lib/docker/containers:ro'
depends_on:
- loki
```
@@ -867,7 +857,7 @@ services:
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports:
- "9100:9100"
- '9100:9100'
networks:
- lcbp3
volumes:
@@ -881,7 +871,7 @@ services:
restart: unless-stopped
privileged: true
ports:
- "8088:8080"
- '8088:8080'
networks:
- lcbp3
volumes:
@@ -899,11 +889,11 @@ services:
command:
- '--config.my-cnf=/etc/mysql/my.cnf'
ports:
- "9104:9104"
- '9104:9104'
networks:
- lcbp3
volumes:
- "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
- '/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro'
```
---
@@ -1012,7 +1002,6 @@ scrape_configs:
| 14204 | Elasticsearch | Elasticsearch view |
| 13106 | MySQL/MariaDB Overview | Detailed MySQL/MariaDB metrics |
### Import Dashboard via Grafana UI
1. Go to **Dashboards → Import**
@@ -1026,13 +1015,13 @@ scrape_configs:
### 📋 Prerequisites Checklist
| # | ขั้นตอน | Status |
| :--- | :------------------------------------------------------------------------------------------------- | :----- |
| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ |
| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ |
| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ |
| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ |
| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ |
| # | ขั้นตอน | Status |
| :-- | :-------------------------------------------------------------------------------------------------------------- | :----- |
| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ |
| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ |
| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ |
| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ |
| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ |
---
@@ -1093,7 +1082,7 @@ cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
ต้องสร้าง Config ให้ Promtail อ่าน logs จาก Docker containers และส่งไป Loki:
```bash
````bash
# สร้างไฟล์ promtail-config.yml
cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
server:
@@ -1127,9 +1116,10 @@ EOF
CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
FLUSH PRIVILEGES;
```
````
### 2. สร้างไฟล์คอนฟิก .my.cnf บน QNAP
เพื่อให้ `mysqld-exporter` อ่านรหัสผ่านที่มีตัวอักษรพิเศษได้ถูกต้อง:
1. **SSH เข้า QNAP** (หรือใช้ File Station สร้าง Folder):
@@ -1143,11 +1133,11 @@ FLUSH PRIVILEGES;
3. **สร้างไฟล์ .my.cnf**:
```bash
cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
[client]
user=exporter
password=Center2025
host=mariadb
EOF
[client]
user=exporter
password=Center2025
host=mariadb
EOF
```
4. **กำหนดสิทธิ์ไฟล์** (เพื่อให้ Container อ่านไฟล์ได้):
```bash
@@ -1155,8 +1145,10 @@ EOF
```
# ตรวจสอบ
cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
```
````
---
@@ -1187,7 +1179,7 @@ docker compose up -d
# ตรวจสอบ container status
docker compose ps
```
````
---
@@ -1200,15 +1192,15 @@ docker ps --filter "name=prometheus" --filter "name=grafana" \
--filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
```
| Service | วิธีตรวจสอบ | Expected Result |
| :---------------- | :----------------------------------------------------------------- | :------------------------------------ |
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` |
| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login |
| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup |
| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output |
| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` |
| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` |
| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success |
| Service | วิธีตรวจสอบ | Expected Result |
| :----------------- | :------------------------------------------------------------------- | :--------------------------------------- |
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` |
| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login |
| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup |
| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output |
| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` |
| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` |
| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success |
---
@@ -1262,30 +1254,33 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
เพื่อการ Monitor ที่สมบูรณ์ แนะนำให้ Import Dashboards ต่อไปนี้:
#### 6.1 Host Monitoring (Node Exporter)
* **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
* **Dashboard ID:** `1860` (Node Exporter Full)
* **วิธี Import:**
1. ไปที่ **Dashboards** → **New** → **Import**
2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
3. เลือก Data source: **Prometheus**
4. กด **Import**
- **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
- **Dashboard ID:** `1860` (Node Exporter Full)
- **วิธี Import:**
1. ไปที่ **Dashboards** → **New** → **Import**
2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
3. เลือก Data source: **Prometheus**
4. กด **Import**
#### 6.2 Container Monitoring (cAdvisor)
* **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
* **Dashboard ID:** `14282` (Cadvisor exporter)
* **วิธี Import:**
1. ใส่เลข `14282` กด **Load**
2. เลือก Data source: **Prometheus**
3. กด **Import**
- **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
- **Dashboard ID:** `14282` (Cadvisor exporter)
- **วิธี Import:**
1. ใส่เลข `14282` กด **Load**
2. เลือก Data source: **Prometheus**
3. กด **Import**
#### 6.3 Logs Monitoring (Loki Integration)
เพื่อให้ Dashboard ของ Container แสดง Logs จาก Loki ได้ด้วย:
1. เปิด Dashboard **Cadvisor exporter** ที่เพิ่ง Import มา
2. กดปุ่ม **Add visualization** (หรือ Edit dashboard)
3. เลือก Data source: **Loki**
4. ในช่อง Query ใส่: `{container="$name"}`
* *(Note: `$name` มาจาก Variable ของ Dashboard 14282)*
- _(Note: `$name` มาจาก Variable ของ Dashboard 14282)_
5. ปรับ Visualization type เป็น **Logs**
6. ตั้งชื่อ Panel ว่า **"Container Logs"**
7. กด **Apply** และ **Save Dashboard**
@@ -1316,8 +1311,6 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
> 📝 **หมายเหตุ**: เอกสารนี้อ้างอิงจาก Architecture Document **v1.8.0** - Monitoring Stack deploy บน ASUSTOR AS5403T
---
## 📈 Document Numbering Specific Monitoring
@@ -1389,9 +1382,9 @@ groups:
severity: critical
component: document-numbering
annotations:
summary: "Redis is unavailable for document numbering"
description: "System is falling back to DB-only locking. Performance degraded by 30-50%."
runbook_url: "https://wiki.lcbp3/runbooks/redis-unavailable"
summary: 'Redis is unavailable for document numbering'
description: 'System is falling back to DB-only locking. Performance degraded by 30-50%.'
runbook_url: 'https://wiki.lcbp3/runbooks/redis-unavailable'
# CRITICAL: High lock failure rate
- alert: HighLockFailureRate
@@ -1402,9 +1395,9 @@ groups:
severity: critical
component: document-numbering
annotations:
summary: "Lock acquisition failure rate > 10%"
description: "Check Redis and database performance immediately"
runbook_url: "https://wiki.lcbp3/runbooks/high-lock-failure"
summary: 'Lock acquisition failure rate > 10%'
description: 'Check Redis and database performance immediately'
runbook_url: 'https://wiki.lcbp3/runbooks/high-lock-failure'
# WARNING: Elevated lock failure rate
- alert: ElevatedLockFailureRate
@@ -1415,8 +1408,8 @@ groups:
severity: warning
component: document-numbering
annotations:
summary: "Lock acquisition failure rate > 5%"
description: "Monitor closely. May escalate to critical soon."
summary: 'Lock acquisition failure rate > 5%'
description: 'Monitor closely. May escalate to critical soon.'
# WARNING: Slow lock acquisition
- alert: SlowLockAcquisition
@@ -1429,8 +1422,8 @@ groups:
severity: warning
component: document-numbering
annotations:
summary: "P95 lock acquisition time > 1 second"
description: "Lock acquisition is slower than expected. Check Redis latency."
summary: 'P95 lock acquisition time > 1 second'
description: 'Lock acquisition is slower than expected. Check Redis latency.'
# WARNING: High retry count
- alert: HighRetryCount
@@ -1443,8 +1436,8 @@ groups:
severity: warning
component: document-numbering
annotations:
summary: "Retry count > 100 per hour in project {{ $labels.project }}"
description: "High contention detected. Consider scaling."
summary: 'Retry count > 100 per hour in project {{ $labels.project }}'
description: 'High contention detected. Consider scaling.'
# WARNING: Slow generation
- alert: SlowDocumentNumberGeneration
@@ -1457,8 +1450,8 @@ groups:
severity: warning
component: document-numbering
annotations:
summary: "P95 generation time > 2 seconds"
description: "Document number generation is slower than SLA target"
summary: 'P95 generation time > 2 seconds'
description: 'Document number generation is slower than SLA target'
```
### 3.3. AlertManager Configuration
@@ -1553,4 +1546,3 @@ Dashboard panels ที่สำคัญ:
6. **DB Connection Pool Usage** (Gauge)
- Query: `docnum_db_connection_pool_usage`
- Alert threshold: > 80%