260322:1648 Correct Correspondence / Doing RFA / Correct CI
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
# 04.3 Monitoring & Alerting
|
||||
|
||||
**Project:** LCBP3-DMS
|
||||
**Version:** 1.8.0
|
||||
**Status:** Active
|
||||
@@ -78,12 +79,7 @@ This document describes monitoring setup, health checks, and alerting rules for
|
||||
```typescript
|
||||
// File: backend/src/health/health.controller.ts
|
||||
import { Controller, Get } from '@nestjs/common';
|
||||
import {
|
||||
HealthCheck,
|
||||
HealthCheckService,
|
||||
TypeOrmHealthIndicator,
|
||||
DiskHealthIndicator,
|
||||
} from '@nestjs/terminus';
|
||||
import { HealthCheck, HealthCheckService, TypeOrmHealthIndicator, DiskHealthIndicator } from '@nestjs/terminus';
|
||||
|
||||
@Controller('health')
|
||||
export class HealthController {
|
||||
@@ -208,12 +204,7 @@ done
|
||||
|
||||
```typescript
|
||||
// File: backend/src/common/interceptors/performance.interceptor.ts
|
||||
import {
|
||||
Injectable,
|
||||
NestInterceptor,
|
||||
ExecutionContext,
|
||||
CallHandler,
|
||||
} from '@nestjs/common';
|
||||
import { Injectable, NestInterceptor, ExecutionContext, CallHandler } from '@nestjs/common';
|
||||
import { Observable } from 'rxjs';
|
||||
import { tap } from 'rxjs/operators';
|
||||
import { logger } from 'src/config/logger.config';
|
||||
@@ -460,7 +451,6 @@ ab -n 1000 -c 10 \
|
||||
**Last Review:** 2025-12-01
|
||||
**Next Review:** 2026-03-01
|
||||
|
||||
|
||||
---
|
||||
|
||||
# การติดตั้ง Monitoring Stack บน ASUSTOR
|
||||
@@ -472,15 +462,15 @@ ab -n 1000 -c 10 \
|
||||
|
||||
Stack สำหรับ Monitoring ประกอบด้วย:
|
||||
|
||||
| Service | Port | Purpose | Host |
|
||||
| :---------------- | :--------------------------- | :-------------------------------- | :------ |
|
||||
| Service | Port | Purpose | Host |
|
||||
| :---------------- | :--------------------------- | :--------------------------------- | :------ |
|
||||
| **Prometheus** | 9090 | เก็บ Metrics และ Time-series data | ASUSTOR |
|
||||
| **Grafana** | 3000 | Dashboard สำหรับแสดงผล Metrics | ASUSTOR |
|
||||
| **Grafana** | 3000 | Dashboard สำหรับแสดงผล Metrics | ASUSTOR |
|
||||
| **Node Exporter** | 9100 | เก็บ Metrics ของ Host system | Both |
|
||||
| **cAdvisor** | 8080 (ASUSTOR) / 8088 (QNAP) | เก็บ Metrics ของ Docker containers | Both |
|
||||
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
|
||||
| **Loki** | 3100 | Log aggregation | ASUSTOR |
|
||||
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
|
||||
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
|
||||
| **Loki** | 3100 | Log aggregation | ASUSTOR |
|
||||
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
|
||||
|
||||
---
|
||||
|
||||
@@ -613,10 +603,10 @@ x-restart: &restart_policy
|
||||
|
||||
x-logging: &default_logging
|
||||
logging:
|
||||
driver: "json-file"
|
||||
driver: 'json-file'
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "5"
|
||||
max-size: '10m'
|
||||
max-file: '5'
|
||||
|
||||
networks:
|
||||
lcbp3:
|
||||
@@ -635,27 +625,27 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "1.0"
|
||||
cpus: '1.0'
|
||||
memory: 1G
|
||||
reservations:
|
||||
cpus: "0.25"
|
||||
cpus: '0.25'
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.enable-lifecycle'
|
||||
ports:
|
||||
- "9090:9090"
|
||||
- '9090:9090'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
|
||||
- "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
|
||||
- '/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro'
|
||||
- '/volume1/np-dms/monitoring/prometheus/data:/prometheus'
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -672,27 +662,27 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "1.0"
|
||||
cpus: '1.0'
|
||||
memory: 512M
|
||||
reservations:
|
||||
cpus: "0.25"
|
||||
cpus: '0.25'
|
||||
memory: 128M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
GF_SECURITY_ADMIN_USER: admin
|
||||
GF_SECURITY_ADMIN_PASSWORD: "Center#2025"
|
||||
GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
|
||||
GF_SECURITY_ADMIN_PASSWORD: 'Center#2025'
|
||||
GF_SERVER_ROOT_URL: 'https://grafana.np-dms.work'
|
||||
GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
|
||||
ports:
|
||||
- "3000:3000"
|
||||
- '3000:3000'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
|
||||
- '/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana'
|
||||
depends_on:
|
||||
- prometheus
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
|
||||
test: ['CMD-SHELL', 'wget --spider -q http://localhost:3000/api/health || exit 1']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -707,18 +697,18 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
cpus: '0.5'
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
ports:
|
||||
- "3001:3001"
|
||||
- '3001:3001'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
|
||||
- '/volume1/np-dms/monitoring/uptime-kuma/data:/app/data'
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
|
||||
test: ['CMD-SHELL', 'curl -f http://localhost:3001/api/entry-page || exit 1']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -733,16 +723,16 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
cpus: '0.5'
|
||||
memory: 128M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
- '9100:9100'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
@@ -750,7 +740,7 @@ services:
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9100/metrics']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -768,12 +758,12 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
cpus: '0.5'
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
ports:
|
||||
- "8088:8088"
|
||||
- '8088:8088'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
@@ -783,7 +773,7 @@ services:
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:8080/healthz']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -798,19 +788,19 @@ services:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
cpus: '0.5'
|
||||
memory: 512M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
ports:
|
||||
- "3100:3100"
|
||||
- '3100:3100'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/loki/data:/loki"
|
||||
- '/volume1/np-dms/monitoring/loki/data:/loki'
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
|
||||
test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3100/ready']
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -822,21 +812,21 @@ services:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: promtail
|
||||
user: "0:0"
|
||||
user: '0:0'
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
cpus: '0.5'
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
TZ: 'Asia/Bangkok'
|
||||
command: -config.file=/etc/promtail/promtail-config.yml
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
|
||||
- "/var/run/docker.sock:/var/run/docker.sock:ro"
|
||||
- "/var/lib/docker/containers:/var/lib/docker/containers:ro"
|
||||
- '/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro'
|
||||
- '/var/run/docker.sock:/var/run/docker.sock:ro'
|
||||
- '/var/lib/docker/containers:/var/lib/docker/containers:ro'
|
||||
depends_on:
|
||||
- loki
|
||||
```
|
||||
@@ -867,7 +857,7 @@ services:
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
- '9100:9100'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
@@ -881,7 +871,7 @@ services:
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
ports:
|
||||
- "8088:8080"
|
||||
- '8088:8080'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
@@ -899,11 +889,11 @@ services:
|
||||
command:
|
||||
- '--config.my-cnf=/etc/mysql/my.cnf'
|
||||
ports:
|
||||
- "9104:9104"
|
||||
- '9104:9104'
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
|
||||
- '/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro'
|
||||
```
|
||||
|
||||
---
|
||||
@@ -1012,7 +1002,6 @@ scrape_configs:
|
||||
| 14204 | Elasticsearch | Elasticsearch view |
|
||||
| 13106 | MySQL/MariaDB Overview | Detailed MySQL/MariaDB metrics |
|
||||
|
||||
|
||||
### Import Dashboard via Grafana UI
|
||||
|
||||
1. Go to **Dashboards → Import**
|
||||
@@ -1026,13 +1015,13 @@ scrape_configs:
|
||||
|
||||
### 📋 Prerequisites Checklist
|
||||
|
||||
| # | ขั้นตอน | Status |
|
||||
| :--- | :------------------------------------------------------------------------------------------------- | :----- |
|
||||
| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ |
|
||||
| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ |
|
||||
| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ |
|
||||
| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ |
|
||||
| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ |
|
||||
| # | ขั้นตอน | Status |
|
||||
| :-- | :-------------------------------------------------------------------------------------------------------------- | :----- |
|
||||
| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ |
|
||||
| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ |
|
||||
| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ |
|
||||
| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ |
|
||||
| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ |
|
||||
|
||||
---
|
||||
|
||||
@@ -1093,7 +1082,7 @@ cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
|
||||
|
||||
ต้องสร้าง Config ให้ Promtail อ่าน logs จาก Docker containers และส่งไป Loki:
|
||||
|
||||
```bash
|
||||
````bash
|
||||
# สร้างไฟล์ promtail-config.yml
|
||||
cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
|
||||
server:
|
||||
@@ -1127,9 +1116,10 @@ EOF
|
||||
CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
|
||||
GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
|
||||
FLUSH PRIVILEGES;
|
||||
```
|
||||
````
|
||||
|
||||
### 2. สร้างไฟล์คอนฟิก .my.cnf บน QNAP
|
||||
|
||||
เพื่อให้ `mysqld-exporter` อ่านรหัสผ่านที่มีตัวอักษรพิเศษได้ถูกต้อง:
|
||||
|
||||
1. **SSH เข้า QNAP** (หรือใช้ File Station สร้าง Folder):
|
||||
@@ -1143,11 +1133,11 @@ FLUSH PRIVILEGES;
|
||||
3. **สร้างไฟล์ .my.cnf**:
|
||||
```bash
|
||||
cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
|
||||
[client]
|
||||
user=exporter
|
||||
password=Center2025
|
||||
host=mariadb
|
||||
EOF
|
||||
[client]
|
||||
user=exporter
|
||||
password=Center2025
|
||||
host=mariadb
|
||||
EOF
|
||||
```
|
||||
4. **กำหนดสิทธิ์ไฟล์** (เพื่อให้ Container อ่านไฟล์ได้):
|
||||
```bash
|
||||
@@ -1155,8 +1145,10 @@ EOF
|
||||
```
|
||||
|
||||
# ตรวจสอบ
|
||||
|
||||
cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
|
||||
```
|
||||
|
||||
````
|
||||
|
||||
---
|
||||
|
||||
@@ -1187,7 +1179,7 @@ docker compose up -d
|
||||
|
||||
# ตรวจสอบ container status
|
||||
docker compose ps
|
||||
```
|
||||
````
|
||||
|
||||
---
|
||||
|
||||
@@ -1200,15 +1192,15 @@ docker ps --filter "name=prometheus" --filter "name=grafana" \
|
||||
--filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
|
||||
```
|
||||
|
||||
| Service | วิธีตรวจสอบ | Expected Result |
|
||||
| :---------------- | :----------------------------------------------------------------- | :------------------------------------ |
|
||||
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` |
|
||||
| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login |
|
||||
| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup |
|
||||
| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output |
|
||||
| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` |
|
||||
| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` |
|
||||
| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success |
|
||||
| Service | วิธีตรวจสอบ | Expected Result |
|
||||
| :----------------- | :------------------------------------------------------------------- | :--------------------------------------- |
|
||||
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` |
|
||||
| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login |
|
||||
| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup |
|
||||
| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output |
|
||||
| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` |
|
||||
| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` |
|
||||
| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success |
|
||||
|
||||
---
|
||||
|
||||
@@ -1262,30 +1254,33 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
|
||||
เพื่อการ Monitor ที่สมบูรณ์ แนะนำให้ Import Dashboards ต่อไปนี้:
|
||||
|
||||
#### 6.1 Host Monitoring (Node Exporter)
|
||||
* **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
|
||||
* **Dashboard ID:** `1860` (Node Exporter Full)
|
||||
* **วิธี Import:**
|
||||
1. ไปที่ **Dashboards** → **New** → **Import**
|
||||
2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
|
||||
3. เลือก Data source: **Prometheus**
|
||||
4. กด **Import**
|
||||
|
||||
- **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
|
||||
- **Dashboard ID:** `1860` (Node Exporter Full)
|
||||
- **วิธี Import:**
|
||||
1. ไปที่ **Dashboards** → **New** → **Import**
|
||||
2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
|
||||
3. เลือก Data source: **Prometheus**
|
||||
4. กด **Import**
|
||||
|
||||
#### 6.2 Container Monitoring (cAdvisor)
|
||||
* **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
|
||||
* **Dashboard ID:** `14282` (Cadvisor exporter)
|
||||
* **วิธี Import:**
|
||||
1. ใส่เลข `14282` กด **Load**
|
||||
2. เลือก Data source: **Prometheus**
|
||||
3. กด **Import**
|
||||
|
||||
- **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
|
||||
- **Dashboard ID:** `14282` (Cadvisor exporter)
|
||||
- **วิธี Import:**
|
||||
1. ใส่เลข `14282` กด **Load**
|
||||
2. เลือก Data source: **Prometheus**
|
||||
3. กด **Import**
|
||||
|
||||
#### 6.3 Logs Monitoring (Loki Integration)
|
||||
|
||||
เพื่อให้ Dashboard ของ Container แสดง Logs จาก Loki ได้ด้วย:
|
||||
|
||||
1. เปิด Dashboard **Cadvisor exporter** ที่เพิ่ง Import มา
|
||||
2. กดปุ่ม **Add visualization** (หรือ Edit dashboard)
|
||||
3. เลือก Data source: **Loki**
|
||||
4. ในช่อง Query ใส่: `{container="$name"}`
|
||||
* *(Note: `$name` มาจาก Variable ของ Dashboard 14282)*
|
||||
- _(Note: `$name` มาจาก Variable ของ Dashboard 14282)_
|
||||
5. ปรับ Visualization type เป็น **Logs**
|
||||
6. ตั้งชื่อ Panel ว่า **"Container Logs"**
|
||||
7. กด **Apply** และ **Save Dashboard**
|
||||
@@ -1316,8 +1311,6 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
|
||||
|
||||
> 📝 **หมายเหตุ**: เอกสารนี้อ้างอิงจาก Architecture Document **v1.8.0** - Monitoring Stack deploy บน ASUSTOR AS5403T
|
||||
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 📈 Document Numbering Specific Monitoring
|
||||
@@ -1389,9 +1382,9 @@ groups:
|
||||
severity: critical
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "Redis is unavailable for document numbering"
|
||||
description: "System is falling back to DB-only locking. Performance degraded by 30-50%."
|
||||
runbook_url: "https://wiki.lcbp3/runbooks/redis-unavailable"
|
||||
summary: 'Redis is unavailable for document numbering'
|
||||
description: 'System is falling back to DB-only locking. Performance degraded by 30-50%.'
|
||||
runbook_url: 'https://wiki.lcbp3/runbooks/redis-unavailable'
|
||||
|
||||
# CRITICAL: High lock failure rate
|
||||
- alert: HighLockFailureRate
|
||||
@@ -1402,9 +1395,9 @@ groups:
|
||||
severity: critical
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "Lock acquisition failure rate > 10%"
|
||||
description: "Check Redis and database performance immediately"
|
||||
runbook_url: "https://wiki.lcbp3/runbooks/high-lock-failure"
|
||||
summary: 'Lock acquisition failure rate > 10%'
|
||||
description: 'Check Redis and database performance immediately'
|
||||
runbook_url: 'https://wiki.lcbp3/runbooks/high-lock-failure'
|
||||
|
||||
# WARNING: Elevated lock failure rate
|
||||
- alert: ElevatedLockFailureRate
|
||||
@@ -1415,8 +1408,8 @@ groups:
|
||||
severity: warning
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "Lock acquisition failure rate > 5%"
|
||||
description: "Monitor closely. May escalate to critical soon."
|
||||
summary: 'Lock acquisition failure rate > 5%'
|
||||
description: 'Monitor closely. May escalate to critical soon.'
|
||||
|
||||
# WARNING: Slow lock acquisition
|
||||
- alert: SlowLockAcquisition
|
||||
@@ -1429,8 +1422,8 @@ groups:
|
||||
severity: warning
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "P95 lock acquisition time > 1 second"
|
||||
description: "Lock acquisition is slower than expected. Check Redis latency."
|
||||
summary: 'P95 lock acquisition time > 1 second'
|
||||
description: 'Lock acquisition is slower than expected. Check Redis latency.'
|
||||
|
||||
# WARNING: High retry count
|
||||
- alert: HighRetryCount
|
||||
@@ -1443,8 +1436,8 @@ groups:
|
||||
severity: warning
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "Retry count > 100 per hour in project {{ $labels.project }}"
|
||||
description: "High contention detected. Consider scaling."
|
||||
summary: 'Retry count > 100 per hour in project {{ $labels.project }}'
|
||||
description: 'High contention detected. Consider scaling.'
|
||||
|
||||
# WARNING: Slow generation
|
||||
- alert: SlowDocumentNumberGeneration
|
||||
@@ -1457,8 +1450,8 @@ groups:
|
||||
severity: warning
|
||||
component: document-numbering
|
||||
annotations:
|
||||
summary: "P95 generation time > 2 seconds"
|
||||
description: "Document number generation is slower than SLA target"
|
||||
summary: 'P95 generation time > 2 seconds'
|
||||
description: 'Document number generation is slower than SLA target'
|
||||
```
|
||||
|
||||
### 3.3. AlertManager Configuration
|
||||
@@ -1553,4 +1546,3 @@ Dashboard panels ที่สำคัญ:
|
||||
6. **DB Connection Pool Usage** (Gauge)
|
||||
- Query: `docnum_db_connection_pool_usage`
|
||||
- Alert threshold: > 80%
|
||||
|
||||
|
||||
Reference in New Issue
Block a user