Files
lcbp3/specs/04-Infrastructure-OPS/04-03-monitoring.md
admin ef16817f38
All checks were successful
Build and Deploy / deploy (push) Successful in 4m44s
260223:1415 20260223 nextJS & nestJS Best pratices
2026-02-23 14:15:06 +07:00

1557 lines
49 KiB
Markdown
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 04.3 Monitoring & Alerting
**Project:** LCBP3-DMS
**Version:** 1.8.0
**Status:** Active
**Owner:** Nattanin Peancharoen / DevOps Team
**Last Updated:** 2026-02-23
> 📍 **Monitoring Hub:** ASUSTOR AS5403T
> 📍 **App Server (Exporters):** QNAP TS-473A
---
## 📖 Overview
This document combines the operational SLAs, Alerting Rules, and Health Checks with the technical deployment instructions for the monitoring stack (Prometheus, Grafana, Loki) across both servers.
---
# Monitoring & Alerting
**Project:** LCBP3-DMS
**Version:** 1.8.0
**Last Updated:** 2025-12-02
---
## 📋 Overview
This document describes monitoring setup, health checks, and alerting rules for LCBP3-DMS.
---
## 🎯 Monitoring Objectives
- **Availability:** System uptime > 99.5%
- **Performance:** API response time < 500ms (P95)
- **Reliability:** Error rate < 1%
- **Capacity:** Resource utilization < 80%
---
## 📊 Key Metrics
### Application Metrics
| Metric | Target | Alert Threshold |
| ----------------------- | ------- | ------------------ |
| API Response Time (P95) | < 500ms | > 1000ms |
| Error Rate | < 1% | > 5% |
| Request Rate | N/A | Sudden ±50% change |
| Active Users | N/A | - |
| Queue Length (BullMQ) | < 100 | > 500 |
### Infrastructure Metrics
| Metric | Target | Alert Threshold |
| ------------ | ------ | ----------------- |
| CPU Usage | < 70% | > 90% |
| Memory Usage | < 80% | > 95% |
| Disk Usage | < 80% | > 90% |
| Network I/O | N/A | Anomaly detection |
### Database Metrics
| Metric | Target | Alert Threshold |
| --------------------- | ------- | --------------- |
| Query Time (P95) | < 100ms | > 500ms |
| Connection Pool Usage | < 80% | > 95% |
| Slow Queries | 0 | > 10/min |
| Replication Lag | 0s | > 30s |
---
## 🔍 Health Checks
### Backend Health Endpoint
```typescript
// File: backend/src/health/health.controller.ts
import { Controller, Get, Inject } from '@nestjs/common';
import {
  DiskHealthIndicator,
  HealthCheck,
  HealthCheckError,
  HealthCheckResult,
  HealthCheckService,
  HealthIndicatorResult,
  TypeOrmHealthIndicator,
} from '@nestjs/terminus';
/**
 * Serves `GET /health`, combining database, disk and Redis checks into a
 * single Terminus health report.
 */
@Controller('health')
export class HealthController {
  constructor(
    private health: HealthCheckService,
    private db: TypeOrmHealthIndicator,
    private disk: DiskHealthIndicator,
    // NOTE(review): 'REDIS_CLIENT' is a placeholder injection token. Bind it
    // to whichever provider the project registers for its Redis connection.
    // Only `ping()` is required, so the dependency is typed structurally.
    @Inject('REDIS_CLIENT')
    private redis: { ping(): Promise<string> }
  ) {}

  /**
   * Runs every health indicator and returns the aggregated result.
   *
   * @returns the Terminus report produced by `HealthCheckService.check`.
   */
  @Get()
  @HealthCheck()
  check(): Promise<HealthCheckResult> {
    return this.health.check([
      // Database health
      () => this.db.pingCheck('database'),
      // Disk health (root path, 0.9 threshold handed to Terminus)
      () =>
        this.disk.checkStorage('storage', {
          path: '/',
          thresholdPercent: 0.9,
        }),
      // Redis health
      () => this.checkRedis(),
    ]);
  }

  /**
   * Pings Redis and reports it under the `redis` key.
   *
   * @returns `{ redis: { status: 'up' } }` when the server answers `PONG`.
   * @throws HealthCheckError when the ping rejects or returns anything else,
   *   so the failure is reported as a failed indicator instead of surfacing
   *   as an unhandled exception or a silently ignored "down" payload.
   */
  private async checkRedis(): Promise<HealthIndicatorResult> {
    let reply: string;
    try {
      reply = await this.redis.ping();
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      throw new HealthCheckError('Redis ping failed', {
        redis: { status: 'down', message },
      });
    }

    if (reply !== 'PONG') {
      throw new HealthCheckError('Redis ping failed', {
        redis: { status: 'down' },
      });
    }
    return { redis: { status: 'up' } };
  }
}
```
### Health Check Response
```json
{
"status": "ok",
"info": {
"database": {
"status": "up"
},
"storage": {
"status": "up",
"freePercent": 0.75
},
"redis": {
"status": "up"
}
},
"error": {},
"details": {
"database": {
"status": "up"
},
"storage": {
"status": "up",
"freePercent": 0.75
},
"redis": {
"status": "up"
}
}
}
```
---
## 🐳 Docker Container Monitoring
### Health Check in docker-compose.yml
```yaml
services:
backend:
healthcheck:
test: ['CMD', 'curl', '-f', 'http://localhost:3000/health']
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
mariadb:
healthcheck:
test: ['CMD', 'mysqladmin', 'ping', '-h', 'localhost']
interval: 30s
timeout: 10s
retries: 3
redis:
healthcheck:
test: ['CMD', 'redis-cli', 'ping']
interval: 30s
timeout: 10s
retries: 3
```
### Monitor Container Status
```bash
#!/bin/bash
# File: /scripts/monitor-containers.sh
# Prints an ALERT line for every listed container whose Docker health status
# is anything other than "healthy".
CONTAINERS=("lcbp3-backend" "lcbp3-frontend" "lcbp3-mariadb" "lcbp3-redis")

# The template guards against containers that define no healthcheck, which
# would otherwise make `docker inspect` fail and leave the status empty.
HEALTH_FORMAT='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}'

for CONTAINER in "${CONTAINERS[@]}"; do
  # A failing inspect (e.g. the container does not exist) is reported
  # explicitly instead of producing an empty status.
  HEALTH=$(docker inspect --format="$HEALTH_FORMAT" "$CONTAINER" 2>/dev/null) || HEALTH="not-found"
  if [ "$HEALTH" != "healthy" ]; then
    echo "ALERT: $CONTAINER is $HEALTH"
    # Send alert (email, Slack, etc.)
  fi
done
```
---
## 📈 Application Performance Monitoring (APM)
### Log-Based Monitoring (MVP Phase)
```typescript
// File: backend/src/common/interceptors/performance.interceptor.ts
import {
Injectable,
NestInterceptor,
ExecutionContext,
CallHandler,
} from '@nestjs/common';
import { Observable } from 'rxjs';
import { tap } from 'rxjs/operators';
import { logger } from 'src/config/logger.config';
/**
 * Logs method, URL and elapsed time for every HTTP request that passes
 * through it, warns on slow requests, and logs failures with their message.
 */
@Injectable()
export class PerformanceInterceptor implements NestInterceptor {
  /** Requests taking longer than this many milliseconds also get a warn log. */
  private static readonly SLOW_REQUEST_THRESHOLD_MS = 1000;

  /**
   * Wraps the downstream handler with timing and logging side effects.
   *
   * @param context execution context; only its HTTP view is used.
   * @param next handler whose observable is tapped (values pass through unchanged).
   * @returns the handler's observable with logging attached.
   */
  intercept(context: ExecutionContext, next: CallHandler): Observable<unknown> {
    // Resolve the HTTP context once; it is needed on both success and error.
    const http = context.switchToHttp();
    const request = http.getRequest();
    const start = Date.now();

    return next.handle().pipe(
      tap({
        next: () => {
          const duration = Date.now() - start;
          logger.info('Request completed', {
            method: request.method,
            url: request.url,
            statusCode: http.getResponse().statusCode,
            duration: `${duration}ms`,
            userId: request.user?.user_id,
          });
          // Alert on slow requests
          if (duration > PerformanceInterceptor.SLOW_REQUEST_THRESHOLD_MS) {
            logger.warn('Slow request detected', {
              method: request.method,
              url: request.url,
              duration: `${duration}ms`,
            });
          }
        },
        error: (error: unknown) => {
          const duration = Date.now() - start;
          logger.error('Request failed', {
            method: request.method,
            url: request.url,
            duration: `${duration}ms`,
            // Anything can be thrown; avoid logging `undefined` for non-Error values.
            error: error instanceof Error ? error.message : String(error),
          });
        },
      })
    );
  }
}
```
---
## 🚨 Alerting Rules
### Critical Alerts (Immediate Action Required)
| Alert | Condition | Action |
| --------------- | ------------------------------------------- | --------------------------- |
| Service Down | Health check fails for 3 consecutive checks | Page on-call engineer |
| Database Down | Cannot connect to database | Page DBA + on-call engineer |
| Disk Full | Disk usage > 95% | Page operations team |
| High Error Rate | Error rate > 10% for 5 min | Page on-call engineer |
### Warning Alerts (Review Within 1 Hour)
| Alert | Condition | Action |
| ------------- | ----------------------- | ---------------------- |
| High CPU | CPU > 90% for 10 min | Notify operations team |
| High Memory | Memory > 95% for 10 min | Notify operations team |
| Slow Queries | > 50 slow queries/min | Notify DBA |
| Queue Backlog | BullMQ queue > 500 jobs | Notify backend team |
### Info Alerts (Review During Business Hours)
| Alert | Condition | Action |
| ------------------ | ------------------------------------ | --------------------- |
| Backup Failed | Daily backup job failed | Email operations team |
| SSL Expiring | SSL certificate expires in < 30 days | Email operations team |
| Disk Space Warning | Disk usage > 80% | Email operations team |
---
## 📧 Alert Notification Channels
### Email Alerts
```bash
#!/bin/bash
# File: /scripts/send-alert-email.sh
TO="ops-team@example.com"
SUBJECT="$1"
MESSAGE="$2"
echo "$MESSAGE" | mail -s "[LCBP3-DMS] $SUBJECT" "$TO"
```
### Slack Alerts
```bash
#!/bin/bash
# File: /scripts/send-alert-slack.sh
# Usage: send-alert-slack.sh "<message>"
# Posts the message to the Slack incoming webhook below.
WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
MESSAGE="$1"

# Escape backslashes and double quotes so the message cannot break the JSON
# payload. Literal newlines are not escaped here; pass single-line messages.
ESCAPED_MESSAGE=$(printf '%s' "$MESSAGE" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')

curl -X POST -H 'Content-type: application/json' \
  --data "{\"text\":\"🚨 LCBP3-DMS Alert: $ESCAPED_MESSAGE\"}" \
  "$WEBHOOK_URL"
```
---
## 📊 Monitoring Dashboard
### Metrics to Display
**System Overview:**
- Service status (up/down)
- Overall system health score
- Active user count
- Request rate (req/s)
**Performance:**
- API response time (P50, P95, P99)
- Database query time
- Queue processing time
**Resources:**
- CPU usage %
- Memory usage %
- Disk usage %
- Network I/O
**Business Metrics:**
- Documents created today
- Workflows completed today
- Active correspondences
- Pending approvals
---
## 🔧 Log Aggregation
### Centralized Logging with Docker
Configure the Docker logging driver in `/etc/docker/daemon.json`:
```json
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3",
"labels": "service,environment"
}
}
```
### View Aggregated Logs
```bash
# View all LCBP3 container logs
docker-compose logs -f --tail=100
# View specific service logs
docker logs lcbp3-backend -f --since=1h
# Search logs
docker logs lcbp3-backend 2>&1 | grep "ERROR"
# Export logs for analysis
docker logs lcbp3-backend > backend-logs.txt
```
---
## 📈 Performance Baseline
### Establish Baselines
Run load tests to establish performance baselines:
```bash
# Install Apache Bench
apt-get install apache2-utils
# Test API endpoint
ab -n 1000 -c 10 \
-H "Authorization: Bearer <TOKEN>" \
https://lcbp3-dms.example.com/api/correspondences
# Results to record:
# - Requests per second
# - Mean response time
# - P95 response time
# - Error rate
```
### Regular Performance Testing
- **Weekly:** Quick health check (100 requests)
- **Monthly:** Full load test (10,000 requests)
- **Quarterly:** Stress test (find breaking point)
---
## ✅ Monitoring Checklist
### Daily
- [ ] Check service health dashboard
- [ ] Review error logs
- [ ] Verify backup completion
- [ ] Check disk space
### Weekly
- [ ] Review performance metrics trends
- [ ] Analyze slow query log
- [ ] Check SSL certificate expiry
- [ ] Review security alerts
### Monthly
- [ ] Capacity planning review
- [ ] Update monitoring thresholds
- [ ] Test alert notifications
- [ ] Review and tune performance
---
## 🔗 Related Documents
- [Backup & Recovery](04-04-backup-recovery.md)
- [Incident Response](04-07-incident-response.md)
- [ADR-010: Logging Strategy](../05-decisions/ADR-010-logging-monitoring-strategy.md)
---
**Version:** 1.8.0
**Last Review:** 2025-12-01
**Next Review:** 2026-03-01
---
# การติดตั้ง Monitoring Stack บน ASUSTOR
## **📝 คำอธิบายและข้อควรพิจารณา**
> ⚠️ **หมายเหตุ**: Monitoring Stack ทั้งหมดติดตั้งบน **ASUSTOR AS5403T** ไม่ใช่ QNAP
> เพื่อแยก Application workload ออกจาก Infrastructure/Monitoring workload
Stack สำหรับ Monitoring ประกอบด้วย:
| Service | Port | Purpose | Host |
| :---------------- | :--------------------------- | :-------------------------------- | :------ |
| **Prometheus** | 9090 | เก็บ Metrics และ Time-series data | ASUSTOR |
| **Grafana** | 3000 | Dashboard สำหรับแสดงผล Metrics | ASUSTOR |
| **Node Exporter** | 9100 | เก็บ Metrics ของ Host system | Both |
| **cAdvisor** | 8080 (ASUSTOR) / 8088 (QNAP) | เก็บ Metrics ของ Docker containers | Both |
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
| **Loki** | 3100 | Log aggregation | ASUSTOR |
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
---
## 🏗️ Architecture Overview
```
┌─────────────────────────────────────────────────────────────────────────┐
│ ASUSTOR AS5403T (Monitoring Hub) │
├─────────────────────────────────────────────────────────────────────────┤
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │───▶│ Grafana │ │ Uptime Kuma │ │
│ │ :9090 │ │ :3000 │ │ :3001 │ │
│ └──────┬──────┘ └─────────────┘ └─────────────┘ │
│ │ │
│ │ Scrape Metrics │
│ ▼ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │node-exporter│ │ cAdvisor │ │ Promtail │ │
│ │ :9100 │ │ :8080 │ │ (Log Ship) │ │
│ │ (Local) │ │ (Local) │ │ (Local) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────┘
│ Remote Scrape
┌─────────────────────────────────────────────────────────────────────────┐
│ QNAP TS-473A (App Server) │
├─────────────────────────────────────────────────────────────────────────┤
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │node-exporter│ │ cAdvisor │ │ Backend │ │
│ │ :9100 │ │ :8088 │ │ /metrics │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────┘
```
---
## กำหนดสิทธิ (บน ASUSTOR)
```bash
# SSH เข้า ASUSTOR
ssh admin@192.168.10.9
# สร้าง Directory
mkdir -p /volume1/np-dms/monitoring/prometheus/data
mkdir -p /volume1/np-dms/monitoring/prometheus/config
mkdir -p /volume1/np-dms/monitoring/grafana/data
mkdir -p /volume1/np-dms/monitoring/uptime-kuma/data
mkdir -p /volume1/np-dms/monitoring/loki/data
mkdir -p /volume1/np-dms/monitoring/promtail/config
# กำหนดสิทธิ์ให้ตรงกับ User ID ใน Container
# Prometheus (UID 65534 - nobody)
chown -R 65534:65534 /volume1/np-dms/monitoring/prometheus
chmod -R 750 /volume1/np-dms/monitoring/prometheus
# Grafana (UID 472)
chown -R 472:472 /volume1/np-dms/monitoring/grafana/data
chmod -R 750 /volume1/np-dms/monitoring/grafana/data
# Uptime Kuma (UID 1000)
chown -R 1000:1000 /volume1/np-dms/monitoring/uptime-kuma/data
chmod -R 750 /volume1/np-dms/monitoring/uptime-kuma/data
# Loki (UID 10001)
chown -R 10001:10001 /volume1/np-dms/monitoring/loki/data
chmod -R 750 /volume1/np-dms/monitoring/loki/data
# Promtail (Runs as root to read docker logs - no specific chown needed for config dir if created by admin)
# But ensure config file is readable
chmod -R 755 /volume1/np-dms/monitoring/promtail/config
```
---
## 🔗 สร้าง Docker Network (ทำครั้งแรกครั้งเดียว)
> ⚠️ **ต้องสร้าง network ก่อน deploy docker-compose ทุกตัว** เพราะทุก service ใช้ `lcbp3` เป็น external network
### สร้างผ่าน Portainer (แนะนำ)
1. เปิด **Portainer** → เลือก Environment ของ ASUSTOR
2. ไปที่ **Networks** → **Add network**
3. กรอกข้อมูล:
- **Name:** `lcbp3`
- **Driver:** `bridge`
4. กด **Create the network**
### สร้างผ่าน SSH
```bash
# SSH เข้า ASUSTOR
ssh admin@192.168.10.9
# สร้าง external network
docker network create lcbp3
# ตรวจสอบ
docker network ls | grep lcbp3
docker network inspect lcbp3
```
> 📖 **QNAP** ก็ต้องมี network ชื่อ `lcbp3` เช่นกัน (สร้างผ่าน Container Station หรือ SSH)
> ดู [README.md Quick Reference](README.md#-quick-reference) สำหรับคำสั่งบน QNAP
---
## Note: NPM Proxy Configuration (NPM รันบน QNAP → Forward ไป ASUSTOR)
> ⚠️ เนื่องจาก NPM อยู่บน **QNAP** แต่ Monitoring services อยู่บน **ASUSTOR**
> ต้องใช้ **IP Address** (`192.168.10.9`) แทนชื่อ container (resolve ข้ามเครื่องไม่ได้)
| Domain Names | Scheme | Forward Hostname | Forward Port | Block Common Exploits | Websockets | Force SSL | HTTP/2 |
| :--------------------- | :----- | :--------------- | :----------- | :-------------------- | :--------- | :-------- | :----- |
| grafana.np-dms.work | `http` | `192.168.10.9` | 3000 | [x] | [x] | [x] | [x] |
| prometheus.np-dms.work | `http` | `192.168.10.9` | 9090 | [x] | [ ] | [x] | [x] |
| uptime.np-dms.work | `http` | `192.168.10.9` | 3001 | [x] | [x] | [x] | [x] |
---
## Docker Compose File (ASUSTOR)
```yaml
# File: /volume1/np-dms/monitoring/docker-compose.yml
# DMS Container v1.8.0: Application name: lcbp3-monitoring
# Deploy on: ASUSTOR AS5403T
# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail
x-restart: &restart_policy
restart: unless-stopped
x-logging: &default_logging
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "5"
networks:
lcbp3:
external: true
services:
# ----------------------------------------------------------------
# 1. Prometheus (Metrics Collection & Storage)
# ----------------------------------------------------------------
prometheus:
<<: [*restart_policy, *default_logging]
image: prom/prometheus:v2.48.0
container_name: prometheus
stdin_open: true
tty: true
deploy:
resources:
limits:
cpus: "1.0"
memory: 1G
reservations:
cpus: "0.25"
memory: 256M
environment:
TZ: "Asia/Bangkok"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports:
- "9090:9090"
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
- "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
# ----------------------------------------------------------------
# 2. Grafana (Dashboard & Visualization)
# ----------------------------------------------------------------
grafana:
  <<: [*restart_policy, *default_logging]
  image: grafana/grafana:10.2.2
  container_name: grafana
  stdin_open: true
  tty: true
  deploy:
    resources:
      limits:
        cpus: "1.0"
        memory: 512M
      reservations:
        cpus: "0.25"
        memory: 128M
  environment:
    TZ: "Asia/Bangkok"
    GF_SECURITY_ADMIN_USER: admin
    # Do not commit the admin password. Supply it through the stack's
    # environment (Portainer "Environment variables" or an .env file next to
    # this compose file). Deployment aborts with this message if it is unset.
    GF_SECURITY_ADMIN_PASSWORD: "${GF_SECURITY_ADMIN_PASSWORD:?Set GF_SECURITY_ADMIN_PASSWORD before deploying}"
    GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
    GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
  ports:
    - "3000:3000"
  networks:
    - lcbp3
  volumes:
    - "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
  depends_on:
    - prometheus
  healthcheck:
    test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
    interval: 30s
    timeout: 10s
    retries: 3
# ----------------------------------------------------------------
# 3. Uptime Kuma (Service Availability Monitoring)
# ----------------------------------------------------------------
uptime-kuma:
<<: [*restart_policy, *default_logging]
image: louislam/uptime-kuma:1
container_name: uptime-kuma
deploy:
resources:
limits:
cpus: "0.5"
memory: 256M
environment:
TZ: "Asia/Bangkok"
ports:
- "3001:3001"
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
interval: 30s
timeout: 10s
retries: 3
# ----------------------------------------------------------------
# 4. Node Exporter (Host Metrics - ASUSTOR)
# ----------------------------------------------------------------
node-exporter:
<<: [*restart_policy, *default_logging]
image: prom/node-exporter:v1.7.0
container_name: node-exporter
deploy:
resources:
limits:
cpus: "0.5"
memory: 128M
environment:
TZ: "Asia/Bangkok"
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports:
- "9100:9100"
networks:
- lcbp3
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
interval: 30s
timeout: 10s
retries: 3
# ----------------------------------------------------------------
# 5. cAdvisor (Container Metrics - ASUSTOR)
# ----------------------------------------------------------------
cadvisor:
  <<: [*restart_policy, *default_logging]
  image: gcr.io/cadvisor/cadvisor:v0.47.2
  container_name: cadvisor
  privileged: true
  devices:
    - /dev/kmsg
  deploy:
    resources:
      limits:
        cpus: "0.5"
        memory: 256M
  environment:
    TZ: "Asia/Bangkok"
  ports:
    # cAdvisor listens on 8080 inside the container (the healthcheck below and
    # the Prometheus target cadvisor:8080 both use it). Publish that port so
    # http://192.168.10.9:8080/healthz from the verification step is reachable.
    - "8080:8080"
  networks:
    - lcbp3
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /dev/disk/:/dev/disk:ro
  healthcheck:
    test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
    interval: 30s
    timeout: 10s
    retries: 3
# ----------------------------------------------------------------
# 6. Loki (Log Aggregation)
# ----------------------------------------------------------------
loki:
<<: [*restart_policy, *default_logging]
image: grafana/loki:2.9.0
container_name: loki
deploy:
resources:
limits:
cpus: "0.5"
memory: 512M
environment:
TZ: "Asia/Bangkok"
command: -config.file=/etc/loki/local-config.yaml
ports:
- "3100:3100"
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/loki/data:/loki"
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
interval: 30s
timeout: 10s
retries: 3
# ----------------------------------------------------------------
# 7. Promtail (Log Shipper)
# ----------------------------------------------------------------
promtail:
<<: [*restart_policy, *default_logging]
image: grafana/promtail:2.9.0
container_name: promtail
user: "0:0"
deploy:
resources:
limits:
cpus: "0.5"
memory: 256M
environment:
TZ: "Asia/Bangkok"
command: -config.file=/etc/promtail/promtail-config.yml
networks:
- lcbp3
volumes:
- "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
- "/var/run/docker.sock:/var/run/docker.sock:ro"
- "/var/lib/docker/containers:/var/lib/docker/containers:ro"
depends_on:
- loki
```
---
## QNAP Node Exporter & cAdvisor
ติดตั้ง node-exporter และ cAdvisor บน QNAP เพื่อให้ Prometheus บน ASUSTOR scrape metrics ได้:
```yaml
# File: /share/np-dms/monitoring/docker-compose.yml (QNAP)
# เฉพาะ exporters เท่านั้น - metrics ถูก scrape โดย Prometheus บน ASUSTOR
version: '3.8'
networks:
lcbp3:
external: true
services:
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: node-exporter
restart: unless-stopped
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports:
- "9100:9100"
networks:
- lcbp3
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.2
container_name: cadvisor
restart: unless-stopped
privileged: true
ports:
- "8088:8080"
networks:
- lcbp3
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /sys/fs/cgroup:/sys/fs/cgroup:ro
mysqld-exporter:
image: prom/mysqld-exporter:v0.15.0
container_name: mysqld-exporter
restart: unless-stopped
user: root
command:
- '--config.my-cnf=/etc/mysql/my.cnf'
ports:
- "9104:9104"
networks:
- lcbp3
volumes:
- "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
```
---
## Prometheus Configuration
สร้างไฟล์ `/volume1/np-dms/monitoring/prometheus/config/prometheus.yml` บน ASUSTOR:
```yaml
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
# Prometheus self-monitoring (ASUSTOR)
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# ============================================
# ASUSTOR Metrics (Local)
# ============================================
# Host metrics from Node Exporter (ASUSTOR)
- job_name: 'asustor-node'
static_configs:
- targets: ['node-exporter:9100']
labels:
host: 'asustor'
# Container metrics from cAdvisor (ASUSTOR)
- job_name: 'asustor-cadvisor'
static_configs:
- targets: ['cadvisor:8080']
labels:
host: 'asustor'
# ============================================
# QNAP Metrics (Remote - 192.168.10.8)
# ============================================
# Host metrics from Node Exporter (QNAP)
- job_name: 'qnap-node'
static_configs:
- targets: ['192.168.10.8:9100']
labels:
host: 'qnap'
# Container metrics from cAdvisor (QNAP)
- job_name: 'qnap-cadvisor'
static_configs:
- targets: ['192.168.10.8:8088']
labels:
host: 'qnap'
# Backend NestJS application (QNAP)
- job_name: 'backend'
static_configs:
- targets: ['192.168.10.8:3000']
labels:
host: 'qnap'
metrics_path: '/metrics'
# MariaDB Exporter (QNAP)
- job_name: 'mariadb'
static_configs:
- targets: ['192.168.10.8:9104']
labels:
host: 'qnap'
```
---
## Uptime Kuma Monitors
เมื่อ Uptime Kuma พร้อมใช้งาน ให้เพิ่ม monitors ต่อไปนี้:
| Monitor Name | Type | URL / Host | Interval |
| :------------ | :--- | :--------------------------------- | :------- |
| QNAP NPM | HTTP | https://npm.np-dms.work | 60s |
| Frontend | HTTP | https://lcbp3.np-dms.work | 60s |
| Backend API | HTTP | https://backend.np-dms.work/health | 60s |
| MariaDB | TCP | 192.168.10.8:3306 | 60s |
| Redis | TCP | 192.168.10.8:6379 | 60s |
| Elasticsearch | HTTP | http://192.168.10.8:9200 | 60s |
| Gitea | HTTP | https://git.np-dms.work | 60s |
| n8n | HTTP | https://n8n.np-dms.work | 60s |
| Grafana | HTTP | https://grafana.np-dms.work | 60s |
| QNAP Host | Ping | 192.168.10.8 | 60s |
| ASUSTOR Host | Ping | 192.168.10.9 | 60s |
---
## Grafana Dashboards
### Recommended Dashboards to Import
| Dashboard ID | Name | Purpose |
| :----------- | :--------------------------- | :----------------------------- |
| 1860 | Node Exporter Full | Host system metrics |
| 14282 | cAdvisor exporter | Container metrics |
| 11074 | Node Exporter for Prometheus | Node overview |
| 893 | Docker and Container | Docker overview |
| 7362 | MySQL | MySQL view |
| 1214 | Redis | Redis view |
| 14204 | Elasticsearch | Elasticsearch view |
| 13106 | MySQL/MariaDB Overview | Detailed MySQL/MariaDB metrics |
### Import Dashboard via Grafana UI
1. Go to **Dashboards → Import**
2. Enter Dashboard ID (e.g., `1860`)
3. Select Prometheus data source
4. Click **Import**
---
## 🚀 Deploy lcbp3-monitoring บน ASUSTOR
### 📋 Prerequisites Checklist
| # | ขั้นตอน | Status |
| :--- | :------------------------------------------------------------------------------------------------- | :----- |
| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ |
| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ |
| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ |
| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ |
| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ |
---
### Step 1: สร้าง prometheus.yml
```bash
# SSH into the ASUSTOR
ssh admin@192.168.10.9
# Write prometheus.yml. The scrape jobs mirror the reference configuration in
# the "Prometheus Configuration" section, including the MariaDB exporter job.
cat > /volume1/np-dms/monitoring/prometheus/config/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'asustor-node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          host: 'asustor'

  - job_name: 'asustor-cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          host: 'asustor'

  - job_name: 'qnap-node'
    static_configs:
      - targets: ['192.168.10.8:9100']
        labels:
          host: 'qnap'

  - job_name: 'qnap-cadvisor'
    static_configs:
      - targets: ['192.168.10.8:8088']
        labels:
          host: 'qnap'

  - job_name: 'backend'
    static_configs:
      - targets: ['192.168.10.8:3000']
        labels:
          host: 'qnap'
    metrics_path: '/metrics'

  - job_name: 'mariadb'
    static_configs:
      - targets: ['192.168.10.8:9104']
        labels:
          host: 'qnap'
EOF
# Verify the written file
cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
```
### Step 1.2: สร้าง promtail-config.yml
ต้องสร้าง Config ให้ Promtail อ่าน logs จาก Docker containers และส่งไป Loki:
```bash
# สร้างไฟล์ promtail-config.yml
cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://loki:3100/loki/api/v1/push
scrape_configs:
- job_name: docker
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
relabel_configs:
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container'
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'stream'
EOF
# ตรวจสอบ
cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
```
---
### ขั้นตอนการเตรียมระบบที่ QNAP (ก่อน Deploy Stack)
#### 1. สร้าง Monitoring User ใน MariaDB
รันคำสั่ง SQL นี้ผ่าน **phpMyAdmin** หรือ `docker exec`:
```sql
CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
FLUSH PRIVILEGES;
```
#### 2. สร้างไฟล์คอนฟิก .my.cnf บน QNAP
เพื่อให้ `mysqld-exporter` อ่านรหัสผ่านที่มีตัวอักษรพิเศษได้ถูกต้อง:
1. **SSH เข้า QNAP** (หรือใช้ File Station สร้าง Folder):
```bash
ssh admin@192.168.10.8
```
2. **สร้าง Directory สำหรับเก็บ Config**:
```bash
mkdir -p /share/np-dms/monitoring/mysqld-exporter
```
3. **สร้างไฟล์ .my.cnf**:
```bash
cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
[client]
user=exporter
password=Center2025
host=mariadb
EOF
```
4. **กำหนดสิทธิ์ไฟล์** (เพื่อให้ Container อ่านไฟล์ได้):
```bash
chmod 644 /share/np-dms/monitoring/mysqld-exporter/.my.cnf
```
---
### Step 2: Deploy ผ่าน Portainer (แนะนำ)
1. เปิด **Portainer** → เลือก Environment ของ **ASUSTOR**
2. ไปที่ **Stacks** → **Add stack**
3. กรอกข้อมูล:
- **Name:** `lcbp3-monitoring`
- **Build method:** เลือก **Web editor**
4. วาง (Paste) เนื้อหาจาก [Docker Compose File (ASUSTOR)](#docker-compose-file-asustor) ด้านบน
5. กด **Deploy the stack**
> ⚠️ **สำคัญ:** ตรวจสอบ Password ของ Grafana (`GF_SECURITY_ADMIN_PASSWORD`) ใน docker-compose ก่อน deploy
### Deploy ผ่าน SSH (วิธีสำรอง)
```bash
# SSH เข้า ASUSTOR
ssh admin@192.168.10.9
# คัดลอก docker-compose.yml ไปยัง path
# (วางไฟล์ที่ /volume1/np-dms/monitoring/docker-compose.yml)
# Deploy
cd /volume1/np-dms/monitoring
docker compose up -d
# ตรวจสอบ container status
docker compose ps
```
---
### Step 3: Verify Services
```bash
# ตรวจสอบ containers ทั้งหมด
docker ps --filter "name=prometheus" --filter "name=grafana" \
--filter "name=uptime-kuma" --filter "name=node-exporter" \
--filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
```
| Service | วิธีตรวจสอบ | Expected Result |
| :---------------- | :----------------------------------------------------------------- | :------------------------------------ |
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` |
| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login |
| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup |
| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output |
| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` |
| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` |
| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success |
---
### Step 4: Deploy QNAP Exporters
ติดตั้ง node-exporter และ cAdvisor บน QNAP เพื่อให้ Prometheus scrape ข้ามเครื่องได้:
#### ผ่าน Container Station (QNAP)
1. เปิด **Container Station** บน QNAP Web UI
2. ไปที่ **Applications** → **Create**
3. ตั้งชื่อ Application: `lcbp3-exporters`
4. วาง (Paste) เนื้อหาจาก [QNAP Node Exporter & cAdvisor](#qnap-node-exporter--cadvisor)
5. กด **Create**
#### ตรวจสอบจาก ASUSTOR
```bash
# ตรวจว่า Prometheus scrape QNAP ได้
curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'
# หรือเปิด Prometheus UI → Targets
# URL: http://192.168.10.9:9090/targets
# ดูว่า qnap-node, qnap-cadvisor เป็น State: UP
```
---
### Step 5: ตั้งค่า Grafana & Uptime Kuma
#### Grafana — First Login
1. เปิด `https://grafana.np-dms.work`
2. Login: `admin` / `Center#2025` (หรือ password ที่ตั้งไว้)
3. ไปที่ **Connections** → **Data sources** → **Add data source**
4. เลือก **Prometheus**
- URL: `http://prometheus:9090`
- กด **Save & Test** → ต้องขึ้น ✅
5. Import Dashboards (ดูรายละเอียดในหัวข้อ [6. Grafana Dashboards Setup](#6-grafana-dashboards-setup))
#### Uptime Kuma — First Setup
1. เปิด `https://uptime.np-dms.work`
2. สร้าง Admin account
3. เพิ่ม Monitors ตาม [ตาราง Uptime Kuma Monitors](#uptime-kuma-monitors)
---
### 6. Grafana Dashboards Setup
เพื่อการ Monitor ที่สมบูรณ์ แนะนำให้ Import Dashboards ต่อไปนี้:
#### 6.1 Host Monitoring (Node Exporter)
* **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network)
* **Dashboard ID:** `1860` (Node Exporter Full)
* **วิธี Import:**
1. ไปที่ **Dashboards** → **New** → **Import**
2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load**
3. เลือก Data source: **Prometheus**
4. กด **Import**
#### 6.2 Container Monitoring (cAdvisor)
* **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย)
* **Dashboard ID:** `14282` (Cadvisor exporter)
* **วิธี Import:**
1. ใส่เลข `14282` กด **Load**
2. เลือก Data source: **Prometheus**
3. กด **Import**
#### 6.3 Logs Monitoring (Loki Integration)
เพื่อให้ Dashboard ของ Container แสดง Logs จาก Loki ได้ด้วย:
1. เปิด Dashboard **Cadvisor exporter** ที่เพิ่ง Import มา
2. กดปุ่ม **Add visualization** (หรือ Edit dashboard)
3. เลือก Data source: **Loki**
4. ในช่อง Query ใส่: `{container="$name"}`
* *(Note: `$name` มาจาก Variable ของ Dashboard 14282)*
5. ปรับ Visualization type เป็น **Logs**
6. ตั้งชื่อ Panel ว่า **"Container Logs"**
7. กด **Apply** และ **Save Dashboard**
ตอนนี้เราจะเห็นทั้ง **กราฟการกินทรัพยากร** และ **Logs** ของ Container นั้นๆ ในหน้าเดียวกันครับ
#### 6.4 Integrated Dashboard (Recommended)
ผมได้เตรียม JSON file ที่รวม Metrics และ Logs ไว้ให้แล้วครับ:
1. ไปที่ **Dashboards** → **New** → **Import**
2. ลากไฟล์ หรือ Copy เนื้อหาจากไฟล์:
`specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json`
3. กด **Load** และ **Import**
#### 6.5 Backup / Export Dashboards
เมื่อปรับแต่ง Dashboard จนพอใจแล้ว ควร Export เก็บเป็นไฟล์ JSON ไว้ backup หรือ version control:
1. เปิด Dashboard ที่ต้องการ backup
2. ไปที่ปุ่ม **Share Dashboard** (ไอคอน 🔗 หรือ Share มุมซ้ายบน)
3. เลือก Tab **Export**
4. เปิดตัวเลือก **Export for sharing externally** (เพื่อให้ลบ hardcoded value)
5. กด **Save to file**
6. นำไฟล์ JSON มาเก็บไว้ที่ path: `specs/08-infrastructure/grafana/dashboards/`
---
> 📝 **หมายเหตุ**: เอกสารนี้อ้างอิงจาก Architecture Document **v1.8.0** - Monitoring Stack deploy บน ASUSTOR AS5403T
---
## 📈 Document Numbering Specific Monitoring
## 3. Monitoring & Metrics
### 3.1. Prometheus Metrics
#### Key Metrics to Collect
```typescript
// metrics.service.ts
import { Counter, Histogram, Gauge } from 'prom-client';
// Lock acquisition metrics

/**
 * Total lock acquisition attempts, successful or not.
 *
 * Serves as the denominator for failure/success ratios; the Grafana
 * "Lock Acquisition Success Rate" panel queries this series as
 * `docnum_lock_acquisition_total`.
 */
export const lockAcquisitionTotal = new Counter({
  name: 'docnum_lock_acquisition_total',
  help: 'Total number of lock acquisition attempts',
  labelNames: ['project', 'type'],
});

/**
 * Lock acquisition time in milliseconds, labelled by `project` and `type`.
 * Bucket upper bounds run from 10 ms to 5000 ms.
 */
export const lockAcquisitionDuration = new Histogram({
  name: 'docnum_lock_acquisition_duration_ms',
  help: 'Lock acquisition time in milliseconds',
  labelNames: ['project', 'type'],
  buckets: [10, 50, 100, 200, 500, 1000, 2000, 5000],
});

/**
 * Count of failed lock acquisitions, labelled by `project`, `type` and the
 * failure `reason`.
 */
export const lockAcquisitionFailures = new Counter({
  name: 'docnum_lock_acquisition_failures_total',
  help: 'Total number of lock acquisition failures',
  labelNames: ['project', 'type', 'reason'],
});
// Generation metrics

/** Bucket upper bounds for `docnum_generation_duration_ms`. */
const GENERATION_DURATION_BUCKETS = [100, 200, 500, 1000, 2000, 5000];

/** Bucket upper bounds for `docnum_retry_count`. */
const RETRY_COUNT_BUCKETS = [0, 1, 2, 3, 5, 10];

/**
 * Histogram of total document number generation time, labelled by
 * `project`, `type` and `status`.
 */
export const generationDuration = new Histogram({
  name: 'docnum_generation_duration_ms',
  help: 'Total document number generation time',
  labelNames: ['project', 'type', 'status'],
  buckets: GENERATION_DURATION_BUCKETS,
});

/**
 * Histogram of the number of retries per generation, labelled by
 * `project` and `type`.
 */
export const retryCount = new Histogram({
  name: 'docnum_retry_count',
  help: 'Number of retries per generation',
  labelNames: ['project', 'type'],
  buckets: RETRY_COUNT_BUCKETS,
});
// Connection health gauges

/** Redis connectivity flag: 1 when the connection is up, 0 when it is down. */
export const redisConnectionStatus = new Gauge({
  name: 'docnum_redis_connection_status',
  help: 'Redis connection status (1=up, 0=down)',
});

/** Database connection pool usage, expressed as a percentage. */
export const dbConnectionPoolUsage = new Gauge({
  name: 'docnum_db_connection_pool_usage',
  help: 'Database connection pool usage percentage',
});
```
### 3.2. Prometheus Alert Rules
```yaml
# prometheus/alerts.yml
#
# Alert rules for the document-numbering component. Every rule carries the
# label `component: document-numbering` and a `severity` of critical or warning.
groups:
  - name: document_numbering_alerts
    interval: 30s
    rules:
      # CRITICAL: Redis unavailable
      - alert: RedisUnavailable
        expr: docnum_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          component: document-numbering
        annotations:
          summary: "Redis is unavailable for document numbering"
          description: "System is falling back to DB-only locking. Performance degraded by 30-50%."
          runbook_url: "https://wiki.lcbp3/runbooks/redis-unavailable"

      # CRITICAL: High lock failure rate
      # rate() yields events per second, not a percentage, so failures are
      # divided by attempts to obtain the ratio the summary describes.
      # docnum_lock_acquisition_total is the attempts counter that the Grafana
      # "Lock Acquisition Success Rate" panel also queries. Both sides are
      # aggregated with sum() because only the failures counter has a
      # `reason` label; without aggregation the division would match nothing.
      - alert: HighLockFailureRate
        expr: |
          sum(rate(docnum_lock_acquisition_failures_total[5m]))
            /
          sum(rate(docnum_lock_acquisition_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: critical
          component: document-numbering
        annotations:
          summary: "Lock acquisition failure rate > 10%"
          description: "Check Redis and database performance immediately"
          runbook_url: "https://wiki.lcbp3/runbooks/high-lock-failure"

      # WARNING: Elevated lock failure rate
      # Same failure ratio as HighLockFailureRate, with a lower threshold.
      - alert: ElevatedLockFailureRate
        expr: |
          sum(rate(docnum_lock_acquisition_failures_total[5m]))
            /
          sum(rate(docnum_lock_acquisition_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "Lock acquisition failure rate > 5%"
          description: "Monitor closely. May escalate to critical soon."

      # WARNING: Slow lock acquisition
      # Threshold is in milliseconds (the histogram is recorded in ms).
      - alert: SlowLockAcquisition
        expr: |
          histogram_quantile(0.95,
            rate(docnum_lock_acquisition_duration_ms_bucket[5m])
          ) > 1000
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "P95 lock acquisition time > 1 second"
          description: "Lock acquisition is slower than expected. Check Redis latency."

      # WARNING: High retry count
      # increase() gives the total number of retries over the 1h window;
      # rate() would give retries per second and never match "100 per hour".
      - alert: HighRetryCount
        expr: |
          sum by (project) (
            increase(docnum_retry_count_sum[1h])
          ) > 100
        for: 1h
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "Retry count > 100 per hour in project {{ $labels.project }}"
          description: "High contention detected. Consider scaling."

      # WARNING: Slow generation
      # Threshold is in milliseconds (the histogram is recorded in ms).
      - alert: SlowDocumentNumberGeneration
        expr: |
          histogram_quantile(0.95,
            rate(docnum_generation_duration_ms_bucket[5m])
          ) > 2000
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "P95 generation time > 2 seconds"
          description: "Document number generation is slower than SLA target"
```
### 3.3. AlertManager Configuration
```yaml
# alertmanager/config.yml
#
# Routing overview:
#   - severity=critical -> 'pagerduty-critical', then (continue: true) 'slack-critical'
#   - severity=warning  -> 'slack-warnings'
#   - anything else     -> default receiver 'ops-team' (email)
global:
  resolve_timeout: 5m
  # NOTE(review): confirm this file is rendered (e.g. with envsubst) before
  # Alertmanager loads it; ${VAR} placeholders are presumably not expanded
  # by Alertmanager itself.
  slack_api_url: ${SLACK_WEBHOOK_URL}

route:
  receiver: 'ops-team'
  group_by:
    - 'alertname'
    - 'severity'
    - 'project'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # CRITICAL alerts → PagerDuty + Slack
    # The first route sets continue: true so that the same alert is also
    # evaluated against the Slack route that follows.
    - receiver: 'pagerduty-critical'
      match:
        severity: critical
      continue: true
    - receiver: 'slack-critical'
      match:
        severity: critical
      continue: false

    # WARNING alerts → Slack only
    - receiver: 'slack-warnings'
      match:
        severity: warning

receivers:
  - name: 'pagerduty-critical'
    pagerduty_configs:
      - service_key: ${PAGERDUTY_SERVICE_KEY}
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          resolved: '{{ .Alerts.Resolved | len }}'
          runbook: '{{ .CommonAnnotations.runbook_url }}'

  - name: 'slack-critical'
    slack_configs:
      - channel: '#lcbp3-critical-alerts'
        color: 'danger'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Runbook:* {{ .CommonAnnotations.runbook_url }}

  - name: 'slack-warnings'
    slack_configs:
      - channel: '#lcbp3-alerts'
        color: 'warning'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'

  # NOTE(review): no SMTP settings (smarthost / from address) appear in this
  # file — confirm they are supplied elsewhere, since email delivery
  # presumably needs them.
  - name: 'ops-team'
    email_configs:
      - to: 'ops@example.com'
        subject: '[LCBP3] {{ .GroupLabels.alertname }}'
```
### 3.4. Grafana Dashboard
Dashboard panels ที่สำคัญ:
1. **Lock Acquisition Success Rate** (Gauge)
- Query: `1 - (sum(rate(docnum_lock_acquisition_failures_total[5m])) / sum(rate(docnum_lock_acquisition_total[5m])))`
- Alert threshold: < 0.95 (95%)
2. **Lock Acquisition Time Percentiles** (Graph)
- P50: `histogram_quantile(0.50, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`
- P95: `histogram_quantile(0.95, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`
- P99: `histogram_quantile(0.99, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`
3. **Generation Rate** (Stat)
- Query: `sum(rate(docnum_generation_duration_ms_count[1m])) * 60`
- Unit: documents/minute
4. **Error Rate by Type** (Graph)
- Query: `sum by (reason) (rate(docnum_lock_acquisition_failures_total[5m]))`
5. **Redis Connection Status** (Stat)
- Query: `docnum_redis_connection_status`
- Thresholds: 0 = red, 1 = green
6. **DB Connection Pool Usage** (Gauge)
- Query: `docnum_db_connection_pool_usage`
- Alert threshold: > 80%