# 04.3 Monitoring & Alerting

**Project:** LCBP3-DMS
**Version:** 1.8.0
**Status:** Active
**Owner:** Nattanin Peancharoen / DevOps Team
**Last Updated:** 2026-02-23

> 📍 **Monitoring Hub:** ASUSTOR AS5403T
> 📍 **App Server (Exporters):** QNAP TS-473A

---

## 📖 Overview

This document combines the operational SLAs, Alerting Rules, and Health Checks with the technical deployment instructions for the monitoring stack (Prometheus, Grafana, Loki) across both servers.

---

# Monitoring & Alerting

**Project:** LCBP3-DMS
**Version:** 1.8.0
**Last Updated:** 2025-12-02

---

## 📋 Overview

This document describes monitoring setup, health checks, and alerting rules for LCBP3-DMS.

---

## 🎯 Monitoring Objectives

- **Availability:** System uptime > 99.5%
- **Performance:** API response time < 500ms (P95)
- **Reliability:** Error rate < 1%
- **Capacity:** Resource utilization < 80%

---

## 📊 Key Metrics

### Application Metrics

| Metric                  | Target  | Alert Threshold    |
| ----------------------- | ------- | ------------------ |
| API Response Time (P95) | < 500ms | > 1000ms           |
| Error Rate              | < 1%    | > 5%               |
| Request Rate            | N/A     | Sudden ±50% change |
| Active Users            | N/A     | -                  |
| Queue Length (BullMQ)   | < 100   | > 500              |

### Infrastructure Metrics

| Metric       | Target | Alert Threshold   |
| ------------ | ------ | ----------------- |
| CPU Usage    | < 70%  | > 90%             |
| Memory Usage | < 80%  | > 95%             |
| Disk Usage   | < 80%  | > 90%             |
| Network I/O  | N/A    | Anomaly detection |

### Database Metrics

| Metric                | Target  | Alert Threshold |
| --------------------- | ------- | --------------- |
| Query Time (P95)      | < 100ms | > 500ms         |
| Connection Pool Usage | < 80%   | > 95%           |
| Slow Queries          | 0       | > 10/min        |
| Replication Lag       | 0s      | > 30s           |

---

## 🔍 Health Checks

### Backend Health Endpoint

```typescript
// File: backend/src/health/health.controller.ts
import { Controller, Get } from '@nestjs/common';
import {
  HealthCheck,
  HealthCheckService,
  TypeOrmHealthIndicator,
  DiskHealthIndicator,
} from '@nestjs/terminus';
import Redis from 'ioredis';

@Controller('health')
export class HealthController {
  constructor(
    private health: HealthCheckService,
    private db: TypeOrmHealthIndicator,
    private disk: DiskHealthIndicator,
    // Redis client (ioredis) assumed to be provided by the app's Redis module
    private redis: Redis
  ) {}

  @Get()
  @HealthCheck()
  check() {
    return this.health.check([
      // Database health
      () => this.db.pingCheck('database'),

      // Disk health
      () =>
        this.disk.checkStorage('storage', {
          path: '/',
          thresholdPercent: 0.9,
        }),

      // Redis health
      async () => {
        const pong = await this.redis.ping();
        return { redis: { status: pong === 'PONG' ? 'up' : 'down' } };
      },
    ]);
  }
}
```

### Health Check Response

```json
{
  "status": "ok",
  "info": {
    "database": {
      "status": "up"
    },
    "storage": {
      "status": "up",
      "freePercent": 0.75
    },
    "redis": {
      "status": "up"
    }
  },
  "error": {},
  "details": {
    "database": {
      "status": "up"
    },
    "storage": {
      "status": "up",
      "freePercent": 0.75
    },
    "redis": {
      "status": "up"
    }
  }
}
```
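
A quick way to exercise the endpoint from a shell (a minimal sketch; the host/port and the `jq` dependency are assumptions, not part of the deployed scripts):

```bash
# Probe the health endpoint and fail non-zero when any indicator is down
STATUS=$(curl -s http://localhost:3000/health | jq -r '.status')

if [ "$STATUS" != "ok" ]; then
  echo "Health check failed: status=$STATUS"
  exit 1
fi
echo "All health indicators up"
```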

---

## 🐳 Docker Container Monitoring

### Health Check in docker-compose.yml

```yaml
services:
  backend:
    healthcheck:
      test: ['CMD', 'curl', '-f', 'http://localhost:3000/health']
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  mariadb:
    healthcheck:
      test: ['CMD', 'mysqladmin', 'ping', '-h', 'localhost']
      interval: 30s
      timeout: 10s
      retries: 3

  redis:
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']
      interval: 30s
      timeout: 10s
      retries: 3
```

### Monitor Container Status

```bash
#!/bin/bash
# File: /scripts/monitor-containers.sh

# Check all containers are healthy
CONTAINERS=("lcbp3-backend" "lcbp3-frontend" "lcbp3-mariadb" "lcbp3-redis")

for CONTAINER in "${CONTAINERS[@]}"; do
  HEALTH=$(docker inspect --format='{{.State.Health.Status}}' "$CONTAINER" 2>/dev/null)

  if [ "$HEALTH" != "healthy" ]; then
    echo "ALERT: $CONTAINER is ${HEALTH:-missing}"
    # Send alert (email, Slack, etc.)
  fi
done
```
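
To run this check continuously, a cron entry such as the following can be used (a sketch; the five-minute cadence and log path are assumptions):

```bash
# crontab entry: run the container health check every 5 minutes
*/5 * * * * /scripts/monitor-containers.sh >> /var/log/lcbp3-monitor.log 2>&1
```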

---

## 📈 Application Performance Monitoring (APM)

### Log-Based Monitoring (MVP Phase)

```typescript
// File: backend/src/common/interceptors/performance.interceptor.ts
import {
  Injectable,
  NestInterceptor,
  ExecutionContext,
  CallHandler,
} from '@nestjs/common';
import { Observable } from 'rxjs';
import { tap } from 'rxjs/operators';
import { logger } from 'src/config/logger.config';

@Injectable()
export class PerformanceInterceptor implements NestInterceptor {
  intercept(context: ExecutionContext, next: CallHandler): Observable<any> {
    const request = context.switchToHttp().getRequest();
    const start = Date.now();

    return next.handle().pipe(
      tap({
        next: () => {
          const duration = Date.now() - start;

          logger.info('Request completed', {
            method: request.method,
            url: request.url,
            statusCode: context.switchToHttp().getResponse().statusCode,
            duration: `${duration}ms`,
            userId: request.user?.user_id,
          });

          // Alert on slow requests
          if (duration > 1000) {
            logger.warn('Slow request detected', {
              method: request.method,
              url: request.url,
              duration: `${duration}ms`,
            });
          }
        },
        error: (error) => {
          const duration = Date.now() - start;

          logger.error('Request failed', {
            method: request.method,
            url: request.url,
            duration: `${duration}ms`,
            error: error.message,
          });
        },
      })
    );
  }
}
```
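
Because the interceptor logs slow requests with a fixed message, they can be pulled straight out of the container logs (assuming the backend logs to stdout under the container name used elsewhere in this document):

```bash
# Count slow requests logged in the last hour
docker logs lcbp3-backend --since=1h 2>&1 | grep -c "Slow request detected"
```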

---

## 🚨 Alerting Rules

### Critical Alerts (Immediate Action Required)

| Alert           | Condition                                   | Action                      |
| --------------- | ------------------------------------------- | --------------------------- |
| Service Down    | Health check fails for 3 consecutive checks | Page on-call engineer       |
| Database Down   | Cannot connect to database                  | Page DBA + on-call engineer |
| Disk Full       | Disk usage > 95%                            | Page operations team        |
| High Error Rate | Error rate > 10% for 5 min                  | Page on-call engineer       |

### Warning Alerts (Review Within 1 Hour)

| Alert         | Condition               | Action                 |
| ------------- | ----------------------- | ---------------------- |
| High CPU      | CPU > 90% for 10 min    | Notify operations team |
| High Memory   | Memory > 95% for 10 min | Notify operations team |
| Slow Queries  | > 50 slow queries/min   | Notify DBA             |
| Queue Backlog | BullMQ queue > 500 jobs | Notify backend team    |

### Info Alerts (Review During Business Hours)

| Alert              | Condition                            | Action                |
| ------------------ | ------------------------------------ | --------------------- |
| Backup Failed      | Daily backup job failed              | Email operations team |
| SSL Expiring       | SSL certificate expires in < 30 days | Email operations team |
| Disk Space Warning | Disk usage > 80%                     | Email operations team |

---

## 📧 Alert Notification Channels

### Email Alerts

```bash
#!/bin/bash
# File: /scripts/send-alert-email.sh

TO="ops-team@example.com"
SUBJECT="$1"
MESSAGE="$2"

echo "$MESSAGE" | mail -s "[LCBP3-DMS] $SUBJECT" "$TO"
```

### Slack Alerts

```bash
#!/bin/bash
# File: /scripts/send-alert-slack.sh

WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
MESSAGE="$1"

curl -X POST -H 'Content-type: application/json' \
  --data "{\"text\":\"🚨 LCBP3-DMS Alert: $MESSAGE\"}" \
  "$WEBHOOK_URL"
```
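
These two scripts can be wired into the container monitor above; for example (a sketch, assuming both scripts are installed under `/scripts` and made executable):

```bash
# Inside monitor-containers.sh, replace the "Send alert" comment with:
/scripts/send-alert-email.sh "Container unhealthy" "$CONTAINER is $HEALTH"
/scripts/send-alert-slack.sh "$CONTAINER is $HEALTH"
```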

---

## 📊 Monitoring Dashboard

### Metrics to Display

**System Overview:**

- Service status (up/down)
- Overall system health score
- Active user count
- Request rate (req/s)

**Performance:**

- API response time (P50, P95, P99)
- Database query time
- Queue processing time

**Resources:**

- CPU usage %
- Memory usage %
- Disk usage %
- Network I/O

**Business Metrics:**

- Documents created today
- Workflows completed today
- Active correspondences
- Pending approvals

---

## 🔧 Log Aggregation

### Centralized Logging with Docker

Configure the Docker logging driver in `/etc/docker/daemon.json`:

```json
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3",
    "labels": "service,environment"
  }
}
```
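
After editing `daemon.json`, restart the Docker daemon and confirm the driver took effect (the `systemctl` call assumes a systemd host; on the NAS appliances, restart Docker from their own service manager):

```bash
# Apply the new logging configuration
sudo systemctl restart docker

# Verify the active logging driver
docker info --format '{{.LoggingDriver}}'   # expect: json-file
```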

### View Aggregated Logs

```bash
# View all LCBP3 container logs
docker-compose logs -f --tail=100

# View specific service logs
docker logs lcbp3-backend -f --since=1h

# Search logs
docker logs lcbp3-backend 2>&1 | grep "ERROR"

# Export logs for analysis
docker logs lcbp3-backend > backend-logs.txt
```

---

## 📈 Performance Baseline

### Establish Baselines

Run load tests to establish performance baselines:

```bash
# Install Apache Bench
apt-get install apache2-utils

# Test API endpoint
ab -n 1000 -c 10 \
  -H "Authorization: Bearer <TOKEN>" \
  https://lcbp3-dms.example.com/api/correspondences

# Results to record:
# - Requests per second
# - Mean response time
# - P95 response time
# - Error rate
```

### Regular Performance Testing

- **Weekly:** Quick health check (100 requests)
- **Monthly:** Full load test (10,000 requests)
- **Quarterly:** Stress test (find breaking point)
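
The weekly quick check can be scripted; a minimal sketch (the endpoint, token handling, and output filtering are assumptions):

```bash
#!/bin/bash
# Weekly quick health check: 100 requests, concurrency 5
ab -n 100 -c 5 -H "Authorization: Bearer $TOKEN" \
  https://lcbp3-dms.example.com/api/correspondences \
  | grep -E "Requests per second|95%|Failed requests"
```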

---

## ✅ Monitoring Checklist

### Daily

- [ ] Check service health dashboard
- [ ] Review error logs
- [ ] Verify backup completion
- [ ] Check disk space

### Weekly

- [ ] Review performance metrics trends
- [ ] Analyze slow query log
- [ ] Check SSL certificate expiry
- [ ] Review security alerts

### Monthly

- [ ] Capacity planning review
- [ ] Update monitoring thresholds
- [ ] Test alert notifications
- [ ] Review and tune performance

---

## 🔗 Related Documents

- [Backup & Recovery](04-04-backup-recovery.md)
- [Incident Response](04-07-incident-response.md)
- [ADR-010: Logging Strategy](../05-decisions/ADR-010-logging-monitoring-strategy.md)

---

**Version:** 1.8.0
**Last Review:** 2025-12-01
**Next Review:** 2026-03-01

---

# Installing the Monitoring Stack on ASUSTOR

## **📝 Notes and Considerations**

> ⚠️ **Note:** The entire monitoring stack is deployed on the **ASUSTOR AS5403T**, not the QNAP,
> to keep the application workload separate from the infrastructure/monitoring workload.

The monitoring stack consists of:

| Service           | Port                         | Purpose                              | Host    |
| :---------------- | :--------------------------- | :----------------------------------- | :------ |
| **Prometheus**    | 9090                         | Metrics and time-series data storage | ASUSTOR |
| **Grafana**       | 3000                         | Dashboards for visualizing metrics   | ASUSTOR |
| **Node Exporter** | 9100                         | Host system metrics                  | Both    |
| **cAdvisor**      | 8080 (ASUSTOR) / 8088 (QNAP) | Docker container metrics             | Both    |
| **Uptime Kuma**   | 3001                         | Service availability monitoring      | ASUSTOR |
| **Loki**          | 3100                         | Log aggregation                      | ASUSTOR |
| **Promtail**      | -                            | Log shipper (sender)                 | ASUSTOR |

---

## 🏗️ Architecture Overview

```
┌─────────────────────────────────────────────────────────────────────────┐
│                     ASUSTOR AS5403T (Monitoring Hub)                     │
├─────────────────────────────────────────────────────────────────────────┤
│   ┌─────────────┐    ┌─────────────┐    ┌─────────────┐                 │
│   │ Prometheus  │───▶│   Grafana   │    │ Uptime Kuma │                 │
│   │   :9090     │    │   :3000     │    │   :3001     │                 │
│   └──────┬──────┘    └─────────────┘    └─────────────┘                 │
│          │                                                              │
│          │ Scrape Metrics                                               │
│          ▼                                                              │
│   ┌─────────────┐    ┌─────────────┐    ┌─────────────┐                 │
│   │node-exporter│    │  cAdvisor   │    │  Promtail   │                 │
│   │   :9100     │    │   :8080     │    │ (Log Ship)  │                 │
│   │  (Local)    │    │  (Local)    │    │  (Local)    │                 │
│   └─────────────┘    └─────────────┘    └─────────────┘                 │
└─────────────────────────────────────────────────────────────────────────┘
                               │ Remote Scrape
                               ▼
┌─────────────────────────────────────────────────────────────────────────┐
│                        QNAP TS-473A (App Server)                         │
├─────────────────────────────────────────────────────────────────────────┤
│   ┌─────────────┐    ┌─────────────┐    ┌─────────────┐                 │
│   │node-exporter│    │  cAdvisor   │    │  Backend    │                 │
│   │   :9100     │    │   :8080     │    │  /metrics   │                 │
│   └─────────────┘    └─────────────┘    └─────────────┘                 │
└─────────────────────────────────────────────────────────────────────────┘
```

---

## Set Permissions (on ASUSTOR)

```bash
# SSH into ASUSTOR
ssh admin@192.168.10.9

# Create directories
mkdir -p /volume1/np-dms/monitoring/prometheus/data
mkdir -p /volume1/np-dms/monitoring/prometheus/config
mkdir -p /volume1/np-dms/monitoring/grafana/data
mkdir -p /volume1/np-dms/monitoring/uptime-kuma/data
mkdir -p /volume1/np-dms/monitoring/loki/data
mkdir -p /volume1/np-dms/monitoring/promtail/config

# Set ownership to match the user IDs inside each container
# Prometheus (UID 65534 - nobody)
chown -R 65534:65534 /volume1/np-dms/monitoring/prometheus
chmod -R 750 /volume1/np-dms/monitoring/prometheus

# Grafana (UID 472)
chown -R 472:472 /volume1/np-dms/monitoring/grafana/data
chmod -R 750 /volume1/np-dms/monitoring/grafana/data

# Uptime Kuma (UID 1000)
chown -R 1000:1000 /volume1/np-dms/monitoring/uptime-kuma/data
chmod -R 750 /volume1/np-dms/monitoring/uptime-kuma/data

# Loki (UID 10001)
chown -R 10001:10001 /volume1/np-dms/monitoring/loki/data
chmod -R 750 /volume1/np-dms/monitoring/loki/data

# Promtail runs as root to read Docker logs, so no specific chown is needed
# for the config dir if it was created by admin; just ensure it is readable
chmod -R 755 /volume1/np-dms/monitoring/promtail/config
```

---

## 🔗 Create the Docker Network (one-time setup)

> ⚠️ **The network must be created before deploying any of the docker-compose stacks**, because every service uses `lcbp3` as an external network.

### Create via Portainer (recommended)

1. Open **Portainer** → select the ASUSTOR environment
2. Go to **Networks** → **Add network**
3. Fill in:
   - **Name:** `lcbp3`
   - **Driver:** `bridge`
4. Click **Create the network**

### Create via SSH

```bash
# SSH into ASUSTOR
ssh admin@192.168.10.9

# Create the external network
docker network create lcbp3

# Verify
docker network ls | grep lcbp3
docker network inspect lcbp3
```

> 📖 The **QNAP** must also have a network named `lcbp3` (create it via Container Station or SSH).
> See [README.md – Quick Reference](README.md#-quick-reference) for the QNAP commands.

---

## Note: NPM Proxy Configuration (NPM runs on QNAP → forwards to ASUSTOR)

> ⚠️ Because NPM runs on the **QNAP** while the monitoring services run on the **ASUSTOR**,
> use the **IP address** (`192.168.10.9`) instead of container names (names cannot be resolved across hosts).

| Domain Names           | Scheme | Forward Hostname | Forward Port | Block Common Exploits | Websockets | Force SSL | HTTP/2 |
| :--------------------- | :----- | :--------------- | :----------- | :-------------------- | :--------- | :-------- | :----- |
| grafana.np-dms.work    | `http` | `192.168.10.9`   | 3000         | [x]                   | [x]        | [x]       | [x]    |
| prometheus.np-dms.work | `http` | `192.168.10.9`   | 9090         | [x]                   | [ ]        | [x]       | [x]    |
| uptime.np-dms.work     | `http` | `192.168.10.9`   | 3001         | [x]                   | [x]        | [x]       | [x]    |

---

## Docker Compose File (ASUSTOR)

```yaml
# File: /volume1/np-dms/monitoring/docker-compose.yml
# DMS Container v1.8.0: Application name: lcbp3-monitoring
# Deploy on: ASUSTOR AS5403T
# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail

x-restart: &restart_policy
  restart: unless-stopped

x-logging: &default_logging
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "5"

networks:
  lcbp3:
    external: true

services:
  # ----------------------------------------------------------------
  # 1. Prometheus (Metrics Collection & Storage)
  # ----------------------------------------------------------------
  prometheus:
    <<: [*restart_policy, *default_logging]
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    stdin_open: true
    tty: true
    deploy:
      resources:
        limits:
          cpus: "1.0"
          memory: 1G
        reservations:
          cpus: "0.25"
          memory: 256M
    environment:
      TZ: "Asia/Bangkok"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    networks:
      - lcbp3
    volumes:
      - "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
      - "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 2. Grafana (Dashboard & Visualization)
  # ----------------------------------------------------------------
  grafana:
    <<: [*restart_policy, *default_logging]
    image: grafana/grafana:10.2.2
    container_name: grafana
    stdin_open: true
    tty: true
    deploy:
      resources:
        limits:
          cpus: "1.0"
          memory: 512M
        reservations:
          cpus: "0.25"
          memory: 128M
    environment:
      TZ: "Asia/Bangkok"
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: "Center#2025"
      GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
    ports:
      - "3000:3000"
    networks:
      - lcbp3
    volumes:
      - "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 3. Uptime Kuma (Service Availability Monitoring)
  # ----------------------------------------------------------------
  uptime-kuma:
    <<: [*restart_policy, *default_logging]
    image: louislam/uptime-kuma:1
    container_name: uptime-kuma
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 256M
    environment:
      TZ: "Asia/Bangkok"
    ports:
      - "3001:3001"
    networks:
      - lcbp3
    volumes:
      - "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 4. Node Exporter (Host Metrics - ASUSTOR)
  # ----------------------------------------------------------------
  node-exporter:
    <<: [*restart_policy, *default_logging]
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 128M
    environment:
      TZ: "Asia/Bangkok"
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    networks:
      - lcbp3
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 5. cAdvisor (Container Metrics - ASUSTOR)
  # ----------------------------------------------------------------
  cadvisor:
    <<: [*restart_policy, *default_logging]
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    privileged: true
    devices:
      - /dev/kmsg
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 256M
    environment:
      TZ: "Asia/Bangkok"
    ports:
      # cAdvisor listens on 8080 on the ASUSTOR (matches the stack table,
      # the healthcheck below, and the Prometheus target cadvisor:8080)
      - "8080:8080"
    networks:
      - lcbp3
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 6. Loki (Log Aggregation)
  # ----------------------------------------------------------------
  loki:
    <<: [*restart_policy, *default_logging]
    image: grafana/loki:2.9.0
    container_name: loki
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 512M
    environment:
      TZ: "Asia/Bangkok"
    command: -config.file=/etc/loki/local-config.yaml
    ports:
      - "3100:3100"
    networks:
      - lcbp3
    volumes:
      - "/volume1/np-dms/monitoring/loki/data:/loki"
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ----------------------------------------------------------------
  # 7. Promtail (Log Shipper)
  # ----------------------------------------------------------------
  promtail:
    <<: [*restart_policy, *default_logging]
    image: grafana/promtail:2.9.0
    container_name: promtail
    user: "0:0"
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 256M
    environment:
      TZ: "Asia/Bangkok"
    command: -config.file=/etc/promtail/promtail-config.yml
    networks:
      - lcbp3
    volumes:
      - "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
      - "/var/run/docker.sock:/var/run/docker.sock:ro"
      - "/var/lib/docker/containers:/var/lib/docker/containers:ro"
    depends_on:
      - loki
```

---

## QNAP Node Exporter & cAdvisor

Install node-exporter and cAdvisor on the QNAP so the Prometheus instance on the ASUSTOR can scrape their metrics:

```yaml
# File: /share/np-dms/monitoring/docker-compose.yml (QNAP)
# Exporters only - metrics are scraped by Prometheus on the ASUSTOR

version: '3.8'

networks:
  lcbp3:
    external: true

services:
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    networks:
      - lcbp3
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    ports:
      - "8088:8080"
    networks:
      - lcbp3
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /sys/fs/cgroup:/sys/fs/cgroup:ro

  mysqld-exporter:
    image: prom/mysqld-exporter:v0.15.0
    container_name: mysqld-exporter
    restart: unless-stopped
    user: root
    command:
      - '--config.my-cnf=/etc/mysql/my.cnf'
    ports:
      - "9104:9104"
    networks:
      - lcbp3
    volumes:
      - "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
```

---

## Prometheus Configuration

Create `/volume1/np-dms/monitoring/prometheus/config/prometheus.yml` on the ASUSTOR:

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Prometheus self-monitoring (ASUSTOR)
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # ============================================
  # ASUSTOR Metrics (Local)
  # ============================================

  # Host metrics from Node Exporter (ASUSTOR)
  - job_name: 'asustor-node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          host: 'asustor'

  # Container metrics from cAdvisor (ASUSTOR)
  - job_name: 'asustor-cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          host: 'asustor'

  # ============================================
  # QNAP Metrics (Remote - 192.168.10.8)
  # ============================================

  # Host metrics from Node Exporter (QNAP)
  - job_name: 'qnap-node'
    static_configs:
      - targets: ['192.168.10.8:9100']
        labels:
          host: 'qnap'

  # Container metrics from cAdvisor (QNAP)
  - job_name: 'qnap-cadvisor'
    static_configs:
      - targets: ['192.168.10.8:8088']
        labels:
          host: 'qnap'

  # Backend NestJS application (QNAP)
  - job_name: 'backend'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['192.168.10.8:3000']
        labels:
          host: 'qnap'

  # MariaDB Exporter (QNAP)
  - job_name: 'mariadb'
    static_configs:
      - targets: ['192.168.10.8:9104']
        labels:
          host: 'qnap'
```
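
The config can be validated and hot-reloaded without restarting the container (`promtool` ships inside the Prometheus image, and `--web.enable-lifecycle` is already set in the compose file above):

```bash
# Validate the configuration inside the running container
docker exec prometheus promtool check config /etc/prometheus/prometheus.yml

# Hot-reload Prometheus after config changes
curl -X POST http://192.168.10.9:9090/-/reload
```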

---

## Uptime Kuma Monitors

Once Uptime Kuma is up and running, add the following monitors:

| Monitor Name  | Type | URL / Host                         | Interval |
| :------------ | :--- | :--------------------------------- | :------- |
| QNAP NPM      | HTTP | https://npm.np-dms.work            | 60s      |
| Frontend      | HTTP | https://lcbp3.np-dms.work          | 60s      |
| Backend API   | HTTP | https://backend.np-dms.work/health | 60s      |
| MariaDB       | TCP  | 192.168.10.8:3306                  | 60s      |
| Redis         | TCP  | 192.168.10.8:6379                  | 60s      |
| Elasticsearch | HTTP | http://192.168.10.8:9200           | 60s      |
| Gitea         | HTTP | https://git.np-dms.work            | 60s      |
| n8n           | HTTP | https://n8n.np-dms.work            | 60s      |
| Grafana       | HTTP | https://grafana.np-dms.work        | 60s      |
| QNAP Host     | Ping | 192.168.10.8                       | 60s      |
| ASUSTOR Host  | Ping | 192.168.10.9                       | 60s      |

---

## Grafana Dashboards

### Recommended Dashboards to Import

| Dashboard ID | Name                         | Purpose                        |
| :----------- | :--------------------------- | :----------------------------- |
| 1860         | Node Exporter Full           | Host system metrics            |
| 14282        | cAdvisor exporter            | Container metrics              |
| 11074        | Node Exporter for Prometheus | Node overview                  |
| 893          | Docker and Container         | Docker overview                |
| 7362         | MySQL                        | MySQL overview                 |
| 1214         | Redis                        | Redis overview                 |
| 14204        | Elasticsearch                | Elasticsearch overview         |
| 13106        | MySQL/MariaDB Overview       | Detailed MySQL/MariaDB metrics |

### Import Dashboard via Grafana UI

1. Go to **Dashboards → Import**
2. Enter Dashboard ID (e.g., `1860`)
3. Select Prometheus data source
4. Click **Import**

---

## 🚀 Deploy lcbp3-monitoring on ASUSTOR

### 📋 Prerequisites Checklist

| #   | Step                                                                                                          | Status |
| :-- | :------------------------------------------------------------------------------------------------------------ | :----- |
| 1   | SSH access to the ASUSTOR works (`ssh admin@192.168.10.9`)                                                      | ✅     |
| 2   | Docker network `lcbp3` created (see [Create the Docker Network](#-create-the-docker-network-one-time-setup))    | ✅     |
| 3   | Directories created and permissions set (see [Set Permissions](#set-permissions-on-asustor))                    | ✅     |
| 4   | `prometheus.yml` created (see [Prometheus Configuration](#prometheus-configuration))                            | ✅     |
| 5   | `promtail-config.yml` created (see [Step 1.2](#step-12-create-promtail-configyml))                              | ✅     |

---

### Step 1: Create prometheus.yml

```bash
# SSH into ASUSTOR
ssh admin@192.168.10.9

# Create prometheus.yml
cat > /volume1/np-dms/monitoring/prometheus/config/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'asustor-node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          host: 'asustor'

  - job_name: 'asustor-cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          host: 'asustor'

  - job_name: 'qnap-node'
    static_configs:
      - targets: ['192.168.10.8:9100']
        labels:
          host: 'qnap'

  - job_name: 'qnap-cadvisor'
    static_configs:
      - targets: ['192.168.10.8:8088']
        labels:
          host: 'qnap'

  - job_name: 'backend'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['192.168.10.8:3000']
        labels:
          host: 'qnap'
EOF

# Verify
cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
```

### Step 1.2: Create promtail-config.yml

Create a config so Promtail reads logs from the Docker containers and ships them to Loki:

```bash
# Create promtail-config.yml
cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_log_stream']
        target_label: 'stream'
EOF

# Verify
cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
```
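
Once the stack from this guide is deployed, log ingestion can be spot-checked against Loki's HTTP query API (a sketch; the `grafana` container label and the `jq` dependency are example assumptions):

```bash
# Ask Loki for recent log streams from the grafana container
curl -sG http://192.168.10.9:3100/loki/api/v1/query \
  --data-urlencode 'query={container="grafana"}' | jq '.data.result | length'
```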

---

## Preparing the QNAP (before deploying the stack)

### 1. Create a monitoring user in MariaDB

Run this SQL via **phpMyAdmin** or `docker exec`:

```sql
CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
FLUSH PRIVILEGES;
```

### 2. Create the .my.cnf config file on the QNAP

So that `mysqld-exporter` reads a password containing special characters correctly:

1. **SSH into the QNAP** (or create the folder via File Station):
   ```bash
   ssh admin@192.168.10.8
   ```
2. **Create the config directory**:
   ```bash
   mkdir -p /share/np-dms/monitoring/mysqld-exporter
   ```
3. **Create the .my.cnf file**:
   ```bash
   cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
   [client]
   user=exporter
   password=Center2025
   host=mariadb
   EOF
   ```
4. **Set file permissions** (so the container can read the file):
   ```bash
   chmod 644 /share/np-dms/monitoring/mysqld-exporter/.my.cnf
   ```
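
Once the exporter container is up (see the QNAP compose file above), its connection to MariaDB can be verified from either host:

```bash
# mysql_up should be 1 when the exporter can reach MariaDB
curl -s http://192.168.10.8:9104/metrics | grep '^mysql_up'
```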

---

### Step 2: Deploy via Portainer (recommended)

1. Open **Portainer** → select the **ASUSTOR** environment
2. Go to **Stacks** → **Add stack**
3. Fill in:
   - **Name:** `lcbp3-monitoring`
   - **Build method:** select **Web editor**
4. Paste the contents of the [Docker Compose File (ASUSTOR)](#docker-compose-file-asustor) above
5. Click **Deploy the stack**

> ⚠️ **Important:** Check the Grafana password (`GF_SECURITY_ADMIN_PASSWORD`) in the docker-compose before deploying.

### Deploy via SSH (fallback)

```bash
# SSH into ASUSTOR
ssh admin@192.168.10.9

# Copy docker-compose.yml into place
# (put the file at /volume1/np-dms/monitoring/docker-compose.yml)

# Deploy
cd /volume1/np-dms/monitoring
docker compose up -d

# Check container status
docker compose ps
```

---

### Step 3: Verify Services

```bash
# Check all containers
docker ps --filter "name=prometheus" --filter "name=grafana" \
  --filter "name=uptime-kuma" --filter "name=node-exporter" \
  --filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
```

| Service           | How to Check                                                        | Expected Result                |
| :---------------- | :------------------------------------------------------------------ | :----------------------------- |
| ✅ **Prometheus**  | `curl http://192.168.10.9:9090/-/healthy`                            | `Prometheus Server is Healthy` |
| ✅ **Grafana**     | Open `https://grafana.np-dms.work` (or `http://192.168.10.9:3000`)   | Login page                     |
| ✅ **Uptime Kuma** | Open `https://uptime.np-dms.work` (or `http://192.168.10.9:3001`)    | Setup page                     |
| ✅ **Node Exp.**   | `curl http://192.168.10.9:9100/metrics \| head`                      | Metrics output                 |
| ✅ **cAdvisor**    | `curl http://192.168.10.9:8080/healthz`                              | `ok`                           |
| ✅ **Loki**        | `curl http://192.168.10.9:3100/ready`                                | `ready`                        |
| ✅ **Promtail**    | Check logs: `docker logs promtail`                                   | No errors + connection success |
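
A small loop can run all of the `curl`-based checks from the table in one go (a sketch; it only tests reachability, not response bodies):

```bash
# Quick verification of all monitoring endpoints on the ASUSTOR
for URL in \
  "http://192.168.10.9:9090/-/healthy" \
  "http://192.168.10.9:9100/metrics" \
  "http://192.168.10.9:8080/healthz" \
  "http://192.168.10.9:3100/ready"; do
  printf '%-45s ' "$URL"
  curl -fsS -o /dev/null "$URL" && echo OK || echo FAIL
done
```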

---

### Step 4: Deploy QNAP Exporters

Install node-exporter and cAdvisor on the QNAP so Prometheus can scrape across hosts:

#### Via Container Station (QNAP)

1. Open **Container Station** in the QNAP Web UI
2. Go to **Applications** → **Create**
3. Name the application: `lcbp3-exporters`
4. Paste the contents of [QNAP Node Exporter & cAdvisor](#qnap-node-exporter--cadvisor)
5. Click **Create**

#### Verify from the ASUSTOR

```bash
# Check that Prometheus can scrape the QNAP
curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'

# Or open the Prometheus UI → Targets
# URL: http://192.168.10.9:9090/targets
# Confirm that qnap-node and qnap-cadvisor show State: UP
```

---

### Step 5: Configure Grafana & Uptime Kuma

#### Grafana — First Login

1. Open `https://grafana.np-dms.work`
2. Log in: `admin` / `Center#2025` (or the password you set)
3. Go to **Connections** → **Data sources** → **Add data source**
4. Select **Prometheus**
   - URL: `http://prometheus:9090`
   - Click **Save & Test** → it must show ✅
5. Import dashboards (see [6. Grafana Dashboards Setup](#6-grafana-dashboards-setup) for details)

#### Uptime Kuma — First Setup

1. Open `https://uptime.np-dms.work`
2. Create the admin account
3. Add monitors per the [Uptime Kuma Monitors](#uptime-kuma-monitors) table
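
The Prometheus data source can also be checked from the shell via Grafana's HTTP API (a sketch; credentials as configured above, and `jq` is an assumed dependency):

```bash
# List configured data source names via the Grafana API
curl -s -u admin:'Center#2025' http://192.168.10.9:3000/api/datasources | jq '.[].name'
```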

---

### 6. Grafana Dashboards Setup

For complete monitoring coverage, import the following dashboards:

#### 6.1 Host Monitoring (Node Exporter)

* **Concept:** View host-level resources (CPU, RAM, Disk, Network)
* **Dashboard ID:** `1860` (Node Exporter Full)
* **How to import:**
  1. Go to **Dashboards** → **New** → **Import**
  2. In the **Import via grafana.com** field, enter `1860` and click **Load**
  3. Select data source: **Prometheus**
  4. Click **Import**

#### 6.2 Container Monitoring (cAdvisor)

* **Concept:** View per-container resources (with logs linked in)
* **Dashboard ID:** `14282` (Cadvisor exporter)
* **How to import:**
  1. Enter `14282` and click **Load**
  2. Select data source: **Prometheus**
  3. Click **Import**

#### 6.3 Logs Monitoring (Loki Integration)

To make the container dashboard show logs from Loki as well:

1. Open the **Cadvisor exporter** dashboard you just imported
2. Click **Add visualization** (or edit the dashboard)
3. Select data source: **Loki**
4. In the query field, enter: `{container="$name"}`
   * *(Note: `$name` comes from a variable defined in dashboard 14282)*
5. Change the visualization type to **Logs**
6. Name the panel **"Container Logs"**
7. Click **Apply** and **Save Dashboard**

You will now see both the resource-usage graphs and the logs of each container on the same page.

#### 6.4 Integrated Dashboard (Recommended)

I have prepared a JSON file that combines metrics and logs:

1. Go to **Dashboards** → **New** → **Import**
2. Drag the file in, or copy the contents of:
   `specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json`
3. Click **Load** and **Import**

## 7.3 Backup / Export Dashboards

Once a dashboard has been tuned to your liking, export it as a JSON file for backup or version control:

1. Open the dashboard you want to back up
2. Click the **Share Dashboard** button (the 🔗 / Share icon at the top left)
3. Select the **Export** tab
4. Enable **Export for sharing externally** (so hardcoded values are removed)
5. Click **Save to file**
6. Store the JSON file at: `specs/08-infrastructure/grafana/dashboards/`

---

> 📝 **Note:** This document follows Architecture Document **v1.8.0** - the monitoring stack is deployed on the ASUSTOR AS5403T.

---

## 📈 Document Numbering Specific Monitoring

## 3. Monitoring & Metrics

### 3.1. Prometheus Metrics

#### Key Metrics to Collect

```typescript
// metrics.service.ts
import { Counter, Histogram, Gauge } from 'prom-client';

// Lock acquisition metrics
export const lockAcquisitionDuration = new Histogram({
  name: 'docnum_lock_acquisition_duration_ms',
  help: 'Lock acquisition time in milliseconds',
  labelNames: ['project', 'type'],
  buckets: [10, 50, 100, 200, 500, 1000, 2000, 5000],
});

export const lockAcquisitionFailures = new Counter({
  name: 'docnum_lock_acquisition_failures_total',
  help: 'Total number of lock acquisition failures',
  labelNames: ['project', 'type', 'reason'],
});

// Generation metrics
export const generationDuration = new Histogram({
  name: 'docnum_generation_duration_ms',
  help: 'Total document number generation time',
  labelNames: ['project', 'type', 'status'],
  buckets: [100, 200, 500, 1000, 2000, 5000],
});

export const retryCount = new Histogram({
  name: 'docnum_retry_count',
  help: 'Number of retries per generation',
  labelNames: ['project', 'type'],
  buckets: [0, 1, 2, 3, 5, 10],
});

// Connection health
export const redisConnectionStatus = new Gauge({
  name: 'docnum_redis_connection_status',
  help: 'Redis connection status (1=up, 0=down)',
});

export const dbConnectionPoolUsage = new Gauge({
  name: 'docnum_db_connection_pool_usage',
  help: 'Database connection pool usage percentage',
});
```
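
Once these metrics are registered and exposed, they appear on the backend's `/metrics` endpoint (the same one Prometheus scrapes) and can be spot-checked with:

```bash
# List the document-numbering metrics exposed by the backend
curl -s http://192.168.10.8:3000/metrics | grep '^docnum_'
```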

### 3.2. Prometheus Alert Rules

```yaml
# prometheus/alerts.yml
groups:
  - name: document_numbering_alerts
    interval: 30s
    rules:
      # CRITICAL: Redis unavailable
      - alert: RedisUnavailable
        expr: docnum_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          component: document-numbering
        annotations:
          summary: "Redis is unavailable for document numbering"
          description: "System is falling back to DB-only locking. Performance degraded by 30-50%."
          runbook_url: "https://wiki.lcbp3/runbooks/redis-unavailable"

      # CRITICAL: High lock failure rate
      - alert: HighLockFailureRate
        expr: |
          rate(docnum_lock_acquisition_failures_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          component: document-numbering
        annotations:
          summary: "Lock acquisition failure rate > 10%"
          description: "Check Redis and database performance immediately"
          runbook_url: "https://wiki.lcbp3/runbooks/high-lock-failure"

      # WARNING: Elevated lock failure rate
      - alert: ElevatedLockFailureRate
        expr: |
          rate(docnum_lock_acquisition_failures_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "Lock acquisition failure rate > 5%"
          description: "Monitor closely. May escalate to critical soon."

      # WARNING: Slow lock acquisition
      - alert: SlowLockAcquisition
        expr: |
          histogram_quantile(0.95,
            rate(docnum_lock_acquisition_duration_ms_bucket[5m])
          ) > 1000
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "P95 lock acquisition time > 1 second"
          description: "Lock acquisition is slower than expected. Check Redis latency."

      # WARNING: High retry count
      - alert: HighRetryCount
        expr: |
          sum by (project) (
            rate(docnum_retry_count_sum[1h])
          ) > 100
        for: 1h
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "Retry count > 100 per hour in project {{ $labels.project }}"
          description: "High contention detected. Consider scaling."

      # WARNING: Slow generation
      - alert: SlowDocumentNumberGeneration
        expr: |
          histogram_quantile(0.95,
            rate(docnum_generation_duration_ms_bucket[5m])
          ) > 2000
        for: 5m
        labels:
          severity: warning
          component: document-numbering
        annotations:
          summary: "P95 generation time > 2 seconds"
          description: "Document number generation is slower than SLA target"
```
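
Rule files should be validated before reloading Prometheus (assuming `alerts.yml` is placed in the mounted config directory, so it is visible inside the container):

```bash
# Validate the alert rules from inside the Prometheus container
docker exec prometheus promtool check rules /etc/prometheus/alerts.yml
```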

### 3.3. AlertManager Configuration

```yaml
# alertmanager/config.yml
global:
  resolve_timeout: 5m
  slack_api_url: ${SLACK_WEBHOOK_URL}

route:
  group_by: ['alertname', 'severity', 'project']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ops-team'

  routes:
    # CRITICAL alerts → PagerDuty + Slack
    - match:
        severity: critical
      receiver: 'pagerduty-critical'
      continue: true

    - match:
        severity: critical
      receiver: 'slack-critical'
      continue: false

    # WARNING alerts → Slack only
    - match:
        severity: warning
      receiver: 'slack-warnings'

receivers:
  - name: 'pagerduty-critical'
    pagerduty_configs:
      - service_key: ${PAGERDUTY_SERVICE_KEY}
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          resolved: '{{ .Alerts.Resolved | len }}'
          runbook: '{{ .CommonAnnotations.runbook_url }}'

  - name: 'slack-critical'
    slack_configs:
      - channel: '#lcbp3-critical-alerts'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Runbook:* {{ .CommonAnnotations.runbook_url }}
        color: 'danger'

  - name: 'slack-warnings'
    slack_configs:
      - channel: '#lcbp3-alerts'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
        color: 'warning'

  - name: 'ops-team'
    email_configs:
      - to: 'ops@example.com'
        subject: '[LCBP3] {{ .GroupLabels.alertname }}'
```
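
Routing can be exercised end to end by firing a synthetic alert at Alertmanager's v2 API (a sketch; Alertmanager is not part of the compose stack above, so the host and default port `9093` are assumptions):

```bash
# Fire a synthetic alert to test routing to the Slack warning channel
curl -X POST http://192.168.10.9:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{
    "labels": {"alertname": "TestAlert", "severity": "warning", "component": "document-numbering"},
    "annotations": {"summary": "Routing test", "description": "Safe to ignore"}
  }]'
```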

### 3.4. Grafana Dashboard

Key dashboard panels:

1. **Lock Acquisition Success Rate** (Gauge)
   - Query: `1 - (rate(docnum_lock_acquisition_failures_total[5m]) / rate(docnum_lock_acquisition_total[5m]))`
   - Alert threshold: < 95%

2. **Lock Acquisition Time Percentiles** (Graph)
   - P50: `histogram_quantile(0.50, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`
   - P95: `histogram_quantile(0.95, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`
   - P99: `histogram_quantile(0.99, rate(docnum_lock_acquisition_duration_ms_bucket[5m]))`

3. **Generation Rate** (Stat)
   - Query: `sum(rate(docnum_generation_duration_ms_count[1m])) * 60`
   - Unit: documents/minute

4. **Error Rate by Type** (Graph)
   - Query: `sum by (reason) (rate(docnum_lock_acquisition_failures_total[5m]))`

5. **Redis Connection Status** (Stat)
   - Query: `docnum_redis_connection_status`
   - Thresholds: 0 = red, 1 = green

6. **DB Connection Pool Usage** (Gauge)
   - Query: `docnum_db_connection_pool_usage`
   - Alert threshold: > 80%