# Infrastructure Setup
|
||||
|
||||
> 📍 **Document Version:** v1.8.0
|
||||
> 🖥️ **Primary Server:** QNAP TS-473A (Application & Database)
|
||||
> 💾 **Backup Server:** ASUSTOR AS5403T (Infrastructure & Backup)
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
> 📖 **See Server Roles and Service Distribution details in:** [README.md](README.md#-hardware-infrastructure)
>
> This document focuses on the technical configuration of each service.
|
||||
|
||||
---
|
||||
|
||||
## 1. Redis Configuration (Standalone + Persistence)
|
||||
|
||||
### 1.1 Docker Compose Setup
|
||||
```yaml
|
||||
# docker-compose-redis.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
redis:
|
||||
image: 'redis:7.2-alpine'
|
||||
container_name: lcbp3-redis
|
||||
restart: unless-stopped
|
||||
# AOF: Enabled for durability
|
||||
# Maxmemory: Prevent OOM
|
||||
command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD} --maxmemory 1gb --maxmemory-policy noeviction
|
||||
volumes:
|
||||
- ./redis/data:/data
|
||||
ports:
|
||||
- '6379:6379'
|
||||
networks:
|
||||
- lcbp3
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '2.0'
|
||||
memory: 1.5G
|
||||
networks:
|
||||
lcbp3:
|
||||
external: true
|
||||
```
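
The backend reaches this instance over the `lcbp3` network using the same `REDIS_PASSWORD`. A minimal sketch of an ioredis client provider wired from these environment variables is shown below; the provider name and retry settings are illustrative assumptions, not project code.

```typescript
// redis/redis.provider.ts (illustrative sketch only; provider name and options are assumptions)
import Redis from 'ioredis';

export const redisProvider = {
  provide: Redis,
  useFactory: (): Redis =>
    new Redis({
      host: process.env.REDIS_HOST ?? 'cache',
      port: Number(process.env.REDIS_PORT ?? 6379),
      password: process.env.REDIS_PASSWORD, // same secret passed to redis-server --requirepass
      maxRetriesPerRequest: 2,              // fail fast so callers can fall back instead of hanging
      enableReadyCheck: true,
    }),
};
```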
|
||||
|
||||
|
||||
## 2. Database Configuration
|
||||
|
||||
### 2.1 MariaDB Optimization for Numbering
|
||||
```ini
# /etc/mysql/mariadb.conf.d/50-numbering.cnf
|
||||
|
||||
[mysqld]
|
||||
# Connection pool
|
||||
max_connections = 200
|
||||
thread_cache_size = 50
|
||||
|
||||
# Query cache (disabled for InnoDB)
|
||||
query_cache_type = 0
|
||||
query_cache_size = 0
|
||||
|
||||
# InnoDB settings
|
||||
innodb_buffer_pool_size = 4G
|
||||
innodb_log_file_size = 512M
|
||||
innodb_flush_log_at_trx_commit = 1
|
||||
innodb_lock_wait_timeout = 50
|
||||
|
||||
# Performance Schema
|
||||
performance_schema = ON
|
||||
performance_schema_instrument = 'wait/lock/%=ON'
|
||||
|
||||
# Binary logging
|
||||
log_bin = /var/log/mysql/mysql-bin.log
|
||||
expire_logs_days = 7
|
||||
max_binlog_size = 100M
|
||||
|
||||
# Slow query log
|
||||
slow_query_log = 1
|
||||
slow_query_log_file = /var/log/mysql/slow-query.log
|
||||
long_query_time = 1
|
||||
```
|
||||
|
||||
### 2.2 Monitoring Locks
|
||||
```sql
|
||||
-- Check for lock contention
|
||||
SELECT
|
||||
r.trx_id waiting_trx_id,
|
||||
r.trx_mysql_thread_id waiting_thread,
|
||||
r.trx_query waiting_query,
|
||||
b.trx_id blocking_trx_id,
|
||||
b.trx_mysql_thread_id blocking_thread,
|
||||
b.trx_query blocking_query
|
||||
FROM information_schema.innodb_lock_waits w
|
||||
INNER JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id
|
||||
INNER JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id;
|
||||
|
||||
-- Check active transactions
|
||||
SELECT * FROM information_schema.innodb_trx;
|
||||
|
||||
-- Kill long-running transaction (if needed)
|
||||
KILL <thread_id>;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Backend Service Configuration
|
||||
|
||||
### 3.1 Backend Service Deployment
|
||||
|
||||
#### Docker Compose
|
||||
```yaml
|
||||
# docker-compose-backend.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
backend-1:
|
||||
image: lcbp3-backend:latest
|
||||
container_name: lcbp3-backend-1
|
||||
environment:
|
||||
- NODE_ENV=production
|
||||
- DB_HOST=mariadb
|
||||
- REDIS_HOST=cache
|
||||
- REDIS_PORT=6379
|
||||
- NUMBERING_LOCK_TIMEOUT=5000
|
||||
- NUMBERING_RESERVATION_TTL=300
|
||||
ports:
|
||||
- "3001:3000"
|
||||
depends_on:
|
||||
- mariadb
|
||||
- cache
|
||||
networks:
|
||||
- lcbp3
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
backend-2:
|
||||
image: lcbp3-backend:latest
|
||||
container_name: lcbp3-backend-2
|
||||
environment:
|
||||
- NODE_ENV=production
|
||||
- DB_HOST=mariadb
|
||||
- REDIS_HOST=cache
|
||||
- REDIS_PORT=6379
|
||||
ports:
|
||||
- "3002:3000"
|
||||
depends_on:
|
||||
- mariadb
|
||||
- cache
|
||||
networks:
|
||||
- lcbp3
|
||||
restart: unless-stopped
|
||||
|
||||
networks:
|
||||
lcbp3:
|
||||
external: true
|
||||
```
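
`NUMBERING_LOCK_TIMEOUT` and `NUMBERING_RESERVATION_TTL` are read by the backend's numbering code. A minimal sketch of a lock/reservation helper built on ioredis is shown below; the class name, key layout, and defaults are assumptions for illustration, not the repository's actual implementation.

```typescript
// numbering/numbering-lock.service.ts (sketch; names and key layout are assumptions)
import { Injectable } from '@nestjs/common';
import Redis from 'ioredis';

@Injectable()
export class NumberingLockService {
  // Defaults mirror the values set in docker-compose-backend.yml
  private readonly lockTimeoutMs = Number(process.env.NUMBERING_LOCK_TIMEOUT ?? 5000);
  private readonly reservationTtlSec = Number(process.env.NUMBERING_RESERVATION_TTL ?? 300);

  constructor(private readonly redis: Redis) {}

  /** Try to take a short-lived lock for one document type; returns true on success. */
  async acquire(documentType: string, owner: string): Promise<boolean> {
    const result = await this.redis.set(
      `lock:numbering:${documentType}`,
      owner,
      'PX', this.lockTimeoutMs, // lock auto-expires so a crashed worker cannot block others
      'NX',                     // only set when the lock is free
    );
    return result === 'OK';
  }

  /** Record a reserved number so an unconfirmed reservation expires on its own. */
  async reserve(documentType: string, reservedNumber: string): Promise<void> {
    await this.redis.set(
      `reservation:${documentType}:${reservedNumber}`,
      'pending',
      'EX', this.reservationTtlSec,
    );
  }
}
```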
|
||||
|
||||
#### Health Check Endpoint
|
||||
```typescript
|
||||
// health/numbering.health.ts
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { HealthIndicator, HealthIndicatorResult } from '@nestjs/terminus';
|
||||
import { Redis } from 'ioredis';
|
||||
import { DataSource } from 'typeorm';
|
||||
|
||||
@Injectable()
|
||||
export class NumberingHealthIndicator extends HealthIndicator {
|
||||
constructor(
|
||||
private redis: Redis,
|
||||
private dataSource: DataSource,
|
||||
) {
|
||||
super();
|
||||
}
|
||||
|
||||
async isHealthy(key: string): Promise<HealthIndicatorResult> {
|
||||
const checks = await Promise.all([
|
||||
this.checkRedis(),
|
||||
this.checkDatabase(),
|
||||
this.checkSequenceIntegrity(),
|
||||
]);
|
||||
|
||||
const isHealthy = checks.every((check) => check.status === 'up');
|
||||
|
||||
return this.getStatus(key, isHealthy, { checks });
|
||||
}
|
||||
|
||||
private async checkRedis(): Promise<any> {
|
||||
try {
|
||||
await this.redis.ping();
|
||||
return { name: 'redis', status: 'up' };
|
||||
} catch (error) {
|
||||
return { name: 'redis', status: 'down', error: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
private async checkDatabase(): Promise<any> {
|
||||
try {
|
||||
await this.dataSource.query('SELECT 1');
|
||||
return { name: 'database', status: 'up' };
|
||||
} catch (error) {
|
||||
return { name: 'database', status: 'down', error: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
private async checkSequenceIntegrity(): Promise<any> {
|
||||
try {
|
||||
const result = await this.dataSource.query(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM document_numbering_sequences
|
||||
WHERE current_value > (
|
||||
SELECT max_value FROM document_numbering_configs
|
||||
WHERE id = config_id
|
||||
)
|
||||
`);
|
||||
|
||||
const hasIssue = result[0].count > 0;
|
||||
|
||||
return {
|
||||
name: 'sequence_integrity',
|
||||
status: hasIssue ? 'degraded' : 'up',
|
||||
exceeded_sequences: result[0].count,
|
||||
};
|
||||
} catch (error) {
|
||||
return { name: 'sequence_integrity', status: 'down', error: error.message };
|
||||
}
|
||||
}
|
||||
}
|
||||
```
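
To serve this indicator on the `/health/numbering` endpoint that the runbooks curl, it can be registered through a Terminus controller. A sketch, assuming `TerminusModule` and the providers above are registered in the module:

```typescript
// health/health.controller.ts (sketch; module wiring is assumed, not taken from the repository)
import { Controller, Get } from '@nestjs/common';
import { HealthCheck, HealthCheckService } from '@nestjs/terminus';
import { NumberingHealthIndicator } from './numbering.health';

@Controller('health')
export class HealthController {
  constructor(
    private readonly health: HealthCheckService,
    private readonly numbering: NumberingHealthIndicator,
  ) {}

  // GET /health/numbering: aggregates the Redis, database, and sequence checks
  @Get('numbering')
  @HealthCheck()
  check() {
    return this.health.check([() => this.numbering.isHealthy('numbering')]);
  }
}
```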
|
||||
|
||||
---
|
||||
|
||||
## 4. Monitoring & Alerting
|
||||
|
||||
### 4.1 Prometheus Configuration
|
||||
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/alerts/numbering.yml"
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'backend'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'backend-1:3000'
|
||||
- 'backend-2:3000'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
  - job_name: 'redis'
    static_configs:
      - targets:
          # Redis does not expose Prometheus metrics natively; this assumes a
          # redis_exporter sidecar running next to the standalone instance
          - 'redis-exporter:9121'
    metrics_path: '/metrics'
|
||||
|
||||
- job_name: 'mariadb'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'mariadb-exporter:9104'
|
||||
```
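
The `backend` job expects each NestJS instance to expose a Prometheus endpoint at `/metrics`. A minimal way to do that is with the `prom-client` package (an assumption; the project may use a different metrics library):

```typescript
// metrics/metrics.controller.ts (sketch assuming the prom-client package)
import { Controller, Get, Header } from '@nestjs/common';
import { collectDefaultMetrics, register } from 'prom-client';

// Register default Node.js process metrics once at module load
collectDefaultMetrics();

@Controller('metrics')
export class MetricsController {
  // Prometheus scrapes this path on backend-1:3000 and backend-2:3000
  @Get()
  @Header('Content-Type', register.contentType)
  metrics(): Promise<string> {
    return register.metrics();
  }
}
```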
|
||||
|
||||
### 4.2 Alert Manager Configuration
|
||||
|
||||
```yaml
|
||||
# alertmanager.yml
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
receiver: 'default'
|
||||
group_by: ['alertname', 'severity']
|
||||
group_wait: 10s
|
||||
group_interval: 10s
|
||||
repeat_interval: 12h
|
||||
|
||||
routes:
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical'
|
||||
continue: true
|
||||
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'warning'
|
||||
|
||||
receivers:
|
||||
- name: 'default'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#lcbp3-alerts'
|
||||
title: '{{ .GroupLabels.alertname }}'
|
||||
text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
|
||||
|
||||
- name: 'critical'
|
||||
email_configs:
|
||||
- to: 'devops@lcbp3.com'
|
||||
from: 'alerts@lcbp3.com'
|
||||
smarthost: 'smtp.gmail.com:587'
|
||||
auth_username: 'alerts@lcbp3.com'
|
||||
auth_password: 'your-password'
|
||||
headers:
|
||||
Subject: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
|
||||
|
||||
pagerduty_configs:
|
||||
- service_key: 'YOUR_PAGERDUTY_KEY'
|
||||
|
||||
- name: 'warning'
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
|
||||
channel: '#lcbp3-warnings'
|
||||
```
|
||||
|
||||
### 4.3 Grafana Dashboards
|
||||
|
||||
#### Import Dashboard JSON
|
||||
```bash
|
||||
# Download dashboard template
|
||||
curl -o numbering-dashboard.json \
|
||||
https://raw.githubusercontent.com/lcbp3/grafana-dashboards/main/numbering.json
|
||||
|
||||
# Import to Grafana
|
||||
curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @numbering-dashboard.json
|
||||
```
|
||||
|
||||
#### Key Panels to Monitor
|
||||
1. **Numbers Generated per Minute** - Rate of number creation
|
||||
2. **Sequence Utilization** - Current usage vs max (alert >90%; see the metrics sketch after this list)
|
||||
3. **Lock Wait Time (p95)** - Performance indicator
|
||||
4. **Lock Failures** - System health indicator
|
||||
5. **Redis Health (Single instance)** - Node status
|
||||
6. **Database Connection Pool** - Resource usage
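
The utilization and lock-wait panels above assume the backend publishes custom numbering metrics. A sketch of how such series could be registered with `prom-client` (metric names here are assumptions, not the project's actual series):

```typescript
// metrics/numbering.metrics.ts (sketch; metric names are assumptions)
import { Counter, Gauge, Histogram } from 'prom-client';

export const numbersGenerated = new Counter({
  name: 'numbering_numbers_generated_total',
  help: 'Total document numbers generated',
  labelNames: ['document_type'],
});

export const sequenceUtilization = new Gauge({
  name: 'numbering_sequence_utilization_ratio',
  help: 'current_value / max_value per document type (alert above 0.9)',
  labelNames: ['document_type'],
});

export const lockWaitSeconds = new Histogram({
  name: 'numbering_lock_wait_seconds',
  help: 'Time spent waiting for the numbering lock',
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
});
```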
|
||||
|
||||
---
|
||||
|
||||
## 5. Backup & Recovery
|
||||
|
||||
### 5.1 Database Backup Strategy
|
||||
|
||||
#### Automated Backup Script
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/backup-numbering-db.sh
|
||||
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_DIR="/backups/numbering"
|
||||
DB_NAME="lcbp3_production"
|
||||
|
||||
echo "🔄 Starting backup at $DATE"
|
||||
|
||||
# Create backup directory
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
# Backup numbering tables only
|
||||
docker exec lcbp3-mariadb mysqldump \
|
||||
--single-transaction \
|
||||
--routines \
|
||||
--triggers \
|
||||
$DB_NAME \
|
||||
document_numbering_configs \
|
||||
document_numbering_sequences \
|
||||
document_numbering_audit_logs \
|
||||
> $BACKUP_DIR/numbering_$DATE.sql
|
||||
|
||||
# Compress backup
|
||||
gzip $BACKUP_DIR/numbering_$DATE.sql
|
||||
|
||||
# Keep only last 30 days
|
||||
find $BACKUP_DIR -name "numbering_*.sql.gz" -mtime +30 -delete
|
||||
|
||||
echo "✅ Backup complete: numbering_$DATE.sql.gz"
|
||||
```
|
||||
|
||||
#### Cron Schedule
|
||||
```cron
|
||||
# Run backup daily at 2 AM
|
||||
0 2 * * * /opt/lcbp3/scripts/backup-numbering-db.sh >> /var/log/numbering-backup.log 2>&1
|
||||
|
||||
# Run integrity check weekly on Sunday at 3 AM
|
||||
0 3 * * 0 /opt/lcbp3/scripts/check-sequence-integrity.sh >> /var/log/numbering-integrity.log 2>&1
|
||||
```
|
||||
|
||||
### 5.2 Redis Backup
|
||||
|
||||
#### Enable RDB Persistence
|
||||
```conf
|
||||
# redis.conf
|
||||
save 900 1      # Snapshot after 900 s if at least 1 key changed
save 300 10     # Snapshot after 300 s if at least 10 keys changed
save 60 10000   # Snapshot after 60 s if at least 10000 keys changed
|
||||
|
||||
dbfilename dump.rdb
|
||||
dir /data
|
||||
|
||||
# Enable AOF for durability
|
||||
appendonly yes
|
||||
appendfilename "appendonly.aof"
|
||||
appendfsync everysec
|
||||
```
|
||||
|
||||
#### Backup Script
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/backup-redis.sh
|
||||
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_DIR="/backups/redis"
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
echo "Backing up Redis..."
|
||||
|
||||
# Trigger BGSAVE
|
||||
docker exec cache redis-cli BGSAVE
|
||||
|
||||
# Wait for save to complete
|
||||
sleep 10
|
||||
|
||||
# Copy RDB file
|
||||
docker cp cache:/data/dump.rdb \
|
||||
$BACKUP_DIR/redis_${DATE}.rdb
|
||||
|
||||
# Copy AOF file
|
||||
docker cp cache:/data/appendonly.aof \
|
||||
$BACKUP_DIR/redis_${DATE}.aof
|
||||
|
||||
# Compress
|
||||
tar -czf $BACKUP_DIR/redis_${DATE}.tar.gz \
|
||||
$BACKUP_DIR/redis_${DATE}.rdb \
|
||||
$BACKUP_DIR/redis_${DATE}.aof
|
||||
|
||||
# Cleanup
|
||||
rm $BACKUP_DIR/redis_${DATE}.rdb $BACKUP_DIR/redis_${DATE}.aof
|
||||
|
||||
echo "✅ Redis backup complete: redis_${DATE}.tar.gz"
|
||||
```
|
||||
|
||||
### 5.3 Recovery Procedures
|
||||
|
||||
#### Scenario 1: Restore from Database Backup
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/restore-numbering-db.sh
|
||||
|
||||
BACKUP_FILE=$1
|
||||
|
||||
if [ -z "$BACKUP_FILE" ]; then
|
||||
echo "Usage: ./restore-numbering-db.sh <backup_file>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "⚠️ WARNING: This will overwrite current numbering data!"
|
||||
read -p "Continue? (yes/no): " confirm
|
||||
|
||||
if [ "$confirm" != "yes" ]; then
|
||||
echo "Aborted"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Decompress if needed
|
||||
if [[ $BACKUP_FILE == *.gz ]]; then
|
||||
gunzip -c $BACKUP_FILE > /tmp/restore.sql
|
||||
RESTORE_FILE="/tmp/restore.sql"
|
||||
else
|
||||
RESTORE_FILE=$BACKUP_FILE
|
||||
fi
|
||||
|
||||
# Restore
|
||||
docker exec -i lcbp3-mariadb mysql lcbp3_production < $RESTORE_FILE
|
||||
|
||||
echo "✅ Restore complete"
|
||||
echo "🔄 Please verify sequence integrity"
|
||||
```
|
||||
|
||||
#### Scenario 2: Redis Failure
|
||||
```bash
|
||||
# Check Redis status
|
||||
docker exec cache redis-cli ping
|
||||
|
||||
# If Redis is down, restart container
|
||||
docker restart cache
|
||||
|
||||
# Verify Redis is running
|
||||
docker exec cache redis-cli ping
|
||||
|
||||
# If restart fails, restore from backup
|
||||
./scripts/restore-redis.sh /backups/redis/latest.tar.gz
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Maintenance Procedures
|
||||
|
||||
### 6.1 Sequence Adjustment
|
||||
|
||||
#### Increase Max Value
|
||||
```sql
|
||||
-- Check current utilization
|
||||
SELECT
|
||||
dc.document_type,
|
||||
ds.current_value,
|
||||
dc.max_value,
|
||||
ROUND((ds.current_value * 100.0 / dc.max_value), 2) as utilization
|
||||
FROM document_numbering_sequences ds
|
||||
JOIN document_numbering_configs dc ON ds.config_id = dc.id
|
||||
WHERE ds.current_value > dc.max_value * 0.8;
|
||||
|
||||
-- Increase max_value for type approaching limit
|
||||
UPDATE document_numbering_configs
|
||||
SET max_value = max_value * 10,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE document_type = 'COR'
|
||||
AND max_value < 9999999;
|
||||
|
||||
-- Audit log
|
||||
INSERT INTO document_numbering_audit_logs (
|
||||
operation, document_type, old_value, new_value,
|
||||
user_id, metadata
|
||||
) VALUES (
|
||||
'ADJUST_MAX_VALUE', 'COR', '999999', '9999999',
|
||||
1, '{"reason": "Approaching limit", "automated": false}'
|
||||
);
|
||||
```
|
||||
|
||||
#### Reset Yearly Sequence
|
||||
```sql
|
||||
-- For document types with yearly reset
|
||||
-- Run on January 1st
|
||||
|
||||
START TRANSACTION;
|
||||
|
||||
-- Create new sequence for new year
|
||||
INSERT INTO document_numbering_sequences (
|
||||
config_id,
|
||||
scope_value,
|
||||
current_value,
|
||||
last_used_at
|
||||
)
|
||||
SELECT
|
||||
id as config_id,
|
||||
YEAR(CURDATE()) as scope_value,
|
||||
0 as current_value,
|
||||
NULL as last_used_at
|
||||
FROM document_numbering_configs
|
||||
WHERE scope = 'YEARLY';
|
||||
|
||||
-- Verify
|
||||
SELECT * FROM document_numbering_sequences
|
||||
WHERE scope_value = YEAR(CURDATE());
|
||||
|
||||
COMMIT;
|
||||
```
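
If the yearly reset should run automatically instead of by hand, the same INSERT can be issued from a scheduled job in the backend. A sketch assuming `@nestjs/schedule` and TypeORM (not the project's actual job):

```typescript
// numbering/yearly-reset.task.ts (sketch assuming @nestjs/schedule and TypeORM)
import { Injectable, Logger } from '@nestjs/common';
import { Cron } from '@nestjs/schedule';
import { DataSource } from 'typeorm';

@Injectable()
export class YearlyResetTask {
  private readonly logger = new Logger(YearlyResetTask.name);

  constructor(private readonly dataSource: DataSource) {}

  // 00:05 on January 1st, server local time
  @Cron('5 0 1 1 *')
  async resetYearlySequences(): Promise<void> {
    await this.dataSource.transaction(async (manager) => {
      // Same statement as the manual procedure above
      await manager.query(`
        INSERT INTO document_numbering_sequences (config_id, scope_value, current_value, last_used_at)
        SELECT id, YEAR(CURDATE()), 0, NULL
        FROM document_numbering_configs
        WHERE scope = 'YEARLY'
      `);
    });
    this.logger.log('Yearly numbering sequences created');
  }
}
```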
|
||||
|
||||
### 6.2 Cleanup Old Audit Logs
|
||||
|
||||
```sql
|
||||
-- Archive logs older than 2 years
|
||||
-- Run monthly
|
||||
|
||||
START TRANSACTION;
|
||||
|
||||
-- Create archive table (if not exists)
|
||||
CREATE TABLE IF NOT EXISTS document_numbering_audit_logs_archive
|
||||
LIKE document_numbering_audit_logs;
|
||||
|
||||
-- Move old logs to archive
|
||||
INSERT INTO document_numbering_audit_logs_archive
|
||||
SELECT * FROM document_numbering_audit_logs
|
||||
WHERE timestamp < DATE_SUB(CURDATE(), INTERVAL 2 YEAR);
|
||||
|
||||
-- Delete from main table
|
||||
DELETE FROM document_numbering_audit_logs
|
||||
WHERE timestamp < DATE_SUB(CURDATE(), INTERVAL 2 YEAR);
|
||||
|
||||
-- Optimize table
|
||||
OPTIMIZE TABLE document_numbering_audit_logs;
|
||||
|
||||
COMMIT;
|
||||
|
||||
-- Export archive to file (optional)
|
||||
SELECT * FROM document_numbering_audit_logs_archive
|
||||
INTO OUTFILE '/tmp/audit_archive_2023.csv'
|
||||
FIELDS TERMINATED BY ','
|
||||
ENCLOSED BY '"'
|
||||
LINES TERMINATED BY '\n';
|
||||
```
|
||||
|
||||
### 6.3 Redis Maintenance
|
||||
|
||||
#### Flush Expired Reservations
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/cleanup-expired-reservations.sh
|
||||
|
||||
echo "🧹 Cleaning up expired reservations..."
|
||||
|
||||
# Get all reservation keys (SCAN avoids blocking the standalone instance)
KEYS=$(docker exec lcbp3-redis redis-cli --scan --pattern "reservation:*")
|
||||
|
||||
COUNT=0
|
||||
for KEY in $KEYS; do
|
||||
# Check TTL
|
||||
  TTL=$(docker exec lcbp3-redis redis-cli TTL "$KEY")
|
||||
|
||||
if [ "$TTL" -lt 0 ]; then
|
||||
    # TTL -1 = no expiry (leaked reservation), -2 = already gone; safe to delete
|
||||
    docker exec lcbp3-redis redis-cli DEL "$KEY"
|
||||
((COUNT++))
|
||||
fi
|
||||
done
|
||||
|
||||
echo "✅ Cleaned up $COUNT expired reservations"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Disaster Recovery
|
||||
|
||||
### 7.1 Total System Failure
|
||||
|
||||
#### Recovery Steps
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/disaster-recovery.sh
|
||||
|
||||
echo "🚨 Starting disaster recovery..."
|
||||
|
||||
# 1. Start Redis
echo "1️⃣ Starting Redis..."
|
||||
docker-compose -f docker-compose-redis.yml up -d
|
||||
sleep 30
|
||||
|
||||
# 2. Restore Redis backups
|
||||
echo "2️⃣ Restoring Redis backups..."
|
||||
./scripts/restore-redis.sh /backups/redis/latest.tar.gz
|
||||
|
||||
# 3. Start database
|
||||
echo "3️⃣ Starting MariaDB..."
|
||||
docker-compose -f docker-compose-db.yml up -d
|
||||
sleep 30
|
||||
|
||||
# 4. Restore database
|
||||
echo "4️⃣ Restoring database..."
|
||||
./scripts/restore-numbering-db.sh /backups/db/latest.sql.gz
|
||||
|
||||
# 5. Verify sequence integrity
|
||||
echo "5️⃣ Verifying sequence integrity..."
|
||||
./scripts/check-sequence-integrity.sh
|
||||
|
||||
# 6. Start backend services
|
||||
echo "6️⃣ Starting backend services..."
|
||||
docker-compose -f docker-compose-backend.yml up -d
|
||||
|
||||
# 7. Run health checks
|
||||
echo "7️⃣ Running health checks..."
|
||||
sleep 60
|
||||
for PORT in 3001 3002; do
  curl -f http://localhost:$PORT/health || echo "Backend on port $PORT not healthy"
done
|
||||
|
||||
echo "✅ Disaster recovery complete"
|
||||
echo "⚠️ Please verify system functionality manually"
|
||||
```
|
||||
|
||||
### 7.2 RTO/RPO Targets
|
||||
|
||||
| Scenario | RTO | RPO | Priority |
|
||||
| ---------------------------- | ------- | ------ | -------- |
|
||||
| Single backend node failure | 0 min | 0 | P0 |
|
||||
| Single Redis node failure | 0 min | 0 | P0 |
|
||||
| Database primary failure | 5 min | 0 | P0 |
|
||||
| Complete data center failure | 1 hour | 15 min | P1 |
|
||||
| Data corruption | 4 hours | 1 day | P2 |
|
||||
|
||||
---
|
||||
|
||||
## 8. Runbooks
|
||||
|
||||
### 8.1 High Sequence Utilization (>90%)
|
||||
|
||||
**Alert**: `SequenceWarning` or `SequenceCritical`
|
||||
|
||||
**Steps**:
|
||||
1. Check current utilization
|
||||
```sql
|
||||
SELECT document_type, current_value, max_value,
|
||||
ROUND((current_value * 100.0 / max_value), 2) as pct
|
||||
FROM document_numbering_sequences s
|
||||
JOIN document_numbering_configs c ON s.config_id = c.id
|
||||
WHERE current_value > max_value * 0.9;
|
||||
```
|
||||
|
||||
2. Assess impact
|
||||
- How many numbers left?
|
||||
- Daily usage rate?
|
||||
- Days until exhaustion?
|
||||
|
||||
3. Take action
|
||||
```sql
|
||||
-- Option A: Increase max_value
|
||||
UPDATE document_numbering_configs
|
||||
SET max_value = max_value * 10
|
||||
WHERE document_type = 'COR';
|
||||
|
||||
-- Option B: Reset sequence (yearly types only)
|
||||
-- Schedule for next year/month
|
||||
```
|
||||
|
||||
4. Notify stakeholders
|
||||
5. Update monitoring thresholds if needed
|
||||
|
||||
---
|
||||
|
||||
### 8.2 High Lock Wait Time
|
||||
|
||||
**Alert**: `HighLockWaitTime`
|
||||
|
||||
**Steps**:
|
||||
1. Check Redis health

   ```bash
   docker exec lcbp3-redis redis-cli ping
   docker exec lcbp3-redis redis-cli INFO stats
   ```
|
||||
|
||||
2. Check database locks
|
||||
```sql
|
||||
SELECT * FROM information_schema.innodb_lock_waits;
|
||||
SELECT * FROM information_schema.innodb_trx
|
||||
WHERE trx_started < NOW() - INTERVAL 30 SECOND;
|
||||
```
|
||||
|
||||
3. Identify bottleneck
|
||||
- Redis slow?
|
||||
- Database slow?
|
||||
- High concurrent load?
|
||||
|
||||
4. Take action based on cause:
|
||||
- **Redis**: Add more nodes, check network latency
|
||||
- **Database**: Optimize queries, increase connection pool
|
||||
- **High load**: Scale horizontally (add backend nodes)
|
||||
|
||||
5. Monitor improvements
|
||||
|
||||
---
|
||||
|
||||
### 8.3 Redis Down
|
||||
|
||||
**Alert**: `RedisUnavailable`
|
||||
|
||||
**Steps**:
|
||||
1. Verify Redis is down
|
||||
```bash
|
||||
docker exec cache redis-cli ping || echo "Redis DOWN"
|
||||
```
|
||||
|
||||
2. Check that the system falls back to DB-only mode (see the sketch after these steps)
|
||||
```bash
|
||||
curl http://localhost:3001/health/numbering
|
||||
# Should show: fallback_mode: true
|
||||
```
|
||||
|
||||
3. Restart Redis container
|
||||
```bash
|
||||
docker restart cache
|
||||
sleep 10
|
||||
docker exec cache redis-cli ping
|
||||
```
|
||||
|
||||
4. If restart fails, restore from backup
|
||||
```bash
|
||||
./scripts/restore-redis.sh /backups/redis/latest.tar.gz
|
||||
```
|
||||
|
||||
5. Verify numbering system back to normal
|
||||
```bash
|
||||
curl http://localhost:3001/health/numbering
|
||||
# Should show: fallback_mode: false
|
||||
```
|
||||
|
||||
6. Review logs for root cause
|
||||
```bash
|
||||
docker logs cache --tail 100
|
||||
```
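
The `fallback_mode` flag checked in steps 2 and 5 implies the numbering service can degrade to a database-only path while Redis is unreachable. A minimal sketch of that pattern (structure and names are assumptions, not the actual implementation):

```typescript
// numbering/numbering.service.ts (sketch of the assumed Redis-to-DB fallback)
import { Injectable, Logger } from '@nestjs/common';
import Redis from 'ioredis';
import { DataSource } from 'typeorm';

@Injectable()
export class NumberingService {
  private readonly logger = new Logger(NumberingService.name);
  fallbackMode = false; // surfaced by /health/numbering

  constructor(private readonly redis: Redis, private readonly dataSource: DataSource) {}

  async nextNumber(documentType: string): Promise<number> {
    try {
      // Fast path: atomic counter in Redis
      const value = await this.redis.incr(`seq:${documentType}`);
      this.fallbackMode = false;
      return value;
    } catch (err: any) {
      // Redis unreachable: take the row-lock path in MariaDB (slower, still safe)
      this.fallbackMode = true;
      this.logger.warn(`Redis unavailable, using DB-only mode: ${err.message}`);
      return this.nextNumberFromDb(documentType);
    }
  }

  private nextNumberFromDb(documentType: string): Promise<number> {
    return this.dataSource.transaction(async (manager) => {
      const [row] = await manager.query(
        `SELECT s.id, s.current_value
           FROM document_numbering_sequences s
           JOIN document_numbering_configs c ON c.id = s.config_id
          WHERE c.document_type = ? FOR UPDATE`,
        [documentType],
      );
      const next = Number(row.current_value) + 1;
      await manager.query(
        'UPDATE document_numbering_sequences SET current_value = ?, last_used_at = NOW() WHERE id = ?',
        [next, row.id],
      );
      return next;
    });
  }
}
```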
|
||||
|
||||
---
|
||||
|
||||
## 9. Performance Tuning
|
||||
|
||||
### 9.1 Slow Number Generation
|
||||
|
||||
**Diagnosis**:
|
||||
```sql
|
||||
-- Check slow queries (requires log_output = TABLE; otherwise read /var/log/mysql/slow-query.log)
|
||||
SELECT * FROM mysql.slow_log
|
||||
WHERE sql_text LIKE '%document_numbering%'
|
||||
ORDER BY query_time DESC
|
||||
LIMIT 10;
|
||||
|
||||
-- Check index usage
|
||||
EXPLAIN SELECT * FROM document_numbering_sequences
|
||||
WHERE config_id = 1 AND scope_value = '2025'
|
||||
FOR UPDATE;
|
||||
```
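
The locked lookup profiled above corresponds to a pessimistic write lock when issued from TypeORM. A sketch, where the entity class and helper are assumptions introduced only for illustration:

```typescript
// Sketch: the same SELECT ... FOR UPDATE issued through TypeORM (entity name is an assumption)
import { Column, DataSource, Entity, PrimaryGeneratedColumn } from 'typeorm';

@Entity('document_numbering_sequences')
export class DocumentNumberingSequence {
  @PrimaryGeneratedColumn() id!: number;
  @Column({ name: 'config_id' }) configId!: number;
  @Column({ name: 'scope_value' }) scopeValue!: string;
  @Column({ name: 'current_value' }) currentValue!: number;
}

export function lockSequenceRow(dataSource: DataSource, configId: number, scopeValue: string) {
  return dataSource.transaction((manager) =>
    // Generates SELECT ... FOR UPDATE, the statement checked with EXPLAIN above
    manager.findOne(DocumentNumberingSequence, {
      where: { configId, scopeValue },
      lock: { mode: 'pessimistic_write' },
    }),
  );
}
```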
|
||||
|
||||
**Optimizations**:
|
||||
```sql
|
||||
-- Add missing indexes
|
||||
CREATE INDEX idx_sequence_lookup
|
||||
ON document_numbering_sequences(config_id, scope_value);
|
||||
|
||||
-- Optimize table
|
||||
OPTIMIZE TABLE document_numbering_sequences;
|
||||
|
||||
-- Update statistics
|
||||
ANALYZE TABLE document_numbering_sequences;
|
||||
```
|
||||
|
||||
### 9.2 Redis Memory Optimization
|
||||
|
||||
```bash
|
||||
# Check memory usage
|
||||
docker exec cache redis-cli INFO memory
|
||||
|
||||
# If memory high, check keys
|
||||
docker exec cache redis-cli --bigkeys
|
||||
|
||||
# Raise the memory ceiling; keep noeviction so pending reservations are never evicted
docker exec cache redis-cli CONFIG SET maxmemory 2gb
docker exec cache redis-cli CONFIG SET maxmemory-policy noeviction
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 10. Security Hardening
|
||||
|
||||
### 10.1 Redis Security
|
||||
|
||||
```conf
|
||||
# redis.conf
|
||||
requirepass your-strong-redis-password
|
||||
bind 0.0.0.0
|
||||
protected-mode yes
|
||||
rename-command FLUSHDB ""
|
||||
rename-command FLUSHALL ""
|
||||
rename-command CONFIG "CONFIG_abc123"
|
||||
```
|
||||
|
||||
### 10.2 Database Security
|
||||
|
||||
```sql
|
||||
-- Create dedicated numbering user
|
||||
CREATE USER 'numbering'@'%' IDENTIFIED BY 'strong-password';
|
||||
|
||||
-- Grant minimal permissions (MariaDB GRANT does not accept table-name wildcards, so list each table)
GRANT SELECT, INSERT, UPDATE ON lcbp3_production.document_numbering_configs TO 'numbering'@'%';
GRANT SELECT, INSERT, UPDATE ON lcbp3_production.document_numbering_sequences TO 'numbering'@'%';
GRANT SELECT, INSERT, UPDATE ON lcbp3_production.document_numbering_audit_logs TO 'numbering'@'%';
|
||||
GRANT SELECT ON lcbp3_production.users TO 'numbering'@'%';
|
||||
|
||||
FLUSH PRIVILEGES;
|
||||
```
|
||||
|
||||
### 10.3 Network Security
|
||||
|
||||
```yaml
|
||||
# docker-compose-network.yml
|
||||
networks:
|
||||
lcbp3:
|
||||
driver: bridge
|
||||
ipam:
|
||||
config:
|
||||
- subnet: 172.20.0.0/16
|
||||
driver_opts:
|
||||
com.docker.network.bridge.name: lcbp3-br
|
||||
com.docker.network.bridge.enable_icc: "true"
|
||||
com.docker.network.bridge.enable_ip_masquerade: "true"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 11. Compliance & Audit
|
||||
|
||||
### 11.1 Audit Log Retention
|
||||
|
||||
```sql
|
||||
-- Export audit logs for compliance
|
||||
SELECT *
|
||||
FROM document_numbering
|
||||
```

---
|
||||
# 🗺️ Network Architecture & Container Services Diagrams (LCBP3-DMS)

These diagrams show the network segmentation (VLANs), the firewall connections (ACLs), and the roles of the two servers (QNAP: Application, ASUSTOR: Infrastructure).

> 📖 **See Server Roles and Service Distribution details in:** [README.md](README.md#-hardware-infrastructure)
|
||||
|
||||
---
|
||||
|
||||
## 1. Data Flow Diagram
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph Internet["🌐 Internet"]
|
||||
User[("👤 User")]
|
||||
end
|
||||
|
||||
subgraph QNAP["💾 QNAP TS-473A (App Server)"]
|
||||
NPM["🔲 NPM<br/>(Reverse Proxy)"]
|
||||
Frontend["📱 Next.js<br/>(Frontend)"]
|
||||
Backend["⚙️ NestJS<br/>(Backend API)"]
|
||||
DB["🗄️ MariaDB"]
|
||||
Redis["📦 Redis"]
|
||||
ES["🔍 Elasticsearch"]
|
||||
end
|
||||
|
||||
subgraph ASUSTOR["💾 ASUSTOR AS5403T (Infra Server)"]
|
||||
Portainer["🐳 Portainer"]
|
||||
Registry["📦 Registry"]
|
||||
Prometheus["📊 Prometheus"]
|
||||
Grafana["📈 Grafana"]
|
||||
Uptime["⏱️ Uptime Kuma"]
|
||||
Backup["💾 Restic/Borg"]
|
||||
NFS["📁 NFS Storage"]
|
||||
end
|
||||
|
||||
User -->|HTTPS 443| NPM
|
||||
NPM --> Frontend
|
||||
NPM --> Backend
|
||||
Frontend --> Backend
|
||||
Backend --> DB
|
||||
Backend --> Redis
|
||||
Backend --> ES
|
||||
|
||||
DB -.->|Scheduled Backup| Backup
|
||||
Backup --> NFS
|
||||
|
||||
Portainer -.->|Manage| QNAP
|
||||
Prometheus -.->|Collect Metrics| Backend
|
||||
Prometheus -.->|Collect Metrics| DB
|
||||
Uptime -.->|Health Check| NPM
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Docker Management View
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph Portainer["🐳 Portainer (ASUSTOR - Central Management)"]
|
||||
direction TB
|
||||
|
||||
subgraph LocalStack["📦 Local Infra Stack"]
|
||||
Registry["Docker Registry"]
|
||||
Prometheus["Prometheus"]
|
||||
Grafana["Grafana"]
|
||||
Uptime["Uptime Kuma"]
|
||||
Backup["Restic/Borg"]
|
||||
Loki["Loki (Logs)"]
|
||||
ClamAV["ClamAV"]
|
||||
end
|
||||
|
||||
subgraph RemoteStack["🔗 Remote: QNAP App Stack"]
|
||||
Frontend["Next.js"]
|
||||
Backend["NestJS"]
|
||||
MariaDB["MariaDB"]
|
||||
Redis["Redis"]
|
||||
ES["Elasticsearch"]
|
||||
NPM["NPM"]
|
||||
Gitea["Gitea"]
|
||||
N8N["n8n"]
|
||||
PMA["phpMyAdmin"]
|
||||
end
|
||||
end
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Security Zones Diagram
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph PublicZone["🌐 PUBLIC ZONE"]
|
||||
direction LR
|
||||
NPM["NPM (Reverse Proxy)"]
|
||||
SSL["SSL/TLS Termination"]
|
||||
end
|
||||
|
||||
subgraph AppZone["📱 APPLICATION ZONE (QNAP)"]
|
||||
direction LR
|
||||
Frontend["Next.js"]
|
||||
Backend["NestJS"]
|
||||
N8N["n8n"]
|
||||
Gitea["Gitea"]
|
||||
end
|
||||
|
||||
subgraph DataZone["💾 DATA ZONE (QNAP - Internal Only)"]
|
||||
direction LR
|
||||
MariaDB["MariaDB"]
|
||||
Redis["Redis"]
|
||||
ES["Elasticsearch"]
|
||||
end
|
||||
|
||||
subgraph InfraZone["🛠️ INFRASTRUCTURE ZONE (ASUSTOR)"]
|
||||
direction LR
|
||||
Backup["Backup Services"]
|
||||
Registry["Docker Registry"]
|
||||
Monitoring["Prometheus + Grafana"]
|
||||
Logs["Loki / Syslog"]
|
||||
end
|
||||
|
||||
PublicZone -->|HTTPS Only| AppZone
|
||||
AppZone -->|Internal API| DataZone
|
||||
DataZone -.->|Backup| InfraZone
|
||||
AppZone -.->|Metrics| InfraZone
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Network Flow Diagram
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
direction TB
|
||||
|
||||
    subgraph Flow1["External Connections (Public WAN)"]
|
||||
User["ผู้ใช้งานภายนอก (Internet)"]
|
||||
end
|
||||
|
||||
subgraph Router["Router (ER7206) - Gateway"]
|
||||
User -- "Port 80/443 (HTTPS/HTTP)" --> ER7206
|
||||
ER7206["Port Forwarding<br/>TCP 80 → 192.168.10.8:80<br/>TCP 443 → 192.168.10.8:443"]
|
||||
end
|
||||
|
||||
    subgraph VLANs["Internal Network (VLANs & Firewall Rules)"]
|
||||
direction LR
|
||||
|
||||
subgraph VLAN10["VLAN 10: Servers<br/>192.168.10.x"]
|
||||
QNAP["QNAP NAS<br/>(192.168.10.8)"]
|
||||
ASUSTOR["ASUSTOR NAS<br/>(192.168.10.9)"]
|
||||
end
|
||||
|
||||
subgraph VLAN20["VLAN 20: MGMT<br/>192.168.20.x"]
|
||||
AdminPC["Admin PC / Switches"]
|
||||
end
|
||||
|
||||
subgraph VLAN30["VLAN 30: USER<br/>192.168.30.x"]
|
||||
OfficePC["PC พนักงาน/Wi-Fi"]
|
||||
end
|
||||
|
||||
subgraph VLAN70["VLAN 70: GUEST<br/>192.168.70.x"]
|
||||
GuestPC["Guest Wi-Fi"]
|
||||
end
|
||||
|
||||
subgraph Firewall["Firewall ACLs (OC200/ER7206)"]
|
||||
direction TB
|
||||
rule1["Rule 1: DENY<br/>Guest (VLAN 70) → All VLANs"]
|
||||
rule2["Rule 2: DENY<br/>Server (VLAN 10) → User (VLAN 30)"]
|
||||
rule3["Rule 3: ALLOW<br/>User (VLAN 30) → QNAP<br/>Ports: 443, 80"]
|
||||
rule4["Rule 4: ALLOW<br/>MGMT (VLAN 20) → All"]
|
||||
end
|
||||
|
||||
GuestPC -.x|rule1| QNAP
|
||||
QNAP -.x|rule2| OfficePC
|
||||
OfficePC -- "https://lcbp3.np-dms.work" -->|rule3| QNAP
|
||||
AdminPC -->|rule4| QNAP
|
||||
AdminPC -->|rule4| ASUSTOR
|
||||
end
|
||||
|
||||
ER7206 --> QNAP
|
||||
|
||||
subgraph DockerQNAP["Docker 'lcbp3' (QNAP - Applications)"]
|
||||
direction TB
|
||||
|
||||
        subgraph PublicServices["Services Exposed Externally via NPM"]
|
||||
direction LR
|
||||
NPM["NPM (Nginx Proxy Manager)"]
|
||||
FrontendC["frontend:3000"]
|
||||
BackendC["backend:3000"]
|
||||
GiteaC["gitea:3000"]
|
||||
PMAC["pma:80"]
|
||||
N8NC["n8n:5678"]
|
||||
end
|
||||
|
||||
subgraph InternalServices["Internal Services (Backend Only)"]
|
||||
direction LR
|
||||
DBC["mariadb:3306"]
|
||||
CacheC["cache:6379"]
|
||||
SearchC["search:9200"]
|
||||
end
|
||||
|
||||
NPM -- "lcbp3.np-dms.work" --> FrontendC
|
||||
NPM -- "backend.np-dms.work" --> BackendC
|
||||
NPM -- "git.np-dms.work" --> GiteaC
|
||||
NPM -- "pma.np-dms.work" --> PMAC
|
||||
NPM -- "n8n.np-dms.work" --> N8NC
|
||||
|
||||
BackendC -- "lcbp3 Network" --> DBC
|
||||
BackendC -- "lcbp3 Network" --> CacheC
|
||||
BackendC -- "lcbp3 Network" --> SearchC
|
||||
end
|
||||
|
||||
subgraph DockerASUSTOR["Docker 'lcbp3' (ASUSTOR - Infrastructure)"]
|
||||
direction TB
|
||||
|
||||
subgraph InfraServices["Infrastructure Services"]
|
||||
direction LR
|
||||
PortainerC["portainer:9443"]
|
||||
RegistryC["registry:5000"]
|
||||
PrometheusC["prometheus:9090"]
|
||||
GrafanaC["grafana:3000"]
|
||||
UptimeC["uptime-kuma:3001"]
|
||||
end
|
||||
|
||||
subgraph BackupServices["Backup & Storage"]
|
||||
direction LR
|
||||
ResticC["restic/borg"]
|
||||
NFSC["NFS Share"]
|
||||
end
|
||||
|
||||
PortainerC -.->|"Remote Endpoint"| NPM
|
||||
PrometheusC -.->|"Scrape Metrics"| BackendC
|
||||
ResticC --> NFSC
|
||||
end
|
||||
|
||||
QNAP --> NPM
|
||||
ASUSTOR --> PortainerC
|
||||
DBC -.->|"Scheduled Backup"| ResticC
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Firewall & Security Configuration
|
||||
|
||||
> 📖 **See Firewall ACLs and Port Forwarding details in:** [03_Securities.md](03_Securities.md)

The `03_Securities.md` file covers:
|
||||
- 🌐 VLAN Segmentation
|
||||
- 🔥 Firewall Rules (IP Groups, Port Groups, Switch ACL, Gateway ACL)
|
||||
- 🚪 Port Forwarding Configuration
|
||||
|
||||
---
|
||||
|
||||
## 6. Container Service Distribution
|
||||
|
||||
> 📖 **See Container Services, Ports, and Domain Mapping details in:** [README.md](README.md#-domain-mapping-npm-proxy)
|
||||
|
||||
---
|
||||
|
||||
## 7. Backup Flow
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph QNAP["💾 QNAP TS-473A (Source)"]
|
||||
direction TB
|
||||
DB["🗄️ MariaDB<br/>(mysqldump)"]
|
||||
Redis["📦 Redis<br/>(RDB + AOF)"]
|
||||
Config["⚙️ App Config<br/>+ Volumes"]
|
||||
end
|
||||
|
||||
subgraph ASUSTOR["💾 ASUSTOR AS5403T (Target)"]
|
||||
direction TB
|
||||
BackupDB["📁 /volume1/backup/db/<br/>(Restic Repository)"]
|
||||
BackupRedis["📁 /volume1/backup/redis/"]
|
||||
BackupConfig["📁 /volume1/backup/config/"]
|
||||
end
|
||||
|
||||
DB -->|"Daily 2AM"| BackupDB
|
||||
Redis -->|"Daily 3AM"| BackupRedis
|
||||
Config -->|"Weekly Sun 4AM"| BackupConfig
|
||||
|
||||
subgraph Retention["📋 Retention Policy"]
|
||||
R1["Daily: 7 days"]
|
||||
R2["Weekly: 4 weeks"]
|
||||
R3["Monthly: 6 months"]
|
||||
end
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
> 📝 **Note**: This document is based on Architecture Document **v1.8.0** - Last updated: 2026-01-28
|
||||
|
||||
# Installing the Monitoring Stack on ASUSTOR
|
||||
|
||||
## **📝 Overview and Considerations**

> ⚠️ **Note**: The entire monitoring stack is installed on the **ASUSTOR AS5403T**, not on the QNAP,
> so that the application workload stays separate from the infrastructure/monitoring workload.

The monitoring stack consists of:
|
||||
|
||||
| Service | Port | Purpose | Host |
|
||||
| :---------------- | :--------------------------- | :-------------------------------- | :------ |
|
||||
| **Prometheus**    | 9090                         | Metrics and time-series storage    | ASUSTOR |
| **Grafana**       | 3000                         | Dashboards for visualizing metrics | ASUSTOR |
| **Node Exporter** | 9100                         | Host system metrics                | Both    |
| **cAdvisor**      | 8080 (ASUSTOR) / 8088 (QNAP) | Docker container metrics           | Both    |
|
||||
| **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR |
|
||||
| **Loki** | 3100 | Log aggregation | ASUSTOR |
|
||||
| **Promtail** | - | Log shipper (Sender) | ASUSTOR |
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ ASUSTOR AS5403T (Monitoring Hub) │
|
||||
├─────────────────────────────────────────────────────────────────────────┤
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ Prometheus │───▶│ Grafana │ │ Uptime Kuma │ │
|
||||
│ │ :9090 │ │ :3000 │ │ :3001 │ │
|
||||
│ └──────┬──────┘ └─────────────┘ └─────────────┘ │
|
||||
│ │ │
|
||||
│ │ Scrape Metrics │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │node-exporter│ │ cAdvisor │ │ Promtail │ │
|
||||
│ │ :9100 │ │ :8080 │ │ (Log Ship) │ │
|
||||
│ │ (Local) │ │ (Local) │ │ (Local) │ │
|
||||
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
│ Remote Scrape
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ QNAP TS-473A (App Server) │
|
||||
├─────────────────────────────────────────────────────────────────────────┤
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │node-exporter│ │ cAdvisor │ │ Backend │ │
|
||||
│ │ :9100 │ │ :8080 │ │ /metrics │ │
|
||||
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Set Permissions (on ASUSTOR)
|
||||
|
||||
```bash
|
||||
# SSH into ASUSTOR
|
||||
ssh admin@192.168.10.9
|
||||
|
||||
# Create directories
|
||||
mkdir -p /volume1/np-dms/monitoring/prometheus/data
|
||||
mkdir -p /volume1/np-dms/monitoring/prometheus/config
|
||||
mkdir -p /volume1/np-dms/monitoring/grafana/data
|
||||
mkdir -p /volume1/np-dms/monitoring/uptime-kuma/data
|
||||
mkdir -p /volume1/np-dms/monitoring/loki/data
|
||||
mkdir -p /volume1/np-dms/monitoring/promtail/config
|
||||
|
||||
# Set ownership to match the user IDs used inside each container
|
||||
# Prometheus (UID 65534 - nobody)
|
||||
chown -R 65534:65534 /volume1/np-dms/monitoring/prometheus
|
||||
chmod -R 750 /volume1/np-dms/monitoring/prometheus
|
||||
|
||||
# Grafana (UID 472)
|
||||
chown -R 472:472 /volume1/np-dms/monitoring/grafana/data
|
||||
chmod -R 750 /volume1/np-dms/monitoring/grafana/data
|
||||
|
||||
# Uptime Kuma (UID 1000)
|
||||
chown -R 1000:1000 /volume1/np-dms/monitoring/uptime-kuma/data
|
||||
chmod -R 750 /volume1/np-dms/monitoring/uptime-kuma/data
|
||||
|
||||
# Loki (UID 10001)
|
||||
chown -R 10001:10001 /volume1/np-dms/monitoring/loki/data
|
||||
chmod -R 750 /volume1/np-dms/monitoring/loki/data
|
||||
|
||||
# Promtail (Runs as root to read docker logs - no specific chown needed for config dir if created by admin)
|
||||
# But ensure config file is readable
|
||||
chmod -R 755 /volume1/np-dms/monitoring/promtail/config
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔗 Create the Docker Network (one-time setup)

> ⚠️ **The `lcbp3` network must exist before deploying any of the docker-compose stacks,** because every service attaches to it as an external network.

### Create via Portainer (recommended)

1. Open **Portainer** → select the ASUSTOR environment
2. Go to **Networks** → **Add network**
3. Fill in:
   - **Name:** `lcbp3`
   - **Driver:** `bridge`
4. Click **Create the network**

### Create via SSH

```bash
# SSH into ASUSTOR
ssh admin@192.168.10.9

# Create the external network
docker network create lcbp3

# Verify
docker network ls | grep lcbp3
docker network inspect lcbp3
```

> 📖 **QNAP** also needs a network named `lcbp3` (create it via Container Station or SSH).
> See [README.md – Quick Reference](README.md#-quick-reference) for the commands on QNAP.
|
||||
|
||||
---
|
||||
|
||||
## Note: NPM Proxy Configuration (NPM runs on QNAP → forwards to ASUSTOR)

> ⚠️ Because NPM runs on **QNAP** while the monitoring services run on **ASUSTOR**,
> the proxy hosts must use the **IP address** (`192.168.10.9`) instead of container names (names cannot be resolved across hosts).
|
||||
|
||||
| Domain Names | Scheme | Forward Hostname | Forward Port | Block Common Exploits | Websockets | Force SSL | HTTP/2 |
|
||||
| :--------------------- | :----- | :--------------- | :----------- | :-------------------- | :--------- | :-------- | :----- |
|
||||
| grafana.np-dms.work | `http` | `192.168.10.9` | 3000 | [x] | [x] | [x] | [x] |
|
||||
| prometheus.np-dms.work | `http` | `192.168.10.9` | 9090 | [x] | [ ] | [x] | [x] |
|
||||
| uptime.np-dms.work | `http` | `192.168.10.9` | 3001 | [x] | [x] | [x] | [x] |
|
||||
|
||||
---
|
||||
|
||||
## Docker Compose File (ASUSTOR)
|
||||
|
||||
```yaml
|
||||
# File: /volume1/np-dms/monitoring/docker-compose.yml
|
||||
# DMS Container v1.8.0: Application name: lcbp3-monitoring
|
||||
# Deploy on: ASUSTOR AS5403T
|
||||
# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail
|
||||
|
||||
x-restart: &restart_policy
|
||||
restart: unless-stopped
|
||||
|
||||
x-logging: &default_logging
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "5"
|
||||
|
||||
networks:
|
||||
lcbp3:
|
||||
external: true
|
||||
|
||||
services:
|
||||
# ----------------------------------------------------------------
|
||||
# 1. Prometheus (Metrics Collection & Storage)
|
||||
# ----------------------------------------------------------------
|
||||
prometheus:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: prom/prometheus:v2.48.0
|
||||
container_name: prometheus
|
||||
stdin_open: true
|
||||
tty: true
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "1.0"
|
||||
memory: 1G
|
||||
reservations:
|
||||
cpus: "0.25"
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.enable-lifecycle'
|
||||
ports:
|
||||
- "9090:9090"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro"
|
||||
- "/volume1/np-dms/monitoring/prometheus/data:/prometheus"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 2. Grafana (Dashboard & Visualization)
|
||||
# ----------------------------------------------------------------
|
||||
grafana:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: grafana/grafana:10.2.2
|
||||
container_name: grafana
|
||||
stdin_open: true
|
||||
tty: true
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "1.0"
|
||||
memory: 512M
|
||||
reservations:
|
||||
cpus: "0.25"
|
||||
memory: 128M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
GF_SECURITY_ADMIN_USER: admin
|
||||
GF_SECURITY_ADMIN_PASSWORD: "Center#2025"
|
||||
GF_SERVER_ROOT_URL: "https://grafana.np-dms.work"
|
||||
GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel
|
||||
ports:
|
||||
- "3000:3000"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana"
|
||||
depends_on:
|
||||
- prometheus
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 3. Uptime Kuma (Service Availability Monitoring)
|
||||
# ----------------------------------------------------------------
|
||||
uptime-kuma:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: louislam/uptime-kuma:1
|
||||
container_name: uptime-kuma
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
ports:
|
||||
- "3001:3001"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 4. Node Exporter (Host Metrics - ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
node-exporter:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: prom/node-exporter:v1.7.0
|
||||
container_name: node-exporter
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
memory: 128M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 5. cAdvisor (Container Metrics - ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
cadvisor:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.2
|
||||
container_name: cadvisor
|
||||
privileged: true
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
ports:
|
||||
- "8088:8088"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 6. Loki (Log Aggregation)
|
||||
# ----------------------------------------------------------------
|
||||
loki:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: grafana/loki:2.9.0
|
||||
container_name: loki
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
memory: 512M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
ports:
|
||||
- "3100:3100"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/loki/data:/loki"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 7. Promtail (Log Shipper)
|
||||
# ----------------------------------------------------------------
|
||||
promtail:
|
||||
<<: [*restart_policy, *default_logging]
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: promtail
|
||||
user: "0:0"
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "0.5"
|
||||
memory: 256M
|
||||
environment:
|
||||
TZ: "Asia/Bangkok"
|
||||
command: -config.file=/etc/promtail/promtail-config.yml
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro"
|
||||
- "/var/run/docker.sock:/var/run/docker.sock:ro"
|
||||
- "/var/lib/docker/containers:/var/lib/docker/containers:ro"
|
||||
depends_on:
|
||||
- loki
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## QNAP Node Exporter & cAdvisor
|
||||
|
||||
Install node-exporter and cAdvisor on QNAP so that Prometheus on ASUSTOR can scrape their metrics:
|
||||
|
||||
```yaml
|
||||
# File: /share/np-dms/monitoring/docker-compose.yml (QNAP)
|
||||
# Exporters only - metrics are scraped by Prometheus on ASUSTOR
|
||||
|
||||
version: '3.8'
|
||||
|
||||
networks:
|
||||
lcbp3:
|
||||
external: true
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: prom/node-exporter:v1.7.0
|
||||
container_name: node-exporter
|
||||
restart: unless-stopped
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.2
|
||||
container_name: cadvisor
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
ports:
|
||||
- "8088:8080"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:ro
|
||||
|
||||
mysqld-exporter:
|
||||
image: prom/mysqld-exporter:v0.15.0
|
||||
container_name: mysqld-exporter
|
||||
restart: unless-stopped
|
||||
user: root
|
||||
command:
|
||||
- '--config.my-cnf=/etc/mysql/my.cnf'
|
||||
ports:
|
||||
- "9104:9104"
|
||||
networks:
|
||||
- lcbp3
|
||||
volumes:
|
||||
- "/share/np-dms/monitoring/mysqld-exporter/.my.cnf:/etc/mysql/my.cnf:ro"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Prometheus Configuration
|
||||
|
||||
Create the file `/volume1/np-dms/monitoring/prometheus/config/prometheus.yml` on ASUSTOR:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring (ASUSTOR)
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# ============================================
|
||||
# ASUSTOR Metrics (Local)
|
||||
# ============================================
|
||||
|
||||
# Host metrics from Node Exporter (ASUSTOR)
|
||||
- job_name: 'asustor-node'
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
|
||||
# Container metrics from cAdvisor (ASUSTOR)
|
||||
- job_name: 'asustor-cadvisor'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
|
||||
# ============================================
|
||||
# QNAP Metrics (Remote - 192.168.10.8)
|
||||
# ============================================
|
||||
|
||||
# Host metrics from Node Exporter (QNAP)
|
||||
- job_name: 'qnap-node'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:9100']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
|
||||
# Container metrics from cAdvisor (QNAP)
|
||||
- job_name: 'qnap-cadvisor'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:8088']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
|
||||
# Backend NestJS application (QNAP)
|
||||
- job_name: 'backend'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:3000']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# MariaDB Exporter (QNAP)
|
||||
- job_name: 'mariadb'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:9104']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Uptime Kuma Monitors
|
||||
|
||||
Once Uptime Kuma is up, add the following monitors:
|
||||
|
||||
| Monitor Name | Type | URL / Host | Interval |
|
||||
| :------------ | :--- | :--------------------------------- | :------- |
|
||||
| QNAP NPM | HTTP | https://npm.np-dms.work | 60s |
|
||||
| Frontend | HTTP | https://lcbp3.np-dms.work | 60s |
|
||||
| Backend API | HTTP | https://backend.np-dms.work/health | 60s |
|
||||
| MariaDB | TCP | 192.168.10.8:3306 | 60s |
|
||||
| Redis | TCP | 192.168.10.8:6379 | 60s |
|
||||
| Elasticsearch | HTTP | http://192.168.10.8:9200 | 60s |
|
||||
| Gitea | HTTP | https://git.np-dms.work | 60s |
|
||||
| n8n | HTTP | https://n8n.np-dms.work | 60s |
|
||||
| Grafana | HTTP | https://grafana.np-dms.work | 60s |
|
||||
| QNAP Host | Ping | 192.168.10.8 | 60s |
|
||||
| ASUSTOR Host | Ping | 192.168.10.9 | 60s |
|
||||
|
||||
---
|
||||
|
||||
## Grafana Dashboards
|
||||
|
||||
### Recommended Dashboards to Import
|
||||
|
||||
| Dashboard ID | Name | Purpose |
|
||||
| :----------- | :--------------------------- | :----------------------------- |
|
||||
| 1860 | Node Exporter Full | Host system metrics |
|
||||
| 14282 | cAdvisor exporter | Container metrics |
|
||||
| 11074 | Node Exporter for Prometheus | Node overview |
|
||||
| 893 | Docker and Container | Docker overview |
|
||||
| 7362 | MySQL | MySQL view |
|
||||
| 1214 | Redis | Redis view |
|
||||
| 14204 | Elasticsearch | Elasticsearch view |
|
||||
| 13106 | MySQL/MariaDB Overview | Detailed MySQL/MariaDB metrics |
|
||||
|
||||
|
||||
### Import Dashboard via Grafana UI
|
||||
|
||||
1. Go to **Dashboards → Import**
|
||||
2. Enter Dashboard ID (e.g., `1860`)
|
||||
3. Select Prometheus data source
|
||||
4. Click **Import**
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Deploy lcbp3-monitoring on ASUSTOR

### 📋 Prerequisites Checklist

| #    | Step                                                                                                          | Status |
| :--- | :------------------------------------------------------------------------------------------------------------ | :----- |
| 1    | Can SSH into ASUSTOR (`ssh admin@192.168.10.9`)                                                                 | ✅     |
| 2    | Docker network `lcbp3` created (see [Create the Docker Network](#-create-the-docker-network-one-time-setup))   | ✅     |
| 3    | Directories created and permissions set (see [Set Permissions](#set-permissions-on-asustor))                   | ✅     |
| 4    | `prometheus.yml` created (see [Prometheus Configuration](#prometheus-configuration))                           | ✅     |
| 5    | `promtail-config.yml` created (see [Step 1.2](#step-12-create-promtail-configyml))                             | ✅     |
|
||||
|
||||
---
|
||||
|
||||
### Step 1: Create prometheus.yml
|
||||
|
||||
```bash
|
||||
# SSH into ASUSTOR
|
||||
ssh admin@192.168.10.9
|
||||
|
||||
# Create prometheus.yml
|
||||
cat > /volume1/np-dms/monitoring/prometheus/config/prometheus.yml << 'EOF'
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'asustor-node'
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
|
||||
- job_name: 'asustor-cadvisor'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
|
||||
- job_name: 'qnap-node'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:9100']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
|
||||
- job_name: 'qnap-cadvisor'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:8088']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
|
||||
- job_name: 'backend'
|
||||
static_configs:
|
||||
- targets: ['192.168.10.8:3000']
|
||||
labels:
|
||||
host: 'qnap'
|
||||
metrics_path: '/metrics'
|
||||
EOF
|
||||
|
||||
# Verify
|
||||
cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
|
||||
```
|
||||
|
||||
### Step 1.2: Create promtail-config.yml

Create a config that lets Promtail read logs from the Docker containers and ship them to Loki:
|
||||
|
||||
```bash
|
||||
# Create promtail-config.yml
|
||||
cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF'
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: docker
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 5s
|
||||
relabel_configs:
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '/(.*)'
|
||||
target_label: 'container'
|
||||
- source_labels: ['__meta_docker_container_log_stream']
|
||||
target_label: 'stream'
|
||||
EOF

# Verify
cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml
```

---

## System Preparation on QNAP (Before Deploying the Stack)

### 1. Create a Monitoring User in MariaDB

Run this SQL via **phpMyAdmin** or `docker exec`:
|
||||
```sql
|
||||
CREATE USER 'exporter'@'%' IDENTIFIED BY 'Center2025' WITH MAX_USER_CONNECTIONS 3;
|
||||
GRANT PROCESS, REPLICATION CLIENT, SELECT, SLAVE MONITOR ON *.* TO 'exporter'@'%';
|
||||
FLUSH PRIVILEGES;
|
||||
```
|
||||
|
||||
### 2. Create the .my.cnf Config File on QNAP

So that `mysqld-exporter` can read a password containing special characters correctly:
|
||||
|
||||
1. **SSH into QNAP** (or create the folder via File Station):
|
||||
```bash
|
||||
ssh admin@192.168.10.8
|
||||
```
|
||||
2. **Create a directory for the config**:
|
||||
```bash
|
||||
mkdir -p /share/np-dms/monitoring/mysqld-exporter
|
||||
```
|
||||
3. **Create the .my.cnf file**:
|
||||
```bash
|
||||
cat > /share/np-dms/monitoring/mysqld-exporter/.my.cnf << 'EOF'
|
||||
[client]
|
||||
user=exporter
|
||||
password=Center2025
|
||||
host=mariadb
|
||||
EOF
|
||||
```
|
||||
4. **Set file permissions** (so the container can read the file):
|
||||
```bash
|
||||
chmod 644 /share/np-dms/monitoring/mysqld-exporter/.my.cnf
|
||||
```
|
||||
|
||||
|
||||
|
||||
---
|
||||
|
||||
### Step 2: Deploy via Portainer (recommended)

1. Open **Portainer** → select the **ASUSTOR** environment
2. Go to **Stacks** → **Add stack**
3. Fill in:
   - **Name:** `lcbp3-monitoring`
   - **Build method:** select **Web editor**
4. Paste the contents of the [Docker Compose File (ASUSTOR)](#docker-compose-file-asustor) above
5. Click **Deploy the stack**

> ⚠️ **Important:** Check the Grafana password (`GF_SECURITY_ADMIN_PASSWORD`) in the docker-compose file before deploying.

### Deploy via SSH (fallback method)
|
||||
|
||||
```bash
|
||||
# SSH into ASUSTOR
|
||||
ssh admin@192.168.10.9
|
||||
|
||||
# Copy docker-compose.yml to the target path
# (place the file at /volume1/np-dms/monitoring/docker-compose.yml)
|
||||
|
||||
# Deploy
|
||||
cd /volume1/np-dms/monitoring
|
||||
docker compose up -d
|
||||
|
||||
# ตรวจสอบ container status
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
---

### Step 3: Verify Services

```bash
# Check all containers
docker ps --filter "name=prometheus" --filter "name=grafana" \
  --filter "name=uptime-kuma" --filter "name=node-exporter" \
  --filter "name=cadvisor" --filter "name=loki" --filter "name=promtail"
```

| Service           | How to check                                                        | Expected result                          |
| :---------------- | :------------------------------------------------------------------ | :--------------------------------------- |
| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy`                            | `Prometheus Server is Healthy`            |
| ✅ **Grafana**    | Open `https://grafana.np-dms.work` (or `http://192.168.10.9:3000`)   | Login page                                |
| ✅ **Uptime Kuma** | Open `https://uptime.np-dms.work` (or `http://192.168.10.9:3001`)   | Setup page                                |
| ✅ **Node Exp.**  | `curl http://192.168.10.9:9100/metrics \| head`                      | Metrics output                            |
| ✅ **cAdvisor**   | `curl http://192.168.10.9:8080/healthz`                              | `ok`                                      |
| ✅ **Loki**       | `curl http://192.168.10.9:3100/ready`                                | `ready`                                   |
| ✅ **Promtail**   | Check logs: `docker logs promtail`                                   | No errors + connection success messages   |

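The HTTP checks in the table can also be run as one sweep from the ASUSTOR shell; a small sketch using the same endpoints:

```bash
# Print the HTTP status code for each health/metrics endpoint listed above
for url in \
  http://192.168.10.9:9090/-/healthy \
  http://192.168.10.9:9100/metrics \
  http://192.168.10.9:8080/healthz \
  http://192.168.10.9:3100/ready; do
  printf '%-40s ' "$url"
  curl -s -o /dev/null -w '%{http_code}\n' "$url"
done
```
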
---

### Step 4: Deploy QNAP Exporters

Install node-exporter and cAdvisor on QNAP so that Prometheus can scrape metrics across machines:

#### Via Container Station (QNAP)

1. Open **Container Station** in the QNAP Web UI
2. Go to **Applications** → **Create**
3. Name the application: `lcbp3-exporters`
4. Paste the contents of [QNAP Node Exporter & cAdvisor](#qnap-node-exporter--cadvisor)
5. Click **Create**

#### Verify from ASUSTOR

```bash
# Check that Prometheus can scrape QNAP
curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"'

# Or open the Prometheus UI → Targets
# URL: http://192.168.10.9:9090/targets
# qnap-node and qnap-cadvisor should show State: UP
```

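If `jq` is available on the ASUSTOR host, the target state can be printed per job instead of grepping raw JSON; a sketch against the same Prometheus API:

```bash
# List every scrape target with its job name and health state
curl -s http://192.168.10.9:9090/api/v1/targets | \
  jq -r '.data.activeTargets[] | "\(.labels.job)\t\(.health)"'
```
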
---

### Step 5: Configure Grafana & Uptime Kuma

#### Grafana — First Login

1. Open `https://grafana.np-dms.work`
2. Log in: `admin` / `Center#2025` (or the password you set)
3. Go to **Connections** → **Data sources** → **Add data source**
4. Select **Prometheus**
   - URL: `http://prometheus:9090`
   - Click **Save & Test** → it must show ✅
5. Import dashboards (see [6. Grafana Dashboards Setup](#6-grafana-dashboards-setup) for details); a file-based provisioning alternative is sketched below

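As an alternative to adding the data source by hand, Grafana can pick it up from a provisioning file on startup; a sketch assuming the Grafana container mounts a host folder such as `/volume1/np-dms/monitoring/grafana/provisioning` at `/etc/grafana/provisioning` (adjust to the actual volume mapping in the compose file):

```bash
# Provision the Prometheus data source from a file instead of the UI
cat > /volume1/np-dms/monitoring/grafana/provisioning/datasources/prometheus.yml << 'EOF'
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
EOF

docker restart grafana
```
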

#### Uptime Kuma — First Setup

1. Open `https://uptime.np-dms.work`
2. Create an Admin account
3. Add monitors according to the [Uptime Kuma Monitors table](#uptime-kuma-monitors)

---

### 6. Grafana Dashboards Setup

For complete monitoring, importing the following dashboards is recommended:

#### 6.1 Host Monitoring (Node Exporter)
* **Concept:** view host-level resources (CPU, RAM, Disk, Network)
* **Dashboard ID:** `1860` (Node Exporter Full)
* **How to import:**
  1. Go to **Dashboards** → **New** → **Import**
  2. In the **Import via grafana.com** field, enter `1860` and click **Load**
  3. Select Data source: **Prometheus**
  4. Click **Import**

#### 6.2 Container Monitoring (cAdvisor)
* **Concept:** view per-container resources (with logs linked in)
* **Dashboard ID:** `14282` (Cadvisor exporter)
* **How to import** (an API-based alternative is sketched below):
  1. Enter `14282` and click **Load**
  2. Select Data source: **Prometheus**
  3. Click **Import**

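Both dashboards can also be imported without the UI via the Grafana HTTP API; a sketch assuming `jq` is installed, the admin credentials from Step 5, a data source named `Prometheus`, and that each dashboard exposes a `DS_PROMETHEUS` data-source input (this input name can differ per dashboard):

```bash
# Download dashboards 1860 and 14282 from grafana.com and import them via the API
for id in 1860 14282; do
  curl -s "https://grafana.com/api/dashboards/${id}/revisions/latest/download" -o "/tmp/dash_${id}.json"
  jq -n --slurpfile dash "/tmp/dash_${id}.json" \
    '{dashboard: $dash[0], overwrite: true,
      inputs: [{name: "DS_PROMETHEUS", type: "datasource", pluginId: "prometheus", value: "Prometheus"}]}' | \
    curl -s -X POST -H "Content-Type: application/json" \
      -u admin:'Center#2025' -d @- \
      https://grafana.np-dms.work/api/dashboards/import
  echo
done
```
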
#### 6.3 Logs Monitoring (Loki Integration)
To let the container dashboard also show logs from Loki:

1. Open the **Cadvisor exporter** dashboard you just imported
2. Click **Add visualization** (or edit the dashboard)
3. Select Data source: **Loki**
4. In the query field enter: `{container="$name"}`
   * *(Note: `$name` comes from a variable of dashboard 14282)*
5. Change the visualization type to **Logs**
6. Name the panel **"Container Logs"**
7. Click **Apply** and **Save Dashboard**

You will now see both the **resource-usage graphs** and the **logs** of a container on the same page.

#### 6.4 Integrated Dashboard (Recommended)

A ready-made JSON file that combines metrics and logs is provided:

1. Go to **Dashboards** → **New** → **Import**
2. Drag the file in, or copy its contents from:
   `specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json`
3. Click **Load** and **Import**

## 7.3 Backup / Export Dashboards

Once a dashboard has been tuned to your liking, export it as a JSON file for backup or version control:

1. Open the dashboard you want to back up
2. Click the **Share Dashboard** button (the 🔗 / Share icon at the top left)
3. Select the **Export** tab
4. Enable **Export for sharing externally** (so hardcoded values are removed)
5. Click **Save to file**
6. Store the JSON file at: `specs/08-infrastructure/grafana/dashboards/`

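Dashboards can also be pulled out over the HTTP API for scripted backups; a sketch assuming the same admin credentials, `jq`, and a hypothetical dashboard UID `<dashboard-uid>` (visible in the dashboard URL):

```bash
# Export a dashboard as JSON by its UID
curl -s -u admin:'Center#2025' \
  https://grafana.np-dms.work/api/dashboards/uid/<dashboard-uid> | \
  jq '.dashboard' > specs/08-infrastructure/grafana/dashboards/exported-dashboard.json
```
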
---

> 📝 **Note**: This document is based on Architecture Document **v1.8.0** (Monitoring Stack deployed on ASUSTOR AS5403T)

@@ -1,247 +0,0 @@

# Backup Strategy for LCBP3-DMS

> 📍 **Deploy on:** ASUSTOR AS5403T (Infrastructure Server)
> 🎯 **Backup Target:** QNAP TS-473A (Application & Database)
> 📄 **Version:** v1.8.0

---

## Overview

Pull-based backup: ASUSTOR pulls the data from QNAP.
If QNAP is compromised, an attacker cannot delete the backups stored on ASUSTOR.

```
┌─────────────────────────────────────────────────────────────────┐
│                       BACKUP ARCHITECTURE                       │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  QNAP (Source)                      ASUSTOR (Backup Target)     │
│  192.168.10.8                       192.168.10.9                │
│                                                                  │
│  ┌──────────────┐   SSH/Rsync    ┌──────────────────────┐       │
│  │ MariaDB      │ ─────────────▶ │ /volume1/backup/db/  │       │
│  │ (mysqldump)  │   Daily 2AM    │ (Restic Repository)  │       │
│  └──────────────┘                └──────────────────────┘       │
│                                                                  │
│  ┌──────────────┐                ┌──────────────────────┐       │
│  │ Redis RDB    │ ─────────────▶ │ /volume1/backup/     │       │
│  │ + AOF        │   Daily 3AM    │ redis/               │       │
│  └──────────────┘                └──────────────────────┘       │
│                                                                  │
│  ┌──────────────┐                ┌──────────────────────┐       │
│  │ App Config   │ ─────────────▶ │ /volume1/backup/     │       │
│  │ + Volumes    │   Weekly Sun   │ config/              │       │
│  └──────────────┘                └──────────────────────┘       │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘
```

---

## 1. MariaDB Backup

### 1.1 Daily Database Backup Script

```bash
#!/bin/bash
# File: /volume1/np-dms/scripts/backup-mariadb.sh
# Run on: ASUSTOR (Pull from QNAP)

DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="/volume1/backup/db"
RESTIC_REPO="/volume1/backup/restic-repo"
QNAP_IP="192.168.10.8"
DB_NAME="lcbp3_db"
DB_USER="root"
DB_PASSWORD="${MARIADB_ROOT_PASSWORD}"

echo "🔄 Starting MariaDB backup at $DATE"

# Create backup directory
mkdir -p $BACKUP_DIR

# Remote mysqldump via SSH
ssh admin@$QNAP_IP "docker exec mariadb mysqldump \
  --single-transaction \
  --routines \
  --triggers \
  -u $DB_USER -p$DB_PASSWORD $DB_NAME" > $BACKUP_DIR/lcbp3_$DATE.sql

# Compress
gzip $BACKUP_DIR/lcbp3_$DATE.sql

# Add to Restic repository (see section 5 for the repository setup)
restic -r $RESTIC_REPO backup $BACKUP_DIR/lcbp3_$DATE.sql.gz

# Keep only last 30 days of raw files
find $BACKUP_DIR -name "lcbp3_*.sql.gz" -mtime +30 -delete

echo "✅ MariaDB backup complete: lcbp3_$DATE.sql.gz"
```

### 1.2 Cron Schedule (ASUSTOR)

```cron
# MariaDB daily backup at 2 AM
0 2 * * * /volume1/np-dms/scripts/backup-mariadb.sh >> /var/log/backup-mariadb.log 2>&1
```

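After installing the cron entry, it is worth running the script once by hand and checking the result; a small sketch:

```bash
# Run the backup manually, then verify the newest dump is a valid gzip file
/volume1/np-dms/scripts/backup-mariadb.sh
LATEST=$(ls -t /volume1/backup/db/lcbp3_*.sql.gz | head -1)
gunzip -t "$LATEST" && echo "OK: $LATEST"
```
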
---

## 2. Redis Backup

### 2.1 Redis Backup Script

```bash
#!/bin/bash
# File: /volume1/np-dms/scripts/backup-redis.sh
# Run on: ASUSTOR (Pull from QNAP)

DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="/volume1/backup/redis"
QNAP_IP="192.168.10.8"

echo "🔄 Starting Redis backup at $DATE"

mkdir -p $BACKUP_DIR

# Trigger BGSAVE on QNAP Redis
# (add -a "$REDIS_PASSWORD" to redis-cli if requirepass is enabled)
ssh admin@$QNAP_IP "docker exec cache redis-cli BGSAVE"
sleep 10

# Copy the RDB snapshot and the AOF files
# (Redis 7 keeps the AOF as a directory: appendonlydir/)
scp admin@$QNAP_IP:/share/np-dms/services/cache/data/dump.rdb $BACKUP_DIR/redis_$DATE.rdb
scp -r admin@$QNAP_IP:/share/np-dms/services/cache/data/appendonlydir $BACKUP_DIR/redis_${DATE}_aof

# Compress (relative paths, so the archive unpacks cleanly)
tar -czf $BACKUP_DIR/redis_$DATE.tar.gz -C $BACKUP_DIR \
  redis_$DATE.rdb \
  redis_${DATE}_aof

# Cleanup raw files
rm -rf $BACKUP_DIR/redis_$DATE.rdb $BACKUP_DIR/redis_${DATE}_aof

echo "✅ Redis backup complete: redis_$DATE.tar.gz"
```

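The fixed `sleep 10` is only a guess at how long BGSAVE needs; on larger datasets the save can be polled instead. A sketch, assuming the same `cache` container (add `-a "$REDIS_PASSWORD"` if `requirepass` is enabled):

```bash
# Wait until the background save has finished before copying dump.rdb
ssh admin@192.168.10.8 'docker exec cache redis-cli BGSAVE'
while ssh admin@192.168.10.8 \
  'docker exec cache redis-cli INFO persistence' | grep -q 'rdb_bgsave_in_progress:1'; do
  sleep 2
done
```
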
### 2.2 Cron Schedule

```cron
# Redis daily backup at 3 AM
0 3 * * * /volume1/np-dms/scripts/backup-redis.sh >> /var/log/backup-redis.log 2>&1
```

---

## 3. Application Config Backup

### 3.1 Weekly Config Backup Script

```bash
#!/bin/bash
# File: /volume1/np-dms/scripts/backup-config.sh
# Run on: ASUSTOR (Pull from QNAP)

DATE=$(date +%Y%m%d)
BACKUP_DIR="/volume1/backup/config"
QNAP_IP="192.168.10.8"

echo "🔄 Starting config backup at $DATE"

mkdir -p $BACKUP_DIR

# Sync Docker compose files and configs
rsync -avz --delete \
  --exclude='*/data/*' \
  --exclude='*/logs/*' \
  --exclude='node_modules' \
  admin@$QNAP_IP:/share/np-dms/ \
  $BACKUP_DIR/np-dms_$DATE/

# Compress (relative paths, so the archive unpacks as np-dms_$DATE/)
tar -czf $BACKUP_DIR/config_$DATE.tar.gz -C $BACKUP_DIR np-dms_$DATE

# Cleanup
rm -rf $BACKUP_DIR/np-dms_$DATE

echo "✅ Config backup complete: config_$DATE.tar.gz"
```

### 3.2 Cron Schedule

```cron
# Config weekly backup on Sunday at 4 AM
0 4 * * 0 /volume1/np-dms/scripts/backup-config.sh >> /var/log/backup-config.log 2>&1
```

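A quick sanity check after the first run is to list what actually landed in the newest archive; a small sketch:

```bash
# Show the first entries of the most recent config backup
LATEST_CFG=$(ls -t /volume1/backup/config/config_*.tar.gz | head -1)
tar -tzf "$LATEST_CFG" | head -20
```
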
---

## 4. Retention Policy

| Backup Type | Frequency | Retention | Storage Est. |
| :---------- | :-------- | :-------- | :----------- |
| MariaDB     | Daily     | 30 days   | ~5GB/month   |
| Redis       | Daily     | 7 days    | ~500MB       |
| Config      | Weekly    | 4 weeks   | ~200MB       |
| Restic      | Daily     | 6 months  | Deduplicated |

---

## 5. Restic Repository Setup

```bash
# Set the repository password in the environment (needed for init and every later call)
export RESTIC_PASSWORD="your-secure-backup-password"

# Initialize Restic repository (one-time)
restic init -r /volume1/backup/restic-repo

# Check repository status
restic -r /volume1/backup/restic-repo snapshots

# Prune old snapshots (keep 30 daily, 4 weekly, 6 monthly)
restic -r /volume1/backup/restic-repo forget \
  --keep-daily 30 \
  --keep-weekly 4 \
  --keep-monthly 6 \
  --prune
```

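Snapshots are only useful if they restore; a minimal restore test, assuming `RESTIC_PASSWORD` is exported as above (restic recreates the original absolute paths under the target directory):

```bash
# Restore the latest snapshot into a scratch directory and test the dumps
restic -r /volume1/backup/restic-repo restore latest --target /tmp/restic-check
gunzip -t /tmp/restic-check/volume1/backup/db/lcbp3_*.sql.gz && echo "restore OK"
rm -rf /tmp/restic-check
```
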
---

## 6. Verification Script

```bash
#!/bin/bash
# File: /volume1/np-dms/scripts/verify-backup.sh

echo "📋 Backup Verification Report"
echo "=============================="
echo ""

# Check latest MariaDB backup
LATEST_DB=$(ls -t /volume1/backup/db/*.sql.gz 2>/dev/null | head -1)
if [ -n "$LATEST_DB" ]; then
  echo "✅ Latest DB backup: $LATEST_DB"
  echo "   Size: $(du -h $LATEST_DB | cut -f1)"
else
  echo "❌ No DB backup found!"
fi

# Check latest Redis backup
LATEST_REDIS=$(ls -t /volume1/backup/redis/*.tar.gz 2>/dev/null | head -1)
if [ -n "$LATEST_REDIS" ]; then
  echo "✅ Latest Redis backup: $LATEST_REDIS"
else
  echo "❌ No Redis backup found!"
fi

# Check Restic repository
echo ""
echo "📦 Restic Snapshots:"
restic -r /volume1/backup/restic-repo snapshots --latest 5
```

---

> 📝 **Note**: This document is based on Architecture Document **v1.8.0**

@@ -1,209 +0,0 @@

# Disaster Recovery Plan for LCBP3-DMS

> 📍 **Version:** v1.8.0
> 🖥️ **Primary Server:** QNAP TS-473A (Application & Database)
> 💾 **Backup Server:** ASUSTOR AS5403T (Infrastructure & Backup)

---

## RTO/RPO Targets

| Scenario                    | RTO     | RPO    | Priority |
| :-------------------------- | :------ | :----- | :------- |
| Single backend node failure | 0 min   | 0      | P0       |
| Redis failure               | 5 min   | 0      | P0       |
| MariaDB failure             | 10 min  | 0      | P0       |
| QNAP total failure          | 2 hours | 15 min | P1       |
| Data corruption             | 4 hours | 1 day  | P2       |

---

## 1. Quick Recovery Procedures

### 1.1 Service Not Responding

```bash
# Check container status
docker ps -a | grep <service-name>

# Restart specific service
docker restart <container-name>

# Check logs for errors
docker logs <container-name> --tail 100
```

### 1.2 Redis Failure

```bash
# Check status
docker exec cache redis-cli ping

# Restart
docker restart cache

# Verify
docker exec cache redis-cli ping
```

### 1.3 MariaDB Failure

```bash
# Check status
docker exec mariadb mysql -u root -p -e "SELECT 1"

# Restart
docker restart mariadb

# Wait for startup
sleep 30

# Verify
docker exec mariadb mysql -u root -p -e "SHOW DATABASES"
```

---

## 2. Full System Recovery

### 2.1 Recovery Prerequisites (ASUSTOR)

Check that the backup files are ready to use:

```bash
# SSH to ASUSTOR
ssh admin@192.168.10.9

# List available backups
ls -la /volume1/backup/db/
ls -la /volume1/backup/redis/
ls -la /volume1/backup/config/

# Check Restic snapshots
restic -r /volume1/backup/restic-repo snapshots
```

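Before running the recovery script in 2.2, a short pre-flight check helps confirm that QNAP is reachable and that the backups are recent; a sketch:

```bash
# Pre-flight: SSH reachability and backup freshness (< 2 days old)
ssh -o ConnectTimeout=5 admin@192.168.10.8 'echo "QNAP reachable"' || echo "❌ QNAP unreachable"
find /volume1/backup/db -name 'lcbp3_*.sql.gz' -mtime -2 | grep -q . \
  && echo "✅ Recent DB backup found" || echo "❌ DB backup older than 2 days"
```
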
### 2.2 QNAP Recovery Script

```bash
#!/bin/bash
# File: /volume1/np-dms/scripts/disaster-recovery.sh
# Run on: ASUSTOR (Push to QNAP)

QNAP_IP="192.168.10.8"
BACKUP_DIR="/volume1/backup"

echo "🚨 Starting Disaster Recovery..."
echo "================================"

# 1. Restore Docker Network
echo "1️⃣ Creating Docker network..."
ssh admin@$QNAP_IP "docker network create lcbp3 || true"

# 2. Restore config files
echo "2️⃣ Restoring configuration files..."
LATEST_CONFIG=$(ls -t $BACKUP_DIR/config/*.tar.gz | head -1)
tar -xzf $LATEST_CONFIG -C /tmp/
# The config archive from backup-config.sh unpacks as np-dms_<DATE>/
rsync -avz /tmp/np-dms_*/ admin@$QNAP_IP:/share/np-dms/

# 3. Start infrastructure services
echo "3️⃣ Starting MariaDB..."
ssh admin@$QNAP_IP "cd /share/np-dms/mariadb && docker-compose up -d"
sleep 30

# 4. Restore database
echo "4️⃣ Restoring database..."
LATEST_DB=$(ls -t $BACKUP_DIR/db/*.sql.gz | head -1)
gunzip -c $LATEST_DB | ssh admin@$QNAP_IP "docker exec -i mariadb mysql -u root -p\$MYSQL_ROOT_PASSWORD lcbp3_db"

# 5. Start Redis
echo "5️⃣ Starting Redis..."
ssh admin@$QNAP_IP "cd /share/np-dms/services && docker-compose up -d cache"

# 6. Restore Redis data (if needed)
echo "6️⃣ Restoring Redis data..."
LATEST_REDIS=$(ls -t $BACKUP_DIR/redis/*.tar.gz | head -1)
tar -xzf $LATEST_REDIS -C /tmp/
scp /tmp/redis_*.rdb admin@$QNAP_IP:/share/np-dms/services/cache/data/dump.rdb
ssh admin@$QNAP_IP "docker restart cache"

# 7. Start remaining services
echo "7️⃣ Starting application services..."
ssh admin@$QNAP_IP "cd /share/np-dms/services && docker-compose up -d"
ssh admin@$QNAP_IP "cd /share/np-dms/npm && docker-compose up -d"

# 8. Health check
echo "8️⃣ Running health checks..."
sleep 60
curl -f https://lcbp3.np-dms.work/health || echo "⚠️ Frontend not ready"
curl -f https://backend.np-dms.work/health || echo "⚠️ Backend not ready"

echo ""
echo "✅ Disaster Recovery Complete"
echo "⚠️ Please verify system functionality manually"
```

---

## 3. Data Corruption Recovery

### 3.1 Point-in-Time Recovery (Database)

```bash
# List available Restic snapshots
restic -r /volume1/backup/restic-repo snapshots

# Restore specific snapshot
restic -r /volume1/backup/restic-repo restore <snapshot-id> --target /tmp/restore/

# Apply restored backup (restic recreates the original absolute path under the target)
gunzip -c /tmp/restore/volume1/backup/db/lcbp3_*.sql.gz | \
  ssh admin@192.168.10.8 "docker exec -i mariadb mysql -u root -p\$MYSQL_ROOT_PASSWORD lcbp3_db"
```

### 3.2 Selective Table Recovery

```bash
# Extract specific tables from backup
gunzip -c /volume1/backup/db/lcbp3_YYYYMMDD.sql.gz | \
  grep -A1000 "CREATE TABLE \`documents\`" | \
  grep -B1000 "UNLOCK TABLES" > /tmp/documents_table.sql

# Restore specific table
ssh admin@192.168.10.8 "docker exec -i mariadb mysql -u root -p\$MYSQL_ROOT_PASSWORD lcbp3_db" < /tmp/documents_table.sql
```

---

## 4. Communication & Escalation

### 4.1 Incident Response

| Severity | Response Time | Notify                         |
| :------- | :------------ | :----------------------------- |
| P0       | Immediate     | Admin Team + Management        |
| P1       | 30 minutes    | Admin Team                     |
| P2       | 2 hours       | Admin Team (next business day) |

### 4.2 Post-Incident Checklist

- [ ] Identify root cause
- [ ] Document timeline of events
- [ ] Verify all services restored
- [ ] Check data integrity
- [ ] Update monitoring alerts if needed
- [ ] Create incident report

---

## 5. Testing Schedule

| Test Type               | Frequency | Last Tested | Next Due |
| :---------------------- | :-------- | :---------- | :------- |
| Backup Verification     | Weekly    | -           | -        |
| Single Service Recovery | Monthly   | -           | -        |
| Full DR Test            | Quarterly | -           | -        |

---

> 📝 **Note**: This document is based on Architecture Document **v1.8.0**