21 KiB
21 KiB
Infrastructure Setup
1. Redis Cluster Configuration
1.1 Docker Compose Setup
# docker-compose-redis.yml
version: '3.8'
services:
redis-1:
image: redis:7-alpine
container_name: lcbp3-redis-1
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf
ports:
- "6379:6379"
- "16379:16379"
volumes:
- redis-1-data:/data
networks:
- lcbp3-network
restart: unless-stopped
redis-2:
image: redis:7-alpine
container_name: lcbp3-redis-2
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf
ports:
- "6380:6379"
- "16380:16379"
volumes:
- redis-2-data:/data
networks:
- lcbp3-network
restart: unless-stopped
redis-3:
image: redis:7-alpine
container_name: lcbp3-redis-3
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf
ports:
- "6381:6379"
- "16381:16379"
volumes:
- redis-3-data:/data
networks:
- lcbp3-network
restart: unless-stopped
volumes:
redis-1-data:
redis-2-data:
redis-3-data:
networks:
lcbp3-network:
external: true
Initialize Cluster
# Start Redis nodes
docker-compose -f docker-compose-redis.yml up -d
# Wait for nodes to start
sleep 10
# Create cluster
docker exec -it lcbp3-redis-1 redis-cli --cluster create \
172.20.0.2:6379 \
172.20.0.3:6379 \
172.20.0.4:6379 \
--cluster-replicas 0
# Verify cluster
docker exec -it lcbp3-redis-1 redis-cli cluster info
docker exec -it lcbp3-redis-1 redis-cli cluster nodes
Health Check Script
#!/bin/bash
# scripts/check-redis-cluster.sh
echo "🔍 Checking Redis Cluster Health..."
for port in 6379 6380 6381; do
echo "\n📍 Node on port $port:"
# Check if node is up
docker exec lcbp3-redis-$(($port - 6378)) redis-cli -p 6379 ping
# Check cluster status
docker exec lcbp3-redis-$(($port - 6378)) redis-cli -p 6379 cluster info | grep cluster_state
# Check memory usage
docker exec lcbp3-redis-$(($port - 6378)) redis-cli -p 6379 info memory | grep used_memory_human
done
echo "\n✅ Cluster check complete"
2. Database Configuration
2.1 MariaDB Optimization for Numbering
-- /etc/mysql/mariadb.conf.d/50-numbering.cnf
[mysqld]
# Connection pool
max_connections = 200
thread_cache_size = 50
# Query cache (disabled for InnoDB)
query_cache_type = 0
query_cache_size = 0
# InnoDB settings
innodb_buffer_pool_size = 4G
innodb_log_file_size = 512M
innodb_flush_log_at_trx_commit = 1
innodb_lock_wait_timeout = 50
# Performance Schema
performance_schema = ON
performance_schema_instrument = 'wait/lock/%=ON'
# Binary logging
log_bin = /var/log/mysql/mysql-bin.log
expire_logs_days = 7
max_binlog_size = 100M
# Slow query log
slow_query_log = 1
slow_query_log_file = /var/log/mysql/slow-query.log
long_query_time = 1
2.2 Monitoring Locks
-- Check for lock contention
SELECT
r.trx_id waiting_trx_id,
r.trx_mysql_thread_id waiting_thread,
r.trx_query waiting_query,
b.trx_id blocking_trx_id,
b.trx_mysql_thread_id blocking_thread,
b.trx_query blocking_query
FROM information_schema.innodb_lock_waits w
INNER JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id
INNER JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id;
-- Check active transactions
SELECT * FROM information_schema.innodb_trx;
-- Kill long-running transaction (if needed)
KILL <thread_id>;
3. Backend Service Configuration
3.1 Backend Service Deployment
Docker Compose
# docker-compose-backend.yml
version: '3.8'
services:
backend-1:
image: lcbp3-backend:latest
container_name: lcbp3-backend-1
environment:
- NODE_ENV=production
- DB_HOST=mariadb-primary
- REDIS_CLUSTER_NODES=redis-1:6379,redis-2:6379,redis-3:6379
- NUMBERING_LOCK_TIMEOUT=5000
- NUMBERING_RESERVATION_TTL=300
ports:
- "3001:3000"
depends_on:
- mariadb-primary
- redis-1
- redis-2
- redis-3
networks:
- lcbp3-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
interval: 30s
timeout: 10s
retries: 3
backend-2:
image: lcbp3-backend:latest
container_name: lcbp3-backend-2
environment:
- NODE_ENV=production
- DB_HOST=mariadb-primary
- REDIS_CLUSTER_NODES=redis-1:6379,redis-2:6379,redis-3:6379
ports:
- "3002:3000"
depends_on:
- mariadb-primary
- redis-1
networks:
- lcbp3-network
restart: unless-stopped
networks:
lcbp3-network:
external: true
Health Check Endpoint
// health/numbering.health.ts
import { Injectable } from '@nestjs/common';
import { HealthIndicator, HealthIndicatorResult } from '@nestjs/terminus';
import { Redis } from 'ioredis';
import { DataSource } from 'typeorm';
@Injectable()
export class NumberingHealthIndicator extends HealthIndicator {
constructor(
private redis: Redis,
private dataSource: DataSource,
) {
super();
}
async isHealthy(key: string): Promise<HealthIndicatorResult> {
const checks = await Promise.all([
this.checkRedis(),
this.checkDatabase(),
this.checkSequenceIntegrity(),
]);
const isHealthy = checks.every((check) => check.status === 'up');
return this.getStatus(key, isHealthy, { checks });
}
private async checkRedis(): Promise<any> {
try {
await this.redis.ping();
return { name: 'redis', status: 'up' };
} catch (error) {
return { name: 'redis', status: 'down', error: error.message };
}
}
private async checkDatabase(): Promise<any> {
try {
await this.dataSource.query('SELECT 1');
return { name: 'database', status: 'up' };
} catch (error) {
return { name: 'database', status: 'down', error: error.message };
}
}
private async checkSequenceIntegrity(): Promise<any> {
try {
const result = await this.dataSource.query(`
SELECT COUNT(*) as count
FROM document_numbering_sequences
WHERE current_value > (
SELECT max_value FROM document_numbering_configs
WHERE id = config_id
)
`);
const hasIssue = result[0].count > 0;
return {
name: 'sequence_integrity',
status: hasIssue ? 'degraded' : 'up',
exceeded_sequences: result[0].count,
};
} catch (error) {
return { name: 'sequence_integrity', status: 'down', error: error.message };
}
}
}
4. Monitoring & Alerting
4.1 Prometheus Configuration
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files:
- "/etc/prometheus/alerts/numbering.yml"
scrape_configs:
- job_name: 'backend'
static_configs:
- targets:
- 'backend-1:3000'
- 'backend-2:3000'
metrics_path: '/metrics'
- job_name: 'redis-numbering'
static_configs:
- targets:
- 'redis-1:6379'
- 'redis-2:6379'
- 'redis-3:6379'
metrics_path: '/metrics'
- job_name: 'mariadb'
static_configs:
- targets:
- 'mariadb-exporter:9104'
4.2 Alert Manager Configuration
# alertmanager.yml
global:
resolve_timeout: 5m
route:
receiver: 'default'
group_by: ['alertname', 'severity']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
routes:
- match:
severity: critical
receiver: 'critical'
continue: true
- match:
severity: warning
receiver: 'warning'
receivers:
- name: 'default'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#lcbp3-alerts'
title: '{{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
- name: 'critical'
email_configs:
- to: 'devops@lcbp3.com'
from: 'alerts@lcbp3.com'
smarthost: 'smtp.gmail.com:587'
auth_username: 'alerts@lcbp3.com'
auth_password: 'your-password'
headers:
Subject: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
pagerduty_configs:
- service_key: 'YOUR_PAGERDUTY_KEY'
- name: 'warning'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
channel: '#lcbp3-warnings'
4.3 Grafana Dashboards
Import Dashboard JSON
# Download dashboard template
curl -o numbering-dashboard.json \
https://raw.githubusercontent.com/lcbp3/grafana-dashboards/main/numbering.json
# Import to Grafana
curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \
-H "Content-Type: application/json" \
-d @numbering-dashboard.json
Key Panels to Monitor
- Numbers Generated per Minute - Rate of number creation
- Sequence Utilization - Current usage vs max (alert >90%)
- Lock Wait Time (p95) - Performance indicator
- Lock Failures - System health indicator
- Redis Cluster Health - Node status
- Database Connection Pool - Resource usage
5. Backup & Recovery
5.1 Database Backup Strategy
Automated Backup Script
#!/bin/bash
# scripts/backup-numbering-db.sh
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="/backups/numbering"
DB_NAME="lcbp3_production"
echo "🔄 Starting backup at $DATE"
# Create backup directory
mkdir -p $BACKUP_DIR
# Backup numbering tables only
docker exec lcbp3-mariadb mysqldump \
--single-transaction \
--routines \
--triggers \
$DB_NAME \
document_numbering_configs \
document_numbering_sequences \
document_numbering_audit_logs \
> $BACKUP_DIR/numbering_$DATE.sql
# Compress backup
gzip $BACKUP_DIR/numbering_$DATE.sql
# Keep only last 30 days
find $BACKUP_DIR -name "numbering_*.sql.gz" -mtime +30 -delete
echo "✅ Backup complete: numbering_$DATE.sql.gz"
Cron Schedule
# Run backup daily at 2 AM
0 2 * * * /opt/lcbp3/scripts/backup-numbering-db.sh >> /var/log/numbering-backup.log 2>&1
# Run integrity check weekly on Sunday at 3 AM
0 3 * * 0 /opt/lcbp3/scripts/check-sequence-integrity.sh >> /var/log/numbering-integrity.log 2>&1
5.2 Redis Backup
Enable RDB Persistence
# redis.conf
save 900 1 # Save if 1 key changed after 900 seconds
save 300 10 # Save if 10 keys changed after 300 seconds
save 60 10000 # Save if 10000 keys changed after 60 seconds
dbfilename dump.rdb
dir /data
# Enable AOF for durability
appendonly yes
appendfilename "appendonly.aof"
appendfsync everysec
Backup Script
#!/bin/bash
# scripts/backup-redis.sh
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="/backups/redis"
mkdir -p $BACKUP_DIR
for i in 1 2 3; do
echo "Backing up redis-$i..."
# Trigger BGSAVE
docker exec lcbp3-redis-$i redis-cli -p 6379 BGSAVE
# Wait for save to complete
sleep 10
# Copy RDB file
docker cp lcbp3-redis-$i:/data/dump.rdb \
$BACKUP_DIR/redis-${i}_${DATE}.rdb
# Copy AOF file
docker cp lcbp3-redis-$i:/data/appendonly.aof \
$BACKUP_DIR/redis-${i}_${DATE}.aof
done
# Compress
tar -czf $BACKUP_DIR/redis_cluster_${DATE}.tar.gz $BACKUP_DIR/*_${DATE}.*
# Cleanup
rm $BACKUP_DIR/*_${DATE}.rdb $BACKUP_DIR/*_${DATE}.aof
echo "✅ Redis backup complete"
5.3 Recovery Procedures
Scenario 1: Restore from Database Backup
#!/bin/bash
# scripts/restore-numbering-db.sh
BACKUP_FILE=$1
if [ -z "$BACKUP_FILE" ]; then
echo "Usage: ./restore-numbering-db.sh <backup_file>"
exit 1
fi
echo "⚠️ WARNING: This will overwrite current numbering data!"
read -p "Continue? (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "Aborted"
exit 0
fi
# Decompress if needed
if [[ $BACKUP_FILE == *.gz ]]; then
gunzip -c $BACKUP_FILE > /tmp/restore.sql
RESTORE_FILE="/tmp/restore.sql"
else
RESTORE_FILE=$BACKUP_FILE
fi
# Restore
docker exec -i lcbp3-mariadb mysql lcbp3_production < $RESTORE_FILE
echo "✅ Restore complete"
echo "🔄 Please verify sequence integrity"
Scenario 2: Redis Node Failure
# Automatically handled by cluster
# Node will rejoin cluster when restarted
# Check cluster status
docker exec lcbp3-redis-1 redis-cli cluster info
# If node is failed, remove and add back
docker exec lcbp3-redis-1 redis-cli --cluster del-node <node-id>
docker exec lcbp3-redis-1 redis-cli --cluster add-node <new-node-ip>:6379 <cluster-ip>:6379
6. Maintenance Procedures
6.1 Sequence Adjustment
Increase Max Value
-- Check current utilization
SELECT
dc.document_type,
ds.current_value,
dc.max_value,
ROUND((ds.current_value * 100.0 / dc.max_value), 2) as utilization
FROM document_numbering_sequences ds
JOIN document_numbering_configs dc ON ds.config_id = dc.id
WHERE ds.current_value > dc.max_value * 0.8;
-- Increase max_value for type approaching limit
UPDATE document_numbering_configs
SET max_value = max_value * 10,
updated_at = CURRENT_TIMESTAMP
WHERE document_type = 'COR'
AND max_value < 9999999;
-- Audit log
INSERT INTO document_numbering_audit_logs (
operation, document_type, old_value, new_value,
user_id, metadata
) VALUES (
'ADJUST_MAX_VALUE', 'COR', '999999', '9999999',
1, '{"reason": "Approaching limit", "automated": false}'
);
Reset Yearly Sequence
-- For document types with yearly reset
-- Run on January 1st
START TRANSACTION;
-- Create new sequence for new year
INSERT INTO document_numbering_sequences (
config_id,
scope_value,
current_value,
last_used_at
)
SELECT
id as config_id,
YEAR(CURDATE()) as scope_value,
0 as current_value,
NULL as last_used_at
FROM document_numbering_configs
WHERE scope = 'YEARLY';
-- Verify
SELECT * FROM document_numbering_sequences
WHERE scope_value = YEAR(CURDATE());
COMMIT;
6.2 Cleanup Old Audit Logs
-- Archive logs older than 2 years
-- Run monthly
START TRANSACTION;
-- Create archive table (if not exists)
CREATE TABLE IF NOT EXISTS document_numbering_audit_logs_archive
LIKE document_numbering_audit_logs;
-- Move old logs to archive
INSERT INTO document_numbering_audit_logs_archive
SELECT * FROM document_numbering_audit_logs
WHERE timestamp < DATE_SUB(CURDATE(), INTERVAL 2 YEAR);
-- Delete from main table
DELETE FROM document_numbering_audit_logs
WHERE timestamp < DATE_SUB(CURDATE(), INTERVAL 2 YEAR);
-- Optimize table
OPTIMIZE TABLE document_numbering_audit_logs;
COMMIT;
-- Export archive to file (optional)
SELECT * FROM document_numbering_audit_logs_archive
INTO OUTFILE '/tmp/audit_archive_2023.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';
6.3 Redis Maintenance
Flush Expired Reservations
#!/bin/bash
# scripts/cleanup-expired-reservations.sh
echo "🧹 Cleaning up expired reservations..."
# Get all reservation keys
KEYS=$(docker exec lcbp3-redis-1 redis-cli --cluster call 172.20.0.2:6379 KEYS "reservation:*" | grep -v "(error)")
COUNT=0
for KEY in $KEYS; do
# Check TTL
TTL=$(docker exec lcbp3-redis-1 redis-cli TTL "$KEY")
if [ "$TTL" -lt 0 ]; then
# Delete expired key
docker exec lcbp3-redis-1 redis-cli DEL "$KEY"
((COUNT++))
fi
done
echo "✅ Cleaned up $COUNT expired reservations"
7. Disaster Recovery
7.1 Total System Failure
Recovery Steps
#!/bin/bash
# scripts/disaster-recovery.sh
echo "🚨 Starting disaster recovery..."
# 1. Start Redis cluster
echo "1️⃣ Starting Redis cluster..."
docker-compose -f docker-compose-redis.yml up -d
sleep 30
# 2. Restore Redis backups
echo "2️⃣ Restoring Redis backups..."
./scripts/restore-redis.sh /backups/redis/latest.tar.gz
# 3. Start database
echo "3️⃣ Starting MariaDB..."
docker-compose -f docker-compose-db.yml up -d
sleep 30
# 4. Restore database
echo "4️⃣ Restoring database..."
./scripts/restore-numbering-db.sh /backups/db/latest.sql.gz
# 5. Verify sequence integrity
echo "5️⃣ Verifying sequence integrity..."
./scripts/check-sequence-integrity.sh
# 6. Start backend services
echo "6️⃣ Starting backend services..."
docker-compose -f docker-compose-backend.yml up -d
# 7. Run health checks
echo "7️⃣ Running health checks..."
sleep 60
for i in {1..5}; do
curl -f http://localhost:3001/health || echo "Backend $i not healthy"
done
echo "✅ Disaster recovery complete"
echo "⚠️ Please verify system functionality manually"
7.2 RTO/RPO Targets
| Scenario | RTO | RPO | Priority |
|---|---|---|---|
| Single backend node failure | 0 min | 0 | P0 |
| Single Redis node failure | 0 min | 0 | P0 |
| Database primary failure | 5 min | 0 | P0 |
| Complete data center failure | 1 hour | 15 min | P1 |
| Data corruption | 4 hours | 1 day | P2 |
8. Runbooks
8.1 High Sequence Utilization (>90%)
Alert: SequenceWarning or SequenceCritical
Steps:
-
Check current utilization
SELECT document_type, current_value, max_value, ROUND((current_value * 100.0 / max_value), 2) as pct FROM document_numbering_sequences s JOIN document_numbering_configs c ON s.config_id = c.id WHERE current_value > max_value * 0.9; -
Assess impact
- How many numbers left?
- Daily usage rate?
- Days until exhaustion?
-
Take action
-- Option A: Increase max_value UPDATE document_numbering_configs SET max_value = max_value * 10 WHERE document_type = 'COR'; -- Option B: Reset sequence (yearly types only) -- Schedule for next year/month -
Notify stakeholders
-
Update monitoring thresholds if needed
8.2 High Lock Wait Time
Alert: HighLockWaitTime
Steps:
-
Check Redis cluster health
docker exec lcbp3-redis-1 redis-cli cluster info docker exec lcbp3-redis-1 redis-cli cluster nodes -
Check database locks
SELECT * FROM information_schema.innodb_lock_waits; SELECT * FROM information_schema.innodb_trx WHERE trx_started < NOW() - INTERVAL 30 SECOND; -
Identify bottleneck
- Redis slow?
- Database slow?
- High concurrent load?
-
Take action based on cause:
- Redis: Add more nodes, check network latency
- Database: Optimize queries, increase connection pool
- High load: Scale horizontally (add backend nodes)
-
Monitor improvements
8.3 Redis Cluster Down
Alert: RedisUnavailable
Steps:
-
Verify all nodes down
for i in {1..3}; do docker exec lcbp3-redis-$i redis-cli ping || echo "Node $i DOWN" done -
Check system falls back to DB-only mode
curl http://localhost:3001/health/numbering # Should show: fallback_mode: true -
Restart Redis cluster
docker-compose -f docker-compose-redis.yml restart sleep 30 ./scripts/check-redis-cluster.sh -
If restart fails, restore from backup
./scripts/restore-redis.sh /backups/redis/latest.tar.gz -
Verify numbering system back to normal
curl http://localhost:3001/health/numbering # Should show: fallback_mode: false -
Review logs for root cause
9. Performance Tuning
9.1 Slow Number Generation
Diagnosis:
-- Check slow queries
SELECT * FROM mysql.slow_log
WHERE sql_text LIKE '%document_numbering%'
ORDER BY query_time DESC
LIMIT 10;
-- Check index usage
EXPLAIN SELECT * FROM document_numbering_sequences
WHERE config_id = 1 AND scope_value = '2025'
FOR UPDATE;
Optimizations:
-- Add missing indexes
CREATE INDEX idx_sequence_lookup
ON document_numbering_sequences(config_id, scope_value);
-- Optimize table
OPTIMIZE TABLE document_numbering_sequences;
-- Update statistics
ANALYZE TABLE document_numbering_sequences;
8.2 Redis Memory Optimization
# Check memory usage
docker exec lcbp3-redis-1 redis-cli INFO memory
# If memory high, check keys
docker exec lcbp3-redis-1 redis-cli --bigkeys
# Set maxmemory policy
docker exec lcbp3-redis-1 redis-cli CONFIG SET maxmemory 2gb
docker exec lcbp3-redis-1 redis-cli CONFIG SET maxmemory-policy allkeys-lru
10. Security Hardening
10.1 Redis Security
# redis.conf
requirepass your-strong-redis-password
bind 0.0.0.0
protected-mode yes
rename-command FLUSHDB ""
rename-command FLUSHALL ""
rename-command CONFIG "CONFIG_abc123"
10.2 Database Security
-- Create dedicated numbering user
CREATE USER 'numbering'@'%' IDENTIFIED BY 'strong-password';
-- Grant minimal permissions
GRANT SELECT, INSERT, UPDATE ON lcbp3_production.document_numbering_* TO 'numbering'@'%';
GRANT SELECT ON lcbp3_production.users TO 'numbering'@'%';
FLUSH PRIVILEGES;
10.3 Network Security
# docker-compose-network.yml
networks:
lcbp3-network:
driver: bridge
ipam:
config:
- subnet: 172.20.0.0/16
driver_opts:
com.docker.network.bridge.name: lcbp3-br
com.docker.network.bridge.enable_icc: "true"
com.docker.network.bridge.enable_ip_masquerade: "true"
11. Compliance & Audit
11.1 Audit Log Retention
-- Export audit logs for compliance
SELECT *
FROM document_numbering