Main: revise specs to 1.5.0 (completed)

2025-12-01 01:28:32 +07:00
parent 241022ada6
commit 71c091055a
69 changed files with 28252 additions and 74 deletions
--- a/specs/04-operations/monitoring-alerting.md
+++ b/specs/04-operations/monitoring-alerting.md
@@ -0,0 +1,443 @@
+# Monitoring & Alerting
+
+**Project:** LCBP3-DMS
+**Version:** 1.5.0
+**Last Updated:** 2025-12-01
+
+---
+
+## 📋 Overview
+
+This document describes monitoring setup, health checks, and alerting rules for LCBP3-DMS.
+
+---
+
+## 🎯 Monitoring Objectives
+
+- **Availability:** System uptime > 99.5%
+- **Performance:** API response time < 500ms (P95)
+- **Reliability:** Error rate < 1%
+- **Capacity:** Resource utilization < 80%
+
+---
+
+## 📊 Key Metrics
+
+### Application Metrics
+
+| Metric                  | Target  | Alert Threshold    |
+| ----------------------- | ------- | ------------------ |
+| API Response Time (P95) | < 500ms | > 1000ms           |
+| Error Rate              | < 1%    | > 5%               |
+| Request Rate            | N/A     | Sudden ±50% change |
+| Active Users            | N/A     | -                  |
+| Queue Length (BullMQ)   | < 100   | > 500              |
+
+### Infrastructure Metrics
+
+| Metric       | Target | Alert Threshold   |
+| ------------ | ------ | ----------------- |
+| CPU Usage    | < 70%  | > 90%             |
+| Memory Usage | < 80%  | > 95%             |
+| Disk Usage   | < 80%  | > 90%             |
+| Network I/O  | N/A    | Anomaly detection |
+
+### Database Metrics
+
+| Metric                | Target  | Alert Threshold |
+| --------------------- | ------- | --------------- |
+| Query Time (P95)      | < 100ms | > 500ms         |
+| Connection Pool Usage | < 80%   | > 95%           |
+| Slow Queries          | 0       | > 10/min        |
+| Replication Lag       | 0s      | > 30s           |
+
+---
+
+## 🔍 Health Checks
+
+### Backend Health Endpoint
+
+```typescript
+// File: backend/src/health/health.controller.ts
+import { Controller, Get } from '@nestjs/common';
+import {
+  HealthCheck,
+  HealthCheckService,
+  TypeOrmHealthIndicator,
+  DiskHealthIndicator,
+} from '@nestjs/terminus';
+
+@Controller('health')
+export class HealthController {
+  constructor(
+    private health: HealthCheckService,
+    private db: TypeOrmHealthIndicator,
+    private disk: DiskHealthIndicator
+  ) {}
+
+  @Get()
+  @HealthCheck()
+  check() {
+    return this.health.check([
+      // Database health
+      () => this.db.pingCheck('database'),
+
+      // Disk health
+      () =>
+        this.disk.checkStorage('storage', {
+          path: '/',
+          thresholdPercent: 0.9,
+        }),
+
+      // Redis health
+      async () => {
+        const redis = await this.redis.ping();
+        return { redis: { status: redis === 'PONG' ? 'up' : 'down' } };
+      },
+    ]);
+  }
+}
+```
+
+### Health Check Response
+
+```json
+{
+  "status": "ok",
+  "info": {
+    "database": {
+      "status": "up"
+    },
+    "storage": {
+      "status": "up",
+      "freePercent": 0.75
+    },
+    "redis": {
+      "status": "up"
+    }
+  },
+  "error": {},
+  "details": {
+    "database": {
+      "status": "up"
+    },
+    "storage": {
+      "status": "up",
+      "freePercent": 0.75
+    },
+    "redis": {
+      "status": "up"
+    }
+  }
+}
+```
+
+---
+
+## 🐳 Docker Container Monitoring
+
+### Health Check in docker-compose.yml
+
+```yaml
+services:
+  backend:
+    healthcheck:
+      test: ['CMD', 'curl', '-f', 'http://localhost:3000/health']
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+
+  mariadb:
+    healthcheck:
+      test: ['CMD', 'mysqladmin', 'ping', '-h', 'localhost']
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  redis:
+    healthcheck:
+      test: ['CMD', 'redis-cli', 'ping']
+      interval: 30s
+      timeout: 10s
+      retries: 3
+```
+
+### Monitor Container Status
+
+```bash
+#!/bin/bash
+# File: /scripts/monitor-containers.sh
+
+# Check all containers are healthy
+CONTAINERS=("lcbp3-backend" "lcbp3-frontend" "lcbp3-mariadb" "lcbp3-redis")
+
+for CONTAINER in "${CONTAINERS[@]}"; do
+  HEALTH=$(docker inspect --format='{{.State.Health.Status}}' $CONTAINER 2>/dev/null)
+
+  if [ "$HEALTH" != "healthy" ]; then
+    echo "ALERT: $CONTAINER is $HEALTH"
+    # Send alert (email, Slack, etc.)
+  fi
+done
+```
+
+---
+
+## 📈 Application Performance Monitoring (APM)
+
+### Log-Based Monitoring (MVP Phase)
+
+```typescript
+// File: backend/src/common/interceptors/performance.interceptor.ts
+import {
+  Injectable,
+  NestInterceptor,
+  ExecutionContext,
+  CallHandler,
+} from '@nestjs/common';
+import { Observable } from 'rxjs';
+import { tap } from 'rxjs/operators';
+import { logger } from 'src/config/logger.config';
+
+@Injectable()
+export class PerformanceInterceptor implements NestInterceptor {
+  intercept(context: ExecutionContext, next: CallHandler): Observable<any> {
+    const request = context.switchToHttp().getRequest();
+    const start = Date.now();
+
+    return next.handle().pipe(
+      tap({
+        next: () => {
+          const duration = Date.now() - start;
+
+          logger.info('Request completed', {
+            method: request.method,
+            url: request.url,
+            statusCode: context.switchToHttp().getResponse().statusCode,
+            duration: `${duration}ms`,
+            userId: request.user?.user_id,
+          });
+
+          // Alert on slow requests
+          if (duration > 1000) {
+            logger.warn('Slow request detected', {
+              method: request.method,
+              url: request.url,
+              duration: `${duration}ms`,
+            });
+          }
+        },
+        error: (error) => {
+          const duration = Date.now() - start;
+
+          logger.error('Request failed', {
+            method: request.method,
+            url: request.url,
+            duration: `${duration}ms`,
+            error: error.message,
+          });
+        },
+      })
+    );
+  }
+}
+```
+
+---
+
+## 🚨 Alerting Rules
+
+### Critical Alerts (Immediate Action Required)
+
+| Alert           | Condition                                   | Action                      |
+| --------------- | ------------------------------------------- | --------------------------- |
+| Service Down    | Health check fails for 3 consecutive checks | Page on-call engineer       |
+| Database Down   | Cannot connect to database                  | Page DBA + on-call engineer |
+| Disk Full       | Disk usage > 95%                            | Page operations team        |
+| High Error Rate | Error rate > 10% for 5 min                  | Page on-call engineer       |
+
+### Warning Alerts (Review Within 1 Hour)
+
+| Alert         | Condition               | Action                 |
+| ------------- | ----------------------- | ---------------------- |
+| High CPU      | CPU > 90% for 10 min    | Notify operations team |
+| High Memory   | Memory > 95% for 10 min | Notify operations team |
+| Slow Queries  | > 50 slow queries/min   | Notify DBA             |
+| Queue Backlog | BullMQ queue > 500 jobs | Notify backend team    |
+
+### Info Alerts (Review During Business Hours)
+
+| Alert              | Condition                            | Action                |
+| ------------------ | ------------------------------------ | --------------------- |
+| Backup Failed      | Daily backup job failed              | Email operations team |
+| SSL Expiring       | SSL certificate expires in < 30 days | Email operations team |
+| Disk Space Warning | Disk usage > 80%                     | Email operations team |
+
+---
+
+## 📧 Alert Notification Channels
+
+### Email Alerts
+
+```bash
+#!/bin/bash
+# File: /scripts/send-alert-email.sh
+
+TO="ops-team@example.com"
+SUBJECT="$1"
+MESSAGE="$2"
+
+echo "$MESSAGE" | mail -s "[LCBP3-DMS] $SUBJECT" "$TO"
+```
+
+### Slack Alerts
+
+```bash
+#!/bin/bash
+# File: /scripts/send-alert-slack.sh
+
+WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
+MESSAGE="$1"
+
+curl -X POST -H 'Content-type: application/json' \
+  --data "{\"text\":\"🚨 LCBP3-DMS Alert: $MESSAGE\"}" \
+  "$WEBHOOK_URL"
+```
+
+---
+
+## 📊 Monitoring Dashboard
+
+### Metrics to Display
+
+**System Overview:**
+
+- Service status (up/down)
+- Overall system health score
+- Active user count
+- Request rate (req/s)
+
+**Performance:**
+
+- API response time (P50, P95, P99)
+- Database query time
+- Queue processing time
+
+**Resources:**
+
+- CPU usage %
+- Memory usage %
+- Disk usage %
+- Network I/O
+
+**Business Metrics:**
+
+- Documents created today
+- Workflows completed today
+- Active correspondences
+- Pending approvals
+
+---
+
+## 🔧 Log Aggregation
+
+### Centralized Logging with Docker
+
+```bash
+# Configure Docker logging driver
+# File: /etc/docker/daemon.json
+{
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "10m",
+    "max-file": "3",
+    "labels": "service,environment"
+  }
+}
+```
+
+### View Aggregated Logs
+
+```bash
+# View all LCBP3 container logs
+docker-compose logs -f --tail=100
+
+# View specific service logs
+docker logs lcbp3-backend -f --since=1h
+
+# Search logs
+docker logs lcbp3-backend 2>&1 | grep "ERROR"
+
+# Export logs for analysis
+docker logs lcbp3-backend > backend-logs.txt
+```
+
+---
+
+## 📈 Performance Baseline
+
+### Establish Baselines
+
+Run load tests to establish performance baselines:
+
+```bash
+# Install Apache Bench
+apt-get install apache2-utils
+
+# Test API endpoint
+ab -n 1000 -c 10 \
+  -H "Authorization: Bearer <TOKEN>" \
+  https://lcbp3-dms.example.com/api/correspondences
+
+# Results to record:
+# - Requests per second
+# - Mean response time
+# - P95 response time
+# - Error rate
+```
+
+### Regular Performance Testing
+
+- **Weekly:** Quick health check (100 requests)
+- **Monthly:** Full load test (10,000 requests)
+- **Quarterly:** Stress test (find breaking point)
+
+---
+
+## ✅ Monitoring Checklist
+
+### Daily
+
+- [ ] Check service health dashboard
+- [ ] Review error logs
+- [ ] Verify backup completion
+- [ ] Check disk space
+
+### Weekly
+
+- [ ] Review performance metrics trends
+- [ ] Analyze slow query log
+- [ ] Check SSL certificate expiry
+- [ ] Review security alerts
+
+### Monthly
+
+- [ ] Capacity planning review
+- [ ] Update monitoring thresholds
+- [ ] Test alert notifications
+- [ ] Review and tune performance
+
+---
+
+## 🔗 Related Documents
+
+- [Backup & Recovery](./backup-recovery.md)
+- [Incident Response](./incident-response.md)
+- [ADR-010: Logging Strategy](../05-decisions/ADR-010-logging-monitoring-strategy.md)
+
+---
+
+**Version:** 1.5.0
+**Last Review:** 2025-12-01
+**Next Review:** 2026-03-01