diff --git a/specs/08-infrastructure/05_monitoring.md b/specs/08-infrastructure/05_monitoring.md index 6054b4d..54722a5 100644 --- a/specs/08-infrastructure/05_monitoring.md +++ b/specs/08-infrastructure/05_monitoring.md @@ -15,6 +15,7 @@ Stack สำหรับ Monitoring ประกอบด้วย: | **cAdvisor** | 8080 (ASUSTOR) / 8088 (QNAP) | เก็บ Metrics ของ Docker containers | Both | | **Uptime Kuma** | 3001 | Service Availability Monitoring | ASUSTOR | | **Loki** | 3100 | Log aggregation | ASUSTOR | +| **Promtail** | - | Log shipper (Sender) | ASUSTOR | --- @@ -31,11 +32,11 @@ Stack สำหรับ Monitoring ประกอบด้วย: │ │ │ │ │ Scrape Metrics │ │ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ │ -│ │node-exporter│ │ cAdvisor │ │ -│ │ :9100 │ │ :8080 │ │ -│ │ (Local) │ │ (Local) │ │ -│ └─────────────┘ └─────────────┘ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │node-exporter│ │ cAdvisor │ │ Promtail │ │ +│ │ :9100 │ │ :8080 │ │ (Log Ship) │ │ +│ │ (Local) │ │ (Local) │ │ (Local) │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ └─────────────────────────────────────────────────────────────────────────┘ │ Remote Scrape ▼ @@ -63,6 +64,7 @@ mkdir -p /volume1/np-dms/monitoring/prometheus/config mkdir -p /volume1/np-dms/monitoring/grafana/data mkdir -p /volume1/np-dms/monitoring/uptime-kuma/data mkdir -p /volume1/np-dms/monitoring/loki/data +mkdir -p /volume1/np-dms/monitoring/promtail/config # กำหนดสิทธิ์ให้ตรงกับ User ID ใน Container # Prometheus (UID 65534 - nobody) @@ -80,6 +82,10 @@ chmod -R 750 /volume1/np-dms/monitoring/uptime-kuma/data # Loki (UID 10001) chown -R 10001:10001 /volume1/np-dms/monitoring/loki/data chmod -R 750 /volume1/np-dms/monitoring/loki/data + +# Promtail (Runs as root to read docker logs - no specific chown needed for config dir if created by admin) +# But ensure config file is readable +chmod -R 755 /volume1/np-dms/monitoring/promtail/config ``` --- @@ -135,7 +141,7 @@ docker network inspect lcbp3 # File: /volume1/np-dms/monitoring/docker-compose.yml # 
DMS Container v1.8.0: Application name: lcbp3-monitoring # Deploy on: ASUSTOR AS5403T -# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki +# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail x-restart: &restart_policy restart: unless-stopped @@ -339,6 +345,31 @@ services: interval: 30s timeout: 10s retries: 3 + + # ---------------------------------------------------------------- + # 7. Promtail (Log Shipper) + # ---------------------------------------------------------------- + promtail: + <<: [*restart_policy, *default_logging] + image: grafana/promtail:2.9.0 + container_name: promtail + user: "0:0" + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + environment: + TZ: "Asia/Bangkok" + command: -config.file=/etc/promtail/promtail-config.yml + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro" + - "/var/run/docker.sock:/var/run/docker.sock:ro" + - "/var/lib/docker/containers:/var/lib/docker/containers:ro" + depends_on: + - loki ``` --- @@ -489,8 +520,8 @@ scrape_configs: | Dashboard ID | Name | Purpose | | :----------- | :--------------------------- | :------------------ | -| 1860 | Node Exporter Full | Host system metrics | -| 14282 | cAdvisor exporter | Container metrics | +| 1860 | Node Exporter Full | Host system metrics | +| 14282 | cAdvisor exporter | Container metrics | | 11074 | Node Exporter for Prometheus | Node overview | | 893 | Docker and Container | Docker overview | | 7362 | MySQL | MySQL view | @@ -513,10 +544,11 @@ scrape_configs: | # | ขั้นตอน | Status | | :--- | :------------------------------------------------------------------------------------------------- | :----- | -| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ☐ | -| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ☐ | -| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ 
[กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ☐ | -| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ☐ | +| 1 | SSH เข้า ASUSTOR ได้ (`ssh admin@192.168.10.9`) | ✅ | +| 2 | Docker Network `lcbp3` สร้างแล้ว (ดูหัวข้อ [สร้าง Docker Network](#-สร้าง-docker-network-ทำครั้งแรกครั้งเดียว)) | ✅ | +| 3 | สร้าง Directories และกำหนดสิทธิ์แล้ว (ดูหัวข้อ [กำหนดสิทธิ](#กำหนดสิทธิ-บน-asustor)) | ✅ | +| 4 | สร้าง `prometheus.yml` แล้ว (ดูหัวข้อ [Prometheus Configuration](#prometheus-configuration)) | ✅ | +| 5 | สร้าง `promtail-config.yml` แล้ว (ดูหัวข้อ [Step 1.2](#step-12-สร้าง-promtail-configyml)) | ✅ | --- @@ -573,6 +605,40 @@ EOF cat /volume1/np-dms/monitoring/prometheus/config/prometheus.yml ``` +### Step 1.2: สร้าง promtail-config.yml + +ต้องสร้าง Config ให้ Promtail อ่าน logs จาก Docker containers และส่งไป Loki: + +```bash +# สร้างไฟล์ promtail-config.yml +cat > /volume1/np-dms/monitoring/promtail/config/promtail-config.yml << 'EOF' +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + - source_labels: ['__meta_docker_container_log_stream'] + target_label: 'stream' +EOF + +# ตรวจสอบ +cat /volume1/np-dms/monitoring/promtail/config/promtail-config.yml +``` + --- ### Step 2: Deploy ผ่าน Portainer (แนะนำ) @@ -612,17 +678,18 @@ docker compose ps # ตรวจสอบ containers ทั้งหมด docker ps --filter "name=prometheus" --filter "name=grafana" \ --filter "name=uptime-kuma" --filter "name=node-exporter" \ - --filter "name=cadvisor" --filter "name=loki" + --filter "name=cadvisor" --filter "name=loki" --filter "name=promtail" ``` -| Service | วิธีตรวจสอบ | Expected Result | -| :---------------- | 
:----------------------------------------------------------------- | :----------------------------- | -| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` | -| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login | -| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup | -| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output | -| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` | -| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` | +| Service | วิธีตรวจสอบ | Expected Result | +| :---------------- | :----------------------------------------------------------------- | :------------------------------------ | +| ✅ **Prometheus** | `curl http://192.168.10.9:9090/-/healthy` | `Prometheus Server is Healthy` | +| ✅ **Grafana** | เปิด `https://grafana.np-dms.work` (หรือ `http://192.168.10.9:3000`) | หน้า Login | +| ✅ **Uptime Kuma** | เปิด `https://uptime.np-dms.work` (หรือ `http://192.168.10.9:3001`) | หน้า Setup | +| ✅ **Node Exp.** | `curl http://192.168.10.9:9100/metrics \| head` | Metrics output | +| ✅ **cAdvisor** | `curl http://192.168.10.9:8080/healthz` | `ok` | +| ✅ **Loki** | `curl http://192.168.10.9:3100/ready` | `ready` | +| ✅ **Promtail** | เช็ค Logs: `docker logs promtail` | ไม่ควรมี Error + เห็น connection success | --- @@ -661,7 +728,7 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"' 4. เลือก **Prometheus** - URL: `http://prometheus:9090` - กด **Save & Test** → ต้องขึ้น ✅ -5. Import Dashboards (ดูหัวข้อ [Grafana Dashboards](#grafana-dashboards)) +5. Import Dashboards (ดูรายละเอียดในหัวข้อ [6. Grafana Dashboards Setup](#6-grafana-dashboards-setup)) #### Uptime Kuma — First Setup @@ -671,5 +738,62 @@ curl -s http://localhost:9090/api/v1/targets | grep -E '"qnap-(node|cadvisor)"' --- +### 6. 
Grafana Dashboards Setup + +เพื่อการ Monitor ที่สมบูรณ์ แนะนำให้ Import Dashboards ต่อไปนี้: + +#### 6.1 Host Monitoring (Node Exporter) +* **Concept:** ดู resource ของเครื่อง Host (CPU, RAM, Disk, Network) +* **Dashboard ID:** `1860` (Node Exporter Full) +* **วิธี Import:** + 1. ไปที่ **Dashboards** → **New** → **Import** + 2. ช่อง **Import via grafana.com** ใส่เลข `1860` กด **Load** + 3. เลือก Data source: **Prometheus** + 4. กด **Import** + +#### 6.2 Container Monitoring (cAdvisor) +* **Concept:** ดู resource ของแต่ละ Container (เชื่อม Logs ด้วย) +* **Dashboard ID:** `14282` (Cadvisor exporter) +* **วิธี Import:** + 1. ใส่เลข `14282` กด **Load** + 2. เลือก Data source: **Prometheus** + 3. กด **Import** + +#### 6.3 Logs Monitoring (Loki Integration) +เพื่อให้ Dashboard ของ Container แสดง Logs จาก Loki ได้ด้วย: + +1. เปิด Dashboard **Cadvisor exporter** ที่เพิ่ง Import มา +2. กดปุ่ม **Add visualization** (หรือ Edit dashboard) +3. เลือก Data source: **Loki** +4. ในช่อง Query ใส่: `{container="$name"}` + * *(Note: `$name` มาจาก Variable ของ Dashboard 14282)* +5. ปรับ Visualization type เป็น **Logs** +6. ตั้งชื่อ Panel ว่า **"Container Logs"** +7. กด **Apply** และ **Save Dashboard** + +ตอนนี้เราจะเห็นทั้ง **กราฟการกินทรัพยากร** และ **Logs** ของ Container นั้นๆ ในหน้าเดียวกันครับ + +#### 6.4 Integrated Dashboard (Recommended) + +ผมได้เตรียม JSON file ที่รวม Metrics และ Logs ไว้ให้แล้วครับ: + +1. ไปที่ **Dashboards** → **New** → **Import** +2. ลากไฟล์ หรือ Copy เนื้อหาจากไฟล์: + `specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json` +3. กด **Load** และ **Import** + +#### 6.5 Backup / Export Dashboards + +เมื่อปรับแต่ง Dashboard จนพอใจแล้ว ควร Export เก็บเป็นไฟล์ JSON ไว้ backup หรือ version control: + +1. เปิด Dashboard ที่ต้องการ backup +2. ไปที่ปุ่ม **Share Dashboard** (ไอคอน 🔗 หรือ Share มุมซ้ายบน) +3. เลือก Tab **Export** +4. เปิดตัวเลือก **Export for sharing externally** (เพื่อให้ลบ hardcoded value) +5. กด **Save to file** +6. 
นำไฟล์ JSON มาเก็บไว้ที่ path: `specs/08-infrastructure/grafana/dashboards/` + +--- + > 📝 **หมายเหตุ**: เอกสารนี้อ้างอิงจาก Architecture Document **v1.8.0** - Monitoring Stack deploy บน ASUSTOR AS5403T diff --git a/specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json b/specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json new file mode 100644 index 0000000..7c45cac --- /dev/null +++ b/specs/08-infrastructure/grafana/dashboards/lcbp3-docker-monitoring.json @@ -0,0 +1,810 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 18, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 18, + "panels": [], + "title": "Logs", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "a78950eb-fe7b-48c5-bbb5-7ef22a250c29" + }, + "gridPos": { + "h": 41, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "a78950eb-fe7b-48c5-bbb5-7ef22a250c29" + }, + "editorMode": "code", + "expr": "{container=~\"$container\"} |= ``", + "queryType": "range", + "refId": "A" + } + ], + "title": "Container Logs", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 16, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "editorMode": "code", + "expr": "rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"$container\",instance=~\"$host\"}[5m]) * 100", + "legendFormat": "{{name}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage (%)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 22, + "panels": [], + "title": "Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "hiddenSeries": false, + "id": 23, + 
"legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)", + "hide": false, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:606", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:607", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 52 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)", + "hide": false, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Cached", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:606", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:607", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 21, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 61 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "expr": 
"sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)", + "hide": false, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Received Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:674", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:675", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 61 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)", + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Sent Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:832", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:833", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 69 + }, + "id": 19, + "panels": [], + "title": "Misc", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "id" + }, + "properties": [ + { + "id": "custom.width", + "value": 260 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "unit", + "value": "d" + }, + { + "id": "decimals", + "value": 1 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 20, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + 
"title": "Containers Info", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "container_label_com_docker_compose_project", + "container_label_com_docker_compose_project_working_dir", + "image", + "instance", + "name", + "Value", + "container_label_com_docker_compose_service" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Running", + "container_label_com_docker_compose_project": "Label", + "container_label_com_docker_compose_project_working_dir": "Working dir", + "container_label_com_docker_compose_service": "Service", + "image": "Registry Image", + "instance": "Instance", + "name": "Name" + } + } + } + ], + "type": "table" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "tags": [ + "docker", + "monitoring", + "lcbp3" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "definition": "label_values(container_cpu_usage_seconds_total{image!=\"\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": false, + "name": "host", + "options": [], + "query": { + "query": "label_values(container_cpu_usage_seconds_total{image!=\"\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/(.*)/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "bd6668d2-9dc4-40a7-8cd2-8b8c9719b1fb" + }, + "definition": "label_values(container_cpu_usage_seconds_total{instance=~\"$host\", image!=\"\"}, name)", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": true, + "name": "container", + "options": [], + "query": { + "query": 
"label_values(container_cpu_usage_seconds_total{instance=~\"$host\", image!=\"\"}, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/^\\/?(.*)$/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Data Center Overview (Loki + Prometheus)", + "uid": "lcbp3-docker-metrics-logs", + "version": 5, + "weekStart": "" +} \ No newline at end of file diff --git a/specs/08-infrastructure/lcbp3-monitoring.yml b/specs/08-infrastructure/lcbp3-monitoring.yml new file mode 100644 index 0000000..75719f0 --- /dev/null +++ b/specs/08-infrastructure/lcbp3-monitoring.yml @@ -0,0 +1,232 @@ +# File: /volume1/np-dms/monitoring/docker-compose.yml +# DMS Container v1.8.0: Application name: lcbp3-monitoring +# Deploy on: ASUSTOR AS5403T +# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail + +x-restart: &restart_policy + restart: unless-stopped + +x-logging: &default_logging + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "5" + +networks: + lcbp3: + external: true + +services: + # ---------------------------------------------------------------- + # 1. 
Prometheus (Metrics Collection & Storage) + # ---------------------------------------------------------------- + prometheus: + <<: [*restart_policy, *default_logging] + image: prom/prometheus:v2.48.0 + container_name: prometheus + stdin_open: true + tty: true + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + environment: + TZ: "Asia/Bangkok" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' + ports: + - "9090:9090" + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro" + - "/volume1/np-dms/monitoring/prometheus/data:/prometheus" + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 2. Grafana (Dashboard & Visualization) + # ---------------------------------------------------------------- + grafana: + <<: [*restart_policy, *default_logging] + image: grafana/grafana:10.2.2 + container_name: grafana + stdin_open: true + tty: true + deploy: + resources: + limits: + cpus: "1.0" + memory: 512M + reservations: + cpus: "0.25" + memory: 128M + environment: + TZ: "Asia/Bangkok" + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: "Center#2025" + GF_SERVER_ROOT_URL: "https://grafana.np-dms.work" + GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-piechart-panel + ports: + - "3000:3000" + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana" + depends_on: + - prometheus + healthcheck: + test: ["CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 3. 
Uptime Kuma (Service Availability Monitoring) + # ---------------------------------------------------------------- + uptime-kuma: + <<: [*restart_policy, *default_logging] + image: louislam/uptime-kuma:1 + container_name: uptime-kuma + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + environment: + TZ: "Asia/Bangkok" + ports: + - "3001:3001" + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/uptime-kuma/data:/app/data" + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:3001/api/entry-page || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 4. Node Exporter (Host Metrics - ASUSTOR) + # ---------------------------------------------------------------- + node-exporter: + <<: [*restart_policy, *default_logging] + image: prom/node-exporter:v1.7.0 + container_name: node-exporter + deploy: + resources: + limits: + cpus: "0.5" + memory: 128M + environment: + TZ: "Asia/Bangkok" + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + ports: + - "9100:9100" + networks: + - lcbp3 + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 5. 
cAdvisor (Container Metrics - ASUSTOR) + # ---------------------------------------------------------------- + cadvisor: + <<: [*restart_policy, *default_logging] + image: gcr.io/cadvisor/cadvisor:v0.47.2 + container_name: cadvisor + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + environment: + TZ: "Asia/Bangkok" + ports: + - "8080:8080" + networks: + - lcbp3 + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 6. Loki (Log Aggregation) + # ---------------------------------------------------------------- + loki: + <<: [*restart_policy, *default_logging] + image: grafana/loki:2.9.0 + container_name: loki + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + environment: + TZ: "Asia/Bangkok" + command: -config.file=/etc/loki/local-config.yaml + ports: + - "3100:3100" + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/loki/data:/loki" + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"] + interval: 30s + timeout: 10s + retries: 3 + + # ---------------------------------------------------------------- + # 7. Promtail (Log Shipper) + # ---------------------------------------------------------------- + promtail: + <<: [*restart_policy, *default_logging] + image: grafana/promtail:2.9.0 + container_name: promtail + user: "0:0" + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + environment: + TZ: "Asia/Bangkok" + command: -config.file=/etc/promtail/promtail-config.yml + networks: + - lcbp3 + volumes: + - "/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro" + - "/var/run/docker.sock:/var/run/docker.sock:ro" + - "/var/lib/docker/containers:/var/lib/docker/containers:ro" + depends_on: + - loki \ No newline at end of file