690602:1334 ADR-033-233 #03
This commit is contained in:
@@ -0,0 +1,240 @@
|
||||
# File: /volume1/np-dms/monitoring/docker-compose.yml
|
||||
# DMS Container v1.8.6: Application name: lcbp3-monitoring
|
||||
# Deploy on: ASUSTOR AS5403T
|
||||
# Services: prometheus, grafana, node-exporter, cadvisor, uptime-kuma, loki, promtail
|
||||
|
||||
# Shared fragment: restart policy pulled into each service through `<<:`.
x-restart: &restart_policy
  restart: 'unless-stopped'

# Shared fragment: json-file log driver with max-size 10m and max-file 5.
x-logging: &default_logging
  logging:
    driver: json-file
    options: {max-size: '10m', max-file: '5'}

# Compose project name.
name: 'lcbp3-monitoring'

# The lcbp3 network is declared external, so Compose expects it to exist
# already and does not create it.
networks:
  lcbp3: {external: true}

services:
|
||||
# ----------------------------------------------------------------
|
||||
# 1. Prometheus (Metrics Collection & Storage)
|
||||
# ----------------------------------------------------------------
|
||||
# Prometheus server. Configuration comes from the read-only /etc/prometheus
# bind mount; TSDB data is written to the /prometheus bind mount.
prometheus:
  <<: [*restart_policy, *default_logging]
  image: 'prom/prometheus:v2.48.0'
  container_name: 'prometheus'
  environment:
    TZ: Asia/Bangkok
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--storage.tsdb.retention.time=30d'
    # NOTE(review): this flag turns on the HTTP reload/quit endpoints, and
    # port 9090 is published on the host below. Confirm that exposure is intended.
    - '--web.enable-lifecycle'
  ports: ['9090:9090']
  networks: [lcbp3]
  volumes:
    - '/volume1/np-dms/monitoring/prometheus/config:/etc/prometheus:ro'
    - '/volume1/np-dms/monitoring/prometheus/data:/prometheus'
  deploy:
    resources:
      limits: {cpus: '1.0', memory: 1G}
      reservations: {cpus: '0.25', memory: 256M}
  healthcheck:
    # Probes the /-/healthy endpoint from inside the container.
    test:
      - 'CMD'
      - 'wget'
      - '--spider'
      - '-q'
      - 'http://localhost:9090/-/healthy'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 2. Grafana (Dashboard & Visualization)
|
||||
# ----------------------------------------------------------------
|
||||
# Grafana dashboards. Container port 3000 is published on host port 3003 and
# state is kept in the /var/lib/grafana bind mount.
grafana:
  <<: [*restart_policy, *default_logging]
  image: 'grafana/grafana:10.2.2'
  container_name: 'grafana'
  # Short-form dependency: only start order, no health condition.
  depends_on: [prometheus]
  env_file: ['.env']
  environment:
    TZ: Asia/Bangkok
    GF_SECURITY_ADMIN_USER: 'admin'
    # `:?` makes Compose fail with the given message when the variable is
    # unset or empty, so the stack never starts without a password.
    GF_SECURITY_ADMIN_PASSWORD: '${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD required}'
    GF_SERVER_ROOT_URL: 'https://grafana.np-dms.work'
    GF_INSTALL_PLUGINS: 'grafana-clock-panel,grafana-piechart-panel'
  ports: ['3003:3000']
  networks: [lcbp3]
  volumes:
    - '/volume1/np-dms/monitoring/grafana/data:/var/lib/grafana'
  deploy:
    resources:
      limits: {cpus: '1.0', memory: 512M}
      reservations: {cpus: '0.25', memory: 128M}
  healthcheck:
    # Shell-form probe against Grafana's /api/health endpoint.
    test:
      - 'CMD-SHELL'
      - 'wget --spider -q http://localhost:3000/api/health || exit 1'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 3. Uptime Kuma (Service Availability Monitoring)
|
||||
# ----------------------------------------------------------------
|
||||
# Uptime Kuma availability monitor, published on host port 3001 with its
# data stored in the /app/data bind mount.
uptime-kuma:
  <<: [*restart_policy, *default_logging]
  # NOTE(review): `:1` is a floating major tag, unlike the exact versions
  # pinned on the other services in this file.
  image: 'louislam/uptime-kuma:1'
  container_name: 'uptime-kuma'
  environment:
    TZ: Asia/Bangkok
  ports: ['3001:3001']
  networks: [lcbp3]
  volumes:
    - '/volume1/np-dms/monitoring/uptime-kuma/data:/app/data'
  deploy:
    resources:
      limits: {cpus: '0.5', memory: 256M}
  healthcheck:
    # Shell-form probe; `curl -f` exits non-zero on an HTTP error status.
    test:
      - 'CMD-SHELL'
      - 'curl -f http://localhost:3001/api/entry-page || exit 1'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 4. Node Exporter (Host Metrics - ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
# Node Exporter: host-level metrics for the ASUSTOR NAS. The host's /proc,
# /sys and / are bind-mounted read-only, and the --path.* flags point the
# collectors at those mounts instead of the container's own filesystems.
node-exporter:
  <<: [*restart_policy, *default_logging]
  image: prom/node-exporter:v1.7.0
  container_name: node-exporter
  deploy:
    resources:
      limits:
        cpus: "0.5"
        memory: 128M
  environment:
    TZ: "Asia/Bangkok"
  command:
    - "--path.procfs=/host/proc"
    - "--path.sysfs=/host/sys"
    # Fix: the host root is mounted at /rootfs below, but the exporter was
    # never told about it. Without this flag the /rootfs mount is unused and
    # the filesystem collector stats paths inside the container.
    - "--path.rootfs=/rootfs"
    # `$$` is the Compose escape for a literal `$` in the regex.
    - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
  ports:
    - "9100:9100"
  networks:
    - lcbp3
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  healthcheck:
    # Probes the exporter's own /metrics endpoint from inside the container.
    test: ["CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics"]
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 5. cAdvisor (Container Metrics - ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
# cAdvisor: per-container metrics, reading the host through read-only bind
# mounts of /, /var/run, /sys and /var/lib/docker.
cadvisor:
  <<: [*restart_policy, *default_logging]
  image: 'gcr.io/cadvisor/cadvisor:v0.47.2'
  container_name: 'cadvisor'
  environment:
    TZ: Asia/Bangkok
  # H4: cAdvisor serves on 8080 inside the container; it is published on
  # host port 8088.
  ports: ['8088:8080']
  networks: [lcbp3]
  volumes:
    - '/:/rootfs:ro'
    - '/var/run:/var/run:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker/:/var/lib/docker:ro'
  deploy:
    resources:
      limits: {cpus: '0.5', memory: 256M}
  healthcheck:
    # Uses the in-container port (8080), not the published host port.
    test:
      - 'CMD'
      - 'wget'
      - '--spider'
      - '-q'
      - 'http://localhost:8080/healthz'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 6. Loki (Log Aggregation)
|
||||
# ----------------------------------------------------------------
|
||||
# Loki log store, published on host port 3100 with data kept in the /loki
# bind mount.
loki:
  <<: [*restart_policy, *default_logging]
  image: 'grafana/loki:2.9.0'
  container_name: 'loki'
  environment:
    TZ: Asia/Bangkok
  # No config directory is mounted for this service, so the path below is
  # presumably the config bundled in the image (verify).
  command: '-config.file=/etc/loki/local-config.yaml'
  ports: ['3100:3100']
  networks: [lcbp3]
  volumes:
    - '/volume1/np-dms/monitoring/loki/data:/loki'
  deploy:
    resources:
      limits: {cpus: '0.5', memory: 512M}
  healthcheck:
    # Probes Loki's /ready endpoint from inside the container.
    test:
      - 'CMD'
      - 'wget'
      - '--spider'
      - '-q'
      - 'http://localhost:3100/ready'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 7. Promtail (Log Shipper)
|
||||
# ----------------------------------------------------------------
|
||||
# Promtail log shipper. It publishes no ports and defines no healthcheck.
promtail:
  <<: [*restart_policy, *default_logging]
  image: 'grafana/promtail:2.9.0'
  container_name: 'promtail'
  # L5: runs as root (uid:gid 0:0), presumably so it can read
  # /var/lib/docker/containers; every bind mount below is read-only.
  user: '0:0'
  # Short-form dependency: only start order, no health condition.
  depends_on: [loki]
  environment:
    TZ: Asia/Bangkok
  command: '-config.file=/etc/promtail/promtail-config.yml'
  networks: [lcbp3]
  volumes:
    - '/volume1/np-dms/monitoring/promtail/config:/etc/promtail:ro'
    - '/var/run/docker.sock:/var/run/docker.sock:ro'
    - '/var/lib/docker/containers:/var/lib/docker/containers:ro'
  deploy:
    resources:
      limits: {cpus: '0.5', memory: 256M}
|
||||
+60
@@ -0,0 +1,60 @@
|
||||
# Global cadence: scrape targets and evaluate rules every 15 seconds.
global:
  evaluation_interval: '15s'
  scrape_interval: '15s'

scrape_configs:
|
||||
# Self-scrape: Prometheus collects its own metrics (ASUSTOR).
- job_name: 'prometheus'
  static_configs:
    - targets:
        - 'localhost:9090'
|
||||
|
||||
# ============================================
|
||||
# ASUSTOR Metrics (Local)
|
||||
# ============================================
|
||||
|
||||
# ASUSTOR host metrics: Node Exporter addressed by name, tagged host=asustor.
- job_name: 'asustor-node'
  static_configs:
    - labels:
        host: 'asustor'
      targets:
        - 'node-exporter:9100'
|
||||
|
||||
# ASUSTOR container metrics: cAdvisor addressed by name on port 8080,
# tagged host=asustor.
- job_name: 'asustor-cadvisor'
  static_configs:
    - labels:
        host: 'asustor'
      targets:
        - 'cadvisor:8080'
|
||||
|
||||
# ============================================
|
||||
# QNAP Metrics (Remote - 192.168.10.8)
|
||||
# ============================================
|
||||
|
||||
# QNAP host metrics: remote Node Exporter at 192.168.10.8, tagged host=qnap.
- job_name: 'qnap-node'
  static_configs:
    - labels:
        host: 'qnap'
      targets:
        - '192.168.10.8:9100'
|
||||
|
||||
# QNAP container metrics: remote cAdvisor at 192.168.10.8:8088, tagged
# host=qnap.
- job_name: 'qnap-cadvisor'
  static_configs:
    - labels:
        host: 'qnap'
      targets:
        - '192.168.10.8:8088'
|
||||
|
||||
# NestJS backend on the QNAP: scraped at /metrics on port 3000, tagged
# host=qnap.
- job_name: 'backend'
  metrics_path: '/metrics'
  static_configs:
    - labels:
        host: 'qnap'
      targets:
        - '192.168.10.8:3000'
|
||||
|
||||
# MariaDB exporter on the QNAP (optional), tagged host=qnap.
- job_name: 'mariadb'
  static_configs:
    - labels:
        host: 'qnap'
      targets:
        - '192.168.10.8:9104'
|
||||
+73
-67
@@ -1,92 +1,98 @@
|
||||
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/ASUSTOR/monitoring/prometheus/config/prometheus.yml
|
||||
# Prometheus Configuration — รัน บน ASUSTOR AS5403T (lcbp3-monitoring stack)
|
||||
# Change Log:
|
||||
# - 2026-06-02: Initial config — scrape jobs สำหรับ ASUSTOR local + Desk-5439 remote
|
||||
# - 2026-06-02: Initial config — merge จาก 0.yml (existing) + เพิ่ม ollama-metrics job
|
||||
#
|
||||
# Deploy path: /volume1/np-dms/monitoring/prometheus/config/prometheus.yml
|
||||
# Mount (read-only): docker-compose volume → /etc/prometheus/prometheus.yml
|
||||
#
|
||||
# NOTE: ไฟล์นี้รวม 0.yml (config เดิมบน ASUSTOR) + job ollama-metrics ใหม่
|
||||
# เมื่อ deploy แล้วให้ลบ 0.yml ออก หรือ rename เป็น 0.yml.bak
|
||||
|
||||
global:
|
||||
scrape_interval: 15s # ดึง metrics ทุก 15 วินาที (default)
|
||||
evaluation_interval: 15s # ประเมิน rules ทุก 15 วินาที
|
||||
scrape_timeout: 10s
|
||||
|
||||
# Labels ที่ติดไปกับทุก time series ที่ scrape ได้
|
||||
external_labels:
|
||||
environment: 'production'
|
||||
cluster: 'lcbp3'
|
||||
|
||||
# ─── Alerting (optional — เชื่อม Alertmanager เมื่อต้องการ) ──────────────────
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets: ['alertmanager:9093']
|
||||
|
||||
# ─── Rules (optional) ────────────────────────────────────────────────────────
|
||||
# rule_files:
|
||||
# - /etc/prometheus/rules/*.yml
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# ─── Scrape Jobs ─────────────────────────────────────────────────────────────
|
||||
scrape_configs:
|
||||
# ----------------------------------------------------------------
|
||||
# 1. Prometheus self-monitoring (ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
- job_name: 'prometheus'
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
service: 'prometheus'
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 2. Node Exporter — Host metrics ของ ASUSTOR
|
||||
# ----------------------------------------------------------------
|
||||
- job_name: 'node-exporter-asustor'
|
||||
# ============================================
|
||||
# ASUSTOR Metrics (Local)
|
||||
# ============================================
|
||||
|
||||
# Host metrics from Node Exporter (ASUSTOR)
|
||||
- job_name: "asustor-node"
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
- targets: ["node-exporter:9100"]
|
||||
labels:
|
||||
host: 'asustor'
|
||||
service: 'node-exporter'
|
||||
host: "asustor"
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 3. cAdvisor — Container metrics ของ ASUSTOR
|
||||
# ----------------------------------------------------------------
|
||||
- job_name: 'cadvisor-asustor'
|
||||
# Container metrics from cAdvisor (ASUSTOR)
|
||||
- job_name: "asustor-cadvisor"
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
- targets: ["cadvisor:8080"]
|
||||
labels:
|
||||
host: 'asustor'
|
||||
service: 'cadvisor'
|
||||
host: "asustor"
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 4. ollama-metrics (NorskHelsenett) — Ollama LLM metrics
|
||||
# รัน บน Desk-5439 (192.168.10.100) ตาม ADR-023A
|
||||
# sidecar expose /metrics บน port 9924
|
||||
# ============================================
|
||||
# QNAP Metrics (Remote - 192.168.10.8)
|
||||
# ============================================
|
||||
|
||||
# Host metrics from Node Exporter (QNAP)
|
||||
- job_name: "qnap-node"
|
||||
static_configs:
|
||||
- targets: ["192.168.10.8:9100"]
|
||||
labels:
|
||||
host: "qnap"
|
||||
|
||||
# Container metrics from cAdvisor (QNAP)
|
||||
- job_name: "qnap-cadvisor"
|
||||
static_configs:
|
||||
- targets: ["192.168.10.8:8088"]
|
||||
labels:
|
||||
host: "qnap"
|
||||
|
||||
# Backend NestJS application (QNAP)
|
||||
- job_name: "backend"
|
||||
static_configs:
|
||||
- targets: ["192.168.10.8:3000"]
|
||||
labels:
|
||||
host: "qnap"
|
||||
metrics_path: "/metrics"
|
||||
|
||||
# MariaDB Exporter (optional - QNAP)
|
||||
- job_name: "mariadb"
|
||||
static_configs:
|
||||
- targets: ["192.168.10.8:9104"]
|
||||
labels:
|
||||
host: "qnap"
|
||||
|
||||
# ============================================
|
||||
# Desk-5439 Metrics (Remote - 192.168.10.100)
|
||||
# ============================================
|
||||
|
||||
# ollama-metrics (NorskHelsenett) — Ollama LLM metrics
|
||||
# sidecar รันบน Desk-5439 ตาม ADR-023A, expose /metrics บน port 9924
|
||||
#
|
||||
# Metrics ที่ collect:
|
||||
# ollama_prompt_tokens_total — prompt tokens รวม
|
||||
# ollama_generated_tokens_total — generated tokens รวม
|
||||
# ollama_request_duration_seconds — latency histogram
|
||||
# ollama_time_per_token_seconds — inference speed
|
||||
# ollama_loaded_models — จำนวน model ใน VRAM
|
||||
# ollama_model_loaded — 1/0 per model
|
||||
# ollama_model_ram_mb — VRAM usage (MB) per model
|
||||
# ----------------------------------------------------------------
|
||||
- job_name: 'ollama-metrics'
|
||||
scrape_interval: 30s # Ollama metrics ไม่เปลี่ยนเร็ว — 30s เพียงพอ
|
||||
# Metrics ที่ collect:
|
||||
# ollama_prompt_tokens_total — prompt tokens รวม
|
||||
# ollama_generated_tokens_total — generated tokens รวม
|
||||
# ollama_request_duration_seconds — latency histogram
|
||||
# ollama_time_per_token_seconds — inference speed (tok/s)
|
||||
# ollama_loaded_models — จำนวน model ใน VRAM
|
||||
# ollama_model_loaded — 1/0 per model
|
||||
# ollama_model_ram_mb — VRAM usage (MB) per model
|
||||
- job_name: "ollama-metrics"
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets: ['192.168.10.100:9924']
|
||||
- targets: ["192.168.10.100:9924"]
|
||||
labels:
|
||||
host: 'desk-5439'
|
||||
service: 'ollama'
|
||||
role: 'ai-inference'
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 5. Loki — Log aggregation health (ASUSTOR)
|
||||
# ----------------------------------------------------------------
|
||||
- job_name: 'loki'
|
||||
static_configs:
|
||||
- targets: ['loki:3100']
|
||||
labels:
|
||||
host: 'asustor'
|
||||
service: 'loki'
|
||||
host: "desk-5439"
|
||||
service: "ollama"
|
||||
role: "ai-inference"
|
||||
|
||||
Reference in New Issue
Block a user