feat(ai-runtime): complete ai runtime policy refactor (ADR-035)
CI / CD Pipeline / build (push) Successful in 4m16s
CI / CD Pipeline / deploy (push) Successful in 11m51s

This commit is contained in:
2026-06-12 08:07:15 +07:00
parent 71c5e88181
commit 0227b7b982
63 changed files with 3566 additions and 451 deletions
@@ -0,0 +1,34 @@
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/services/residency_policy.py
# Change Log:
# - 2026-06-11: Initial creation of residency_policy.py for calculating OCR keep_alive value dynamically
import logging
import os
from dataclasses import dataclass
from typing import Optional

from services.vram_monitor import get_vram_headroom
logger = logging.getLogger("ocr-sidecar.residency-policy")
@dataclass
class OcrResidencyDecision:
    """Result of the OCR residency policy for a single evaluation.

    Attributes:
        keep_alive_seconds: keep_alive value to apply to the OCR model, in
            seconds. The policy in this module returns 0 whenever the model
            should not stay resident.
        vram_headroom_mb: Available VRAM (MB) observed when the decision was
            made, or -1.0 when no usable measurement was taken.
        reason: Short machine-readable tag naming the rule that produced the
            decision (for example "query-failed" or "headroom-sufficient").
    """

    keep_alive_seconds: int  # seconds the OCR model may stay loaded
    vram_headroom_mb: float  # MB available; -1.0 means "not measured"
    reason: str  # tag of the rule that fired
def _env_float(name: str, default: float) -> float:
    """Read a float setting from the environment, using *default* if unset or malformed."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return float(raw)
    except ValueError:
        logger.warning("Invalid value %r for %s; falling back to default %s", raw, name, default)
        return default


def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment, using *default* if unset or malformed."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        logger.warning("Invalid value %r for %s; falling back to default %s", raw, name, default)
        return default


def calculate_ocr_residency(active_profile: Optional[str] = None) -> OcrResidencyDecision:
    """Compute the keep_alive for Typhoon OCR from VRAM headroom and the main model's profile.

    Rules are evaluated in order; the first match wins:

    1. ``active_profile`` is "deep-analysis" or "large-context"
       -> keep_alive 0, headroom -1.0, reason "large-context-active".
    2. The VRAM query failed -> keep_alive 0, headroom -1.0, reason "query-failed".
    3. Used VRAM exceeds ``GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB`` (default 7000.0)
       -> keep_alive 0, reason "high-pressure".
    4. Available VRAM is below ``VRAM_HEADROOM_THRESHOLD_MB`` (default 3000.0)
       -> keep_alive 0, reason "high-pressure".
    5. Otherwise -> keep_alive ``OCR_RESIDENCY_WINDOW_SECONDS`` (default 120),
       reason "headroom-sufficient".

    Malformed environment values are logged and replaced by their defaults so a
    configuration typo cannot make the policy raise.

    Args:
        active_profile: Name of the main model's active profile, or None when
            it is unknown or not applicable.

    Returns:
        The residency decision, including the observed headroom and the rule tag.
    """
    # Large-context profiles always evict OCR; no measurement is needed.
    if active_profile in ("deep-analysis", "large-context"):
        return OcrResidencyDecision(0, -1.0, "large-context-active")

    threshold_mb = _env_float("VRAM_HEADROOM_THRESHOLD_MB", 3000.0)
    residency_window = _env_int("OCR_RESIDENCY_WINDOW_SECONDS", 120)
    pressure_threshold = _env_float("GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB", 7000.0)

    headroom = get_vram_headroom()
    if not headroom.query_success:
        # Without a measurement, take the conservative path and do not stay resident.
        return OcrResidencyDecision(0, -1.0, "query-failed")

    # Both pressure checks intentionally report the same reason tag.
    if headroom.used_mb > pressure_threshold:
        return OcrResidencyDecision(0, headroom.available_mb, "high-pressure")
    if headroom.available_mb < threshold_mb:
        return OcrResidencyDecision(0, headroom.available_mb, "high-pressure")

    return OcrResidencyDecision(residency_window, headroom.available_mb, "headroom-sufficient")
@@ -0,0 +1,43 @@
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/services/vram_monitor.py
# Change Log:
# - 2026-06-11: Initial creation of VramMonitor service for Python OCR sidecar to query GPU VRAM headroom from Ollama /api/ps
from dataclasses import dataclass
import os
import httpx
import logging
logger = logging.getLogger("ocr-sidecar.vram-monitor")
@dataclass
class VramHeadroom:
    """Snapshot of GPU VRAM usage as estimated by the VRAM monitor.

    Attributes:
        total_mb: Total VRAM assumed for the GPU, in MB.
        used_mb: VRAM counted as in use, in MB.
        available_mb: VRAM counted as free, in MB.
        query_success: True when the values come from a successful query;
            False when they are placeholder values for a failed query.
    """

    total_mb: float  # assumed GPU capacity (MB)
    used_mb: float  # VRAM in use (MB)
    available_mb: float  # VRAM still free (MB)
    query_success: bool  # False => figures are failure placeholders
def get_vram_headroom() -> VramHeadroom:
    """Fetch VRAM headroom from Ollama ``/api/ps``.

    Sums ``size_vram`` (treated as bytes) over the running models reported by
    Ollama and subtracts it from the total configured in ``GPU_TOTAL_VRAM_MB``
    (default 16384.0). The result feeds the residency and CPU-fallback
    decisions.

    The estimate only covers what Ollama reports; the total is taken from
    configuration rather than probed from the GPU.

    Environment:
        OLLAMA_API_URL: Base URL of the Ollama API
            (default "http://host.docker.internal:11434").
        GPU_TOTAL_VRAM_MB: Total VRAM of the GPU in MB. A malformed value is
            logged and replaced by the default.

    Returns:
        A ``VramHeadroom`` with ``query_success=True`` on success. On any
        failure (non-200 status, network error, unexpected payload) the
        function does not raise; it returns ``used_mb == total_mb``,
        ``available_mb == 0.0`` and ``query_success=False``.
    """
    # Drop trailing slashes so the request path never becomes "//api/ps".
    ollama_url = os.getenv("OLLAMA_API_URL", "http://host.docker.internal:11434").rstrip("/")

    default_total_mb = 16384.0
    raw_total = os.getenv("GPU_TOTAL_VRAM_MB")
    try:
        total_vram_mb = float(raw_total) if raw_total is not None else default_total_mb
    except ValueError:
        # A configuration typo must not make the monitor raise.
        logger.warning(
            "Invalid value %r for GPU_TOTAL_VRAM_MB; falling back to default %s",
            raw_total,
            default_total_mb,
        )
        total_vram_mb = default_total_mb

    # Pessimistic placeholder returned on every failure path: no free VRAM.
    unavailable = VramHeadroom(total_vram_mb, total_vram_mb, 0.0, False)

    try:
        # Ask Ollama for the currently running models.
        with httpx.Client(timeout=3.0) as client:
            response = client.get(f"{ollama_url}/api/ps")
            if response.status_code != 200:
                logger.warning(
                    "Ollama ps endpoint returned status code: %s", response.status_code
                )
                return unavailable

            models = response.json().get("models", [])
            total_used_bytes = sum(model.get("size_vram", 0) for model in models)

            used_mb = float(total_used_bytes) / (1024.0 * 1024.0)
            available_mb = max(0.0, total_vram_mb - used_mb)
            return VramHeadroom(total_vram_mb, used_mb, available_mb, True)
    except Exception as exc:
        # Deliberately broad: any failure must degrade to "no headroom"
        # instead of propagating into the caller.
        logger.warning("Failed to query Ollama VRAM: %s", exc)
        return unavailable