feat(ai-runtime): complete ai runtime policy refactor (ADR-035)

2026-06-12 08:07:15 +07:00
parent 71c5e88181
commit 0227b7b982
63 changed files with 3566 additions and 451 deletions
@@ -1,133 +1,143 @@
-// File: src/modules/ai/services/vram-monitor.service.ts
-// Change Log
-// - 2026-05-30: Initial implementation สำหรับ Typhoon OCR VRAM monitoring (T006, ADR-032)
+// File: backend/src/modules/ai/services/vram-monitor.service.ts
+// Change Log:
+// - 2026-06-11: Initial creation of VramMonitorService to monitor VRAM headroom from Ollama /api/ps
+// - 2026-06-11: เพิ่มการคำนวณ mainModelVramMb ใน getVramHeadroom
+// - 2026-06-11: เพิ่ม getVramStatus และ invalidateCache เพื่อความเข้ากันได้กับส่วนอื่น

 import { Injectable, Logger } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
 import axios from 'axios';
-import { InjectRedis } from '@nestjs-modules/ioredis';
-import Redis from 'ioredis';
+import { VramHeadroom } from '../interfaces/execution-policy.interface';

-/** ข้อมูล VRAM จาก Ollama PS API */
-export interface OllamaModelInfo {
-  name: string;
-  size_vram: number; // bytes
-}
-
-/** ผลลัพธ์ VRAM status */
+/**
+ * ผลลัพธ์ VRAM status สำหรับส่วนบริการภายนอก
+ * ผลลัพธ์นี้มีวัตถุประสงค์เพื่อรักษาความเข้ากันได้ย้อนหลัง (Backward Compatibility)
+ */
 export interface VramStatus {
  totalVramMb: number;
  usedVramMb: number;
  freeVramMb: number;
  loadedModels: string[];
-  hasCapacity: boolean; // true ถ้า free VRAM >= minRequiredMb
+  hasCapacity: boolean;
 }

-/** ผลลัพธ์ภายในจาก Ollama /api/ps */
-interface OllamaProcessStatus {
-  models?: OllamaModelInfo[];
-}
-
-// Redis key สำหรับ cache VRAM status
-const VRAM_STATUS_CACHE_KEY = 'ai:vram:status';
-// TTL 10 วินาที — refresh บ่อยพอสำหรับ real-time monitoring
-const VRAM_STATUS_TTL_SECONDS = 10;
-// VRAM limit สำหรับ RTX 2060 Super (8192 MB)
-const GPU_TOTAL_VRAM_MB = 8192;
-// Threshold: ไม่โหลด model ถ้า usage > 90%
-const VRAM_USAGE_LIMIT_PERCENT = 0.9;
-
-/** บริการตรวจสอบ VRAM GPU ผ่าน Ollama API ตาม ADR-032 */
@Injectable()
 export class VramMonitorService {
  private readonly logger = new Logger(VramMonitorService.name);
  private readonly ollamaUrl: string;
+  private readonly totalVramMb: number;

-  constructor(
-    private readonly configService: ConfigService,
-    @InjectRedis() private readonly redis: Redis
-  ) {
+  constructor(private readonly configService: ConfigService) {
    this.ollamaUrl = this.configService.get<string>(
      'OLLAMA_URL',
-      this.configService.get<string>('AI_HOST_URL', 'http://localhost:11434')
+      this.configService.get<string>(
+        'AI_HOST_URL',
+        'http://192.168.10.100:11434'
+      )
+    );
+    this.totalVramMb = this.configService.get<number>(
+      'GPU_TOTAL_VRAM_MB',
+      16384 // Default to 16GB (RTX 5060 Ti)
    );
  }

  /**
-   * ดึงสถานะ VRAM ปัจจุบันจาก Ollama /api/ps
-   * ใช้ Redis cache TTL 10 วินาทีเพื่อลด overhead
+   * ดึงสถานะ VRAM headroom จาก Ollama /api/ps
+   * ถ้าล้มเหลวจะคืนค่าด้วย safe default (available = 0)
   */
-  async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
-    const cached = await this.redis.get(VRAM_STATUS_CACHE_KEY);
-    if (cached) {
-      const parsed = JSON.parse(cached) as VramStatus;
-      parsed.hasCapacity = parsed.freeVramMb >= minRequiredMb;
-      return parsed;
-    }
-    return this.fetchAndCacheVramStatus(minRequiredMb);
-  }
-
-  /** ตรวจสอบว่า VRAM เพียงพอสำหรับโหลด model ที่ต้องการ */
-  async hasVramCapacity(requiredMb: number): Promise<boolean> {
-    const status = await this.getVramStatus(requiredMb);
-    return status.hasCapacity;
-  }
-
-  /** ดึงข้อมูล VRAM จาก Ollama และ cache ใน Redis */
-  private async fetchAndCacheVramStatus(
-    minRequiredMb: number
-  ): Promise<VramStatus> {
+  async getVramHeadroom(): Promise<VramHeadroom> {
    try {
-      const response = await axios.get<OllamaProcessStatus>(
-        `${this.ollamaUrl}/api/ps`,
-        { timeout: 5000 }
-      );
-      const models = response.data.models ?? [];
-      const loadedModels = models.map((m) => m.name);
-      // คำนวณ VRAM ที่ใช้จาก models ที่โหลดอยู่
-      const usedVramBytes = models.reduce(
-        (sum, m) => sum + (m.size_vram ?? 0),
-        0
-      );
-      const usedVramMb = Math.round(usedVramBytes / 1024 / 1024);
-      // จำกัด VRAM ไม่เกิน limit 90% ของ GPU ทั้งหมด
-      const maxAllowedMb = Math.floor(
-        GPU_TOTAL_VRAM_MB * VRAM_USAGE_LIMIT_PERCENT
-      );
-      const freeVramMb = Math.max(0, maxAllowedMb - usedVramMb);
-      const status: VramStatus = {
-        totalVramMb: GPU_TOTAL_VRAM_MB,
-        usedVramMb,
-        freeVramMb,
-        loadedModels,
-        hasCapacity: freeVramMb >= minRequiredMb,
+      const response = await axios.get<{
+        models?: Array<{
+          name: string;
+          size_vram: number;
+        }>;
+      }>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
+      const models = response.data?.models ?? [];
+      let totalUsedBytes = 0;
+      let mainModelUsedBytes = 0;
+      for (const model of models) {
+        totalUsedBytes += model.size_vram || 0;
+        if (
+          model.name.includes('np-dms-ai') ||
+          model.name.includes('typhoon2.5-np-dms')
+        ) {
+          mainModelUsedBytes += model.size_vram || 0;
+        }
+      }
+      const usedMb = Math.round(totalUsedBytes / (1024 * 1024));
+      const availableMb = Math.max(0, this.totalVramMb - usedMb);
+      const mainModelVramMb = Math.round(mainModelUsedBytes / (1024 * 1024));
+      return {
+        totalMb: this.totalVramMb,
+        usedMb,
+        availableMb,
+        querySuccess: true,
+        mainModelVramMb,
      };
-      await this.redis.setex(
-        VRAM_STATUS_CACHE_KEY,
-        VRAM_STATUS_TTL_SECONDS,
-        JSON.stringify(status)
-      );
-      return status;
    } catch (err: unknown) {
-      const msg = err instanceof Error ? err.message : String(err);
      this.logger.warn(
-        `VRAM status fetch failed: ${msg} — ใช้ค่า resilient fallback`
+        `Failed to query Ollama /api/ps: ${err instanceof Error ? err.message : String(err)}`
      );
      return {
-        totalVramMb: GPU_TOTAL_VRAM_MB,
-        usedVramMb: 0,
-        freeVramMb: GPU_TOTAL_VRAM_MB,
-        loadedModels: [],
-        hasCapacity: true,
+        totalMb: this.totalVramMb,
+        usedMb: this.totalVramMb, // บังคับให้ used = total เพื่อให้ available = 0
+        availableMb: 0,
+        querySuccess: false,
+        mainModelVramMb: 0,
      };
    }
  }

  /**
-   * ล้าง VRAM cache (เรียกหลังจาก model unload ด้วย keep_alive=0)
-   * เพื่อให้ status check ครั้งต่อไปดึงข้อมูลใหม่จาก Ollama
+   * ดึงสถานะ VRAM ปัจจุบันของระบบ
+   * เพื่อความเข้ากันได้ย้อนหลังกับ endpoint vram/status
+   */
+  async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
+    try {
+      const response = await axios.get<{
+        models?: Array<{
+          name: string;
+          size_vram: number;
+        }>;
+      }>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
+      const models = response.data?.models ?? [];
+      const loadedModels = models.map((m) => m.name);
+      const headroom = await this.getVramHeadroom();
+      return {
+        totalVramMb: headroom.totalMb,
+        usedVramMb: headroom.usedMb,
+        freeVramMb: headroom.availableMb,
+        loadedModels,
+        hasCapacity: headroom.availableMb >= minRequiredMb,
+      };
+    } catch (err: unknown) {
+      this.logger.warn(
+        `Failed to get VRAM status: ${err instanceof Error ? err.message : String(err)}`
+      );
+      return {
+        totalVramMb: this.totalVramMb,
+        usedVramMb: this.totalVramMb,
+        freeVramMb: 0,
+        loadedModels: [],
+        hasCapacity: false,
+      };
+    }
+  }
+
+  /**
+   * ตรวจสอบว่า VRAM เพียงพอสำหรับความต้องการโหลดโมเดลหรือไม่
+   */
+  async hasVramCapacity(requiredMb: number): Promise<boolean> {
+    const headroom = await this.getVramHeadroom();
+    return headroom.availableMb >= requiredMb;
+  }
+
+  /**
+   * ล้าง cache VRAM (ไม่มี cache แล้วในระบบใหม่ แต่เก็บไว้เพื่อรองรับการเรียกใช้เดิม)
   */
  async invalidateCache(): Promise<void> {
-    await this.redis.del(VRAM_STATUS_CACHE_KEY);
+    await Promise.resolve();
+    this.logger.log('VRAM cache invalidation requested (no-op in new policy)');
  }
 }