feat(ai-runtime): complete ai runtime policy refactor (ADR-035)

2026-06-12 08:07:15 +07:00
parent 71c5e88181
commit 0227b7b982
63 changed files with 3566 additions and 451 deletions
@@ -0,0 +1,183 @@
+// File: backend/src/modules/ai/services/ai-policy.service.ts
+// Change Log:
+// - 2026-06-11: Initial creation of AiPolicyService for managing execution profiles and policies
+// - 2026-06-11: แก้ไขข้อผิดพลาด TS2367 (เทียบ profile กับ ocr-extract) และลบบรรทัดว่างในฟังก์ชัน getProfileParameters
+
+import { Injectable, Logger } from '@nestjs/common';
+import { InjectRedis } from '@nestjs-modules/ioredis';
+import { InjectRepository } from '@nestjs/typeorm';
+import type Redis from 'ioredis';
+import { Repository } from 'typeorm';
+import { AiExecutionProfile } from '../entities/ai-execution-profile.entity';
+import {
+  ExecutionProfile,
+  InternalJobType,
+  RuntimePolicy,
+  AiJobPayload,
+} from '../interfaces/execution-policy.interface';
+
+@Injectable()
+export class AiPolicyService {
+  private readonly logger = new Logger(AiPolicyService.name);
+  private readonly cachePrefix = 'ai_execution_profiles:'; // Redis key prefix; the profile name is appended to it
+  private readonly cacheTtlSeconds = 60; // expiry handed to redis.set through its 'EX' argument
+
+  private readonly defaultProfiles: Record<ExecutionProfile, RuntimePolicy> = {
+    interactive: {
+      canonicalModel: 'np-dms-ai',
+      temperature: 0.7,
+      topP: 0.9,
+      maxTokens: 2048,
+      numCtx: 4096,
+      repeatPenalty: 1.15,
+      keepAliveSeconds: 300,
+    },
+    standard: {
+      canonicalModel: 'np-dms-ai',
+      temperature: 0.5,
+      topP: 0.8,
+      maxTokens: 4096,
+      numCtx: 8192,
+      repeatPenalty: 1.15,
+      keepAliveSeconds: 600,
+    },
+    quality: {
+      canonicalModel: 'np-dms-ai',
+      temperature: 0.1,
+      topP: 0.95,
+      maxTokens: 8192,
+      numCtx: 8192,
+      repeatPenalty: 1.15,
+      keepAliveSeconds: 600,
+    },
+    'deep-analysis': {
+      canonicalModel: 'np-dms-ai',
+      temperature: 0.3,
+      topP: 0.85,
+      maxTokens: 8192,
+      numCtx: 32768,
+      repeatPenalty: 1.15,
+      keepAliveSeconds: 0, // 0, unlike the other profiles — presumably unloads the model right after use; TODO confirm
+    },
+  };
+
+  constructor(
+    @InjectRepository(AiExecutionProfile)
+    private readonly profileRepo: Repository<AiExecutionProfile>,
+    @InjectRedis() private readonly redis: Redis
+  ) {}
+
+  /**
+   * Maps any model name or Ollama tag to a canonical name: 'np-dms-ocr' when it contains "ocr" (case-insensitive), otherwise 'np-dms-ai'.
+   */
+  getCanonicalModelName(modelName: string): 'np-dms-ai' | 'np-dms-ocr' {
+    const name = modelName.toLowerCase();
+    if (name.includes('ocr') || name.includes('typhoon-np-dms-ocr')) { // NOTE(review): the second test is redundant — that tag already contains 'ocr'
+      return 'np-dms-ocr';
+    }
+    return 'np-dms-ai';
+  }
+
+  /**
+   * Maps an internal job type to its execution profile; 'ocr-extract' and any unlisted job type resolve to 'standard'.
+   */
+  getProfileForJobType(jobType: InternalJobType): ExecutionProfile {
+    switch (jobType) {
+      case 'auto-fill-document':
+      case 'migrate-document':
+        return 'quality';
+      case 'rag-query':
+        return 'standard';
+      case 'intent-classify':
+      case 'tool-suggest':
+        return 'interactive';
+      case 'sandbox-analysis':
+        return 'deep-analysis';
+      case 'ocr-extract':
+      default: // 'ocr-extract' deliberately shares the fallback branch
+        return 'standard';
+    }
+  }
+
+  /**
+   * Resolves the runtime policy for a profile: Redis cache first, then the active DB row (which is then cached), otherwise the built-in default.
+   */
+  async getProfileParameters(
+    profile: ExecutionProfile
+  ): Promise<RuntimePolicy> {
+    const cacheKey = `${this.cachePrefix}${profile}`;
+    try {
+      const cached = await this.redis.get(cacheKey);
+      if (cached) {
+        return JSON.parse(cached) as RuntimePolicy; // trusted as-is: the cached JSON is not shape-validated
+      }
+    } catch (cacheErr) { // a cache read or parse failure is non-fatal; execution falls through to the DB lookup
+      this.logger.warn(
+        `Failed to read execution profile cache: ${cacheErr instanceof Error ? cacheErr.message : String(cacheErr)}`
+      );
+    }
+    try {
+      const dbProfile = await this.profileRepo.findOne({
+        where: { profileName: profile, isActive: true },
+      });
+      if (dbProfile) {
+        const policy: RuntimePolicy = {
+          canonicalModel: 'np-dms-ai', // fixed value; not read from dbProfile
+          temperature: Number(dbProfile.temperature), // Number(): presumably decimal columns arrive as strings — confirm against the entity
+          topP: Number(dbProfile.topP),
+          maxTokens: dbProfile.maxTokens,
+          numCtx: dbProfile.numCtx,
+          repeatPenalty: Number(dbProfile.repeatPenalty),
+          keepAliveSeconds: dbProfile.keepAliveSeconds,
+        };
+        try {
+          await this.redis.set(
+            cacheKey,
+            JSON.stringify(policy),
+            'EX',
+            this.cacheTtlSeconds
+          );
+        } catch (cacheSetErr) { // a failed cache write only logs; the freshly built policy is still returned
+          this.logger.warn(
+            `Failed to write execution profile cache: ${cacheSetErr instanceof Error ? cacheSetErr.message : String(cacheSetErr)}`
+          );
+        }
+        return policy;
+      }
+    } catch (dbErr) { // a DB failure is logged, then the built-in default below is used
+      this.logger.error(
+        `Failed to read execution profile from DB: ${dbErr instanceof Error ? dbErr.message : String(dbErr)}`
+      );
+    }
+    return this.defaultProfiles[profile]; // also reached when no active row exists; this result is not written to the cache
+  }
+
+  /**
+   * Builds the AI job payload, snapshotting the resolved policy parameters at call time (original note: the payload is for a BullMQ job).
+   */
+  async createJobPayload(
+    jobType: InternalJobType,
+    documentPublicId?: string,
+    attachmentPublicId?: string
+  ): Promise<AiJobPayload> {
+    const effectiveProfile = this.getProfileForJobType(jobType);
+    const canonicalModel =
+      jobType === 'ocr-extract' ? 'np-dms-ocr' : 'np-dms-ai'; // chosen from jobType, not from policy.canonicalModel
+    const policy = await this.getProfileParameters(effectiveProfile);
+    return {
+      jobType,
+      documentPublicId,
+      attachmentPublicId,
+      effectiveProfile,
+      canonicalModel,
+      snapshotParams: {
+        temperature: policy.temperature,
+        topP: policy.topP,
+        maxTokens: policy.maxTokens,
+        numCtx: policy.numCtx,
+        repeatPenalty: policy.repeatPenalty,
+        keepAliveSeconds: policy.keepAliveSeconds,
+      },
+    };
+  }
+}
@@ -2,6 +2,7 @@
 // Change Log
 // - 2026-05-15: เพิ่ม EmbeddingService สำหรับ full-document chunked embedding ตาม ADR-023A T021.
 // - 2026-06-05: ปรับปรุงเป็น Hybrid Embedding และเพิ่ม Semantic Chunking ผ่าน typhoon2.5 (T025-T027)
+// - 2026-06-11: US3 - เพิ่มการคืนค่า device (cpu/gpu) จาก embedding

 import { Injectable, Logger } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
@@ -20,6 +21,7 @@ export interface EmbeddingResult {
  success: boolean;
  chunksEmbedded: number;
  error?: string;
+  device?: string;
 }

 /** บริการสร้าง embedding สำหรับ full-document RAG (ADR-023A) */
@@ -75,19 +77,18 @@ export class EmbeddingService {
          error: 'No OCR text provided',
        };
      }
-
-      // 1. แบ่งข้อความออกเป็น Chunk ด้วย Semantic Chunking
      const chunks = await this.semanticChunkTextWithFallback(ocrText);
      this.logger.log(
        `Document ${documentPublicId} split into ${chunks.length} chunks`
      );
-
-      // 2. แปลงแต่ละ chunk เป็น Hybrid Vector และเตรียม points
      const points = [];
+      let usedDevice = 'gpu';
      for (const [idx, chunk] of chunks.entries()) {
        try {
-          // เรียก Sidecar /embed เพื่อแปลงข้อความของ chunk
          const embedResult = await this.ocrService.embedViaSidecar(chunk.text);
+          if (embedResult.device === 'cpu') {
+            usedDevice = 'cpu';
+          }
          points.push({
            id: `${documentPublicId}-${idx}`,
            vector: {
@@ -116,7 +117,6 @@ export class EmbeddingService {
          );
        }
      }
-
      if (points.length === 0) {
        return {
          success: false,
@@ -124,21 +124,19 @@ export class EmbeddingService {
          error: 'All chunks failed to embed',
        };
      }
-
-      // 3. ลบ points เก่าของเอกสาร (เพื่อความ idempotent และรองรับ revision ใหม่)
      await this.qdrantService.deleteByDocumentPublicId(
        projectPublicId,
        documentPublicId
      );
-
-      // 4. บันทึก points ใหม่ลง Qdrant
      await this.qdrantService.upsert(projectPublicId, points);
-
      this.logger.log(
        `Successfully embedded ${points.length} chunks for document ${documentPublicId} in project ${projectPublicId}`
      );
-
-      return { success: true, chunksEmbedded: points.length };
+      return {
+        success: true,
+        chunksEmbedded: points.length,
+        device: usedDevice,
+      };
    } catch (err) {
      const errorMsg = err instanceof Error ? err.message : String(err);
      this.logger.error(
@@ -1,4 +1,4 @@
-// File: src/modules/ai/services/ocr.service.ts
+// File: backend/src/modules/ai/services/ocr.service.ts
 // Change Log
 // - 2026-05-15: เพิ่ม OCR auto-detection service สำหรับ ADR-023A.
 // - 2026-05-25: แก้ไข AggregateError (empty message) จาก axios โดย wrap เป็น Error พร้อม context ที่ชัดเจน.
@@ -11,6 +11,7 @@
 // - 2026-06-01: เปลี่ยน processWithTesseract/processWithTyphoon ให้ส่ง file content ผ่าน multipart ไปยัง /ocr-upload แทนการส่ง path
 // - 2026-06-02: ส่งค่า X-API-Key ใน request headers ไปยัง ocr-sidecar เพื่อความมั่นคงปลอดภัยสูงสุด (ADR-033, Suggestion 2)
 // - 2026-06-04: ADR-034 — เปลี่ยน TYPHOON_ENGINE.engineName เป็น typhoon-np-dms-ocr:latest ตรงกับชื่อโมเดลใน Ollama
+// - 2026-06-11: US2 - คำนวณ OCR residency keep_alive แบบ dynamic ตาม VRAM headroom และ active profile

 import { Injectable, Logger, NotFoundException } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
@@ -29,12 +30,16 @@ import { SystemSetting } from '../entities/system-setting.entity';
 import { AiAuditLog, AiAuditStatus } from '../entities/ai-audit-log.entity';
 import { OcrCacheService } from './ocr-cache.service';
 import { VramMonitorService } from './vram-monitor.service';
+import { AiPolicyService } from './ai-policy.service';
+import { ExecutionProfile } from '../interfaces/execution-policy.interface';
+import { OcrResidencyDecision } from '../interfaces/ocr-residency.interface';

 export interface OcrDetectionInput {
  extractedText?: string;
  extractedChars?: number;
  pdfPath?: string;
  documentPublicId?: string; // เพิ่มเพื่อการทำ audit logs
+  activeProfile?: ExecutionProfile;
 }

 export interface OcrDetectionResult {
@@ -101,6 +106,9 @@ export class OcrService {
  private readonly threshold: number;
  private readonly ocrApiUrl: string;
  private readonly ocrSidecarApiKey: string;
+  private readonly vramHeadroomThresholdMb: number;
+  private readonly ocrResidencyWindowSeconds: number;
+  private readonly mainModelPressureThresholdMb: number;
  constructor(
    private readonly configService: ConfigService,
    @InjectRepository(SystemSetting)
@@ -109,6 +117,7 @@ export class OcrService {
    private readonly auditLogRepo: Repository<AiAuditLog>,
    private readonly ocrCacheService: OcrCacheService,
    private readonly vramMonitorService: VramMonitorService,
+    private readonly aiPolicyService: AiPolicyService,
    @InjectRedis() private readonly redis: Redis
  ) {
    this.threshold = this.configService.get<number>('OCR_CHAR_THRESHOLD', 100);
@@ -120,6 +129,82 @@ export class OcrService {
      'OCR_SIDECAR_API_KEY',
      'lcbp3-dms-ocr-sidecar-secure-token-2026'
    );
+    this.vramHeadroomThresholdMb = this.configService.get<number>(
+      'VRAM_HEADROOM_THRESHOLD_MB',
+      this.configService.get<number>('AI_VRAM_HEADROOM_THRESHOLD_MB', 3000)
+    );
+    this.ocrResidencyWindowSeconds = this.configService.get<number>(
+      'OCR_RESIDENCY_WINDOW_SECONDS',
+      this.configService.get<number>('AI_OCR_RESIDENCY_WINDOW_SECONDS', 120)
+    );
+    this.mainModelPressureThresholdMb = this.configService.get<number>(
+      'GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB',
+      this.configService.get<number>(
+        'AI_GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB',
+        12000
+      )
+    );
+  }
+
+  /**
+   * Decides the OCR keep_alive (seconds) from VRAM headroom and the active profile: 0 on query failure, deep-analysis, or high pressure; otherwise the configured residency window.
+   */
+  async calculateOcrResidency(
+    activeProfile?: ExecutionProfile | null
+  ): Promise<OcrResidencyDecision> {
+    try {
+      const headroom = await this.vramMonitorService.getVramHeadroom();
+      if (!headroom.querySuccess) {
+        return {
+          keepAliveSeconds: 0,
+          vramHeadroomMb: 0,
+          activeProfile: activeProfile ?? null,
+          reason: 'query-failed',
+        };
+      }
+      if (activeProfile === 'deep-analysis') {
+        this.logger.log(`OCR Residency: deep-analysis active, keep_alive = 0`);
+        return {
+          keepAliveSeconds: 0,
+          vramHeadroomMb: headroom.availableMb,
+          activeProfile,
+          reason: 'deep-analysis-active',
+        };
+      }
+      const isHighPressure = // main model above its threshold OR available VRAM below the headroom threshold
+        (headroom.mainModelVramMb ?? 0) > this.mainModelPressureThresholdMb ||
+        headroom.availableMb < this.vramHeadroomThresholdMb;
+      if (isHighPressure) {
+        this.logger.log(
+          `OCR Residency: VRAM pressure is high (main: ${headroom.mainModelVramMb}MB, avail: ${headroom.availableMb}MB), keep_alive = 0`
+        );
+        return {
+          keepAliveSeconds: 0,
+          vramHeadroomMb: headroom.availableMb,
+          activeProfile: activeProfile ?? null,
+          reason: 'high-pressure',
+        };
+      }
+      this.logger.log(
+        `OCR Residency: VRAM headroom sufficient (${headroom.availableMb} MB), keep_alive = ${this.ocrResidencyWindowSeconds}`
+      );
+      return {
+        keepAliveSeconds: this.ocrResidencyWindowSeconds,
+        vramHeadroomMb: headroom.availableMb,
+        activeProfile: activeProfile ?? null,
+        reason: 'headroom-sufficient',
+      };
+    } catch (err: unknown) { // any thrown error yields the same decision as a failed VRAM query
+      this.logger.warn(
+        `Failed to calculate OCR residency: ${err instanceof Error ? err.message : String(err)}`
+      );
+      return {
+        keepAliveSeconds: 0,
+        vramHeadroomMb: 0,
+        activeProfile: activeProfile ?? null,
+        reason: 'query-failed',
+      };
+    }
   }

  /** ดึงรายการ OCR Engines ทั้งหมด พร้อมตรวจสอบตัวที่กำลัง Active */
@@ -311,7 +396,6 @@ export class OcrService {
  ): Promise<OcrDetectionResult> {
    const startTime = Date.now();
    try {
-      // 1. ตรวจสอบ VRAM insufficiency guard
      const hasCapacity = await this.vramMonitorService.hasVramCapacity(
        TYPHOON_OCR_REQUIRED_VRAM_MB
      );
@@ -321,7 +405,8 @@ export class OcrService {
        );
        return this.processWithTesseract(input);
      }
-
+      const residency = await this.calculateOcrResidency(input.activeProfile);
+      const keepAlive = residency.keepAliveSeconds;
      this.logger.debug(`Typhoon OCR processing: ${input.pdfPath}`);
      const fileBuffer = fs.readFileSync(input.pdfPath!);
      const form = new FormData();
@@ -331,6 +416,7 @@ export class OcrService {
        'upload.pdf'
      );
      form.append('engine', 'typhoon-np-dms-ocr');
+      form.append('keep_alive', String(keepAlive));
      const response = await axios.post<OcrSidecarResponse>(
        `${this.ocrApiUrl}/ocr-upload`,
        form,
@@ -339,10 +425,8 @@ export class OcrService {
          headers: { 'X-API-Key': this.ocrSidecarApiKey },
        }
      );
-
      const text = response.data.text ?? '';
      const durationMs = Date.now() - startTime;
-
      await this.writeAuditLog({
        documentPublicId: input.documentPublicId,
        aiModel: 'typhoon-ocr',
@@ -352,7 +436,6 @@ export class OcrService {
        processingTimeMs: durationMs,
        cacheHit: false,
      });
-
      return {
        text,
        ocrUsed: true,
@@ -398,6 +481,7 @@ export class OcrService {
  async embedViaSidecar(text: string): Promise<{
    dense: number[];
    sparse: { indices: number[]; values: number[] };
+    device?: string;
  }> {
    try {
      const response = await axios.post(
@@ -412,6 +496,7 @@ export class OcrService {
      return response.data as {
        dense: number[];
        sparse: { indices: number[]; values: number[] };
+        device?: string;
      };
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
@@ -424,7 +509,7 @@ export class OcrService {
  async rerankViaSidecar(
    query: string,
    chunks: string[]
-  ): Promise<{ scores: number[]; ranked_indices: number[] }> {
+  ): Promise<{ scores: number[]; ranked_indices: number[]; device?: string }> {
    try {
      const response = await axios.post(
        `${this.ocrApiUrl}/rerank`,
@@ -435,7 +520,11 @@ export class OcrService {
          },
        }
      );
-      return response.data as { scores: number[]; ranked_indices: number[] };
+      return response.data as {
+        scores: number[];
+        ranked_indices: number[];
+        device?: string;
+      };
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      this.logger.error(`Failed to rerank via Sidecar: ${msg}`);
@@ -1,133 +1,143 @@
-// File: src/modules/ai/services/vram-monitor.service.ts
-// Change Log
-// - 2026-05-30: Initial implementation สำหรับ Typhoon OCR VRAM monitoring (T006, ADR-032)
+// File: backend/src/modules/ai/services/vram-monitor.service.ts
+// Change Log:
+// - 2026-06-11: Initial creation of VramMonitorService to monitor VRAM headroom from Ollama /api/ps
+// - 2026-06-11: เพิ่มการคำนวณ mainModelVramMb ใน getVramHeadroom
+// - 2026-06-11: เพิ่ม getVramStatus และ invalidateCache เพื่อความเข้ากันได้กับส่วนอื่น

 import { Injectable, Logger } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
 import axios from 'axios';
-import { InjectRedis } from '@nestjs-modules/ioredis';
-import Redis from 'ioredis';
+import { VramHeadroom } from '../interfaces/execution-policy.interface';

-/** ข้อมูล VRAM จาก Ollama PS API */
-export interface OllamaModelInfo {
-  name: string;
-  size_vram: number; // bytes
-}
-
-/** ผลลัพธ์ VRAM status */
+/**
+ * ผลลัพธ์ VRAM status สำหรับส่วนบริการภายนอก
+ * ผลลัพธ์นี้มีวัตถุประสงค์เพื่อรักษาความเข้ากันได้ย้อนหลัง (Backward Compatibility)
+ */
 export interface VramStatus {
  totalVramMb: number;
  usedVramMb: number;
  freeVramMb: number;
  loadedModels: string[];
-  hasCapacity: boolean; // true ถ้า free VRAM >= minRequiredMb
+  hasCapacity: boolean;
 }

-/** ผลลัพธ์ภายในจาก Ollama /api/ps */
-interface OllamaProcessStatus {
-  models?: OllamaModelInfo[];
-}
-
-// Redis key สำหรับ cache VRAM status
-const VRAM_STATUS_CACHE_KEY = 'ai:vram:status';
-// TTL 10 วินาที — refresh บ่อยพอสำหรับ real-time monitoring
-const VRAM_STATUS_TTL_SECONDS = 10;
-// VRAM limit สำหรับ RTX 2060 Super (8192 MB)
-const GPU_TOTAL_VRAM_MB = 8192;
-// Threshold: ไม่โหลด model ถ้า usage > 90%
-const VRAM_USAGE_LIMIT_PERCENT = 0.9;
-
-/** บริการตรวจสอบ VRAM GPU ผ่าน Ollama API ตาม ADR-032 */
@Injectable()
 export class VramMonitorService {
  private readonly logger = new Logger(VramMonitorService.name);
  private readonly ollamaUrl: string;
+  private readonly totalVramMb: number;

-  constructor(
-    private readonly configService: ConfigService,
-    @InjectRedis() private readonly redis: Redis
-  ) {
+  constructor(private readonly configService: ConfigService) {
    this.ollamaUrl = this.configService.get<string>(
      'OLLAMA_URL',
-      this.configService.get<string>('AI_HOST_URL', 'http://localhost:11434')
+      this.configService.get<string>(
+        'AI_HOST_URL',
+        'http://192.168.10.100:11434'
+      )
+    );
+    this.totalVramMb = this.configService.get<number>(
+      'GPU_TOTAL_VRAM_MB',
+      16384 // Default to 16GB (RTX 5060 Ti)
    );
  }

  /**
-   * ดึงสถานะ VRAM ปัจจุบันจาก Ollama /api/ps
-   * ใช้ Redis cache TTL 10 วินาทีเพื่อลด overhead
+   * ดึงสถานะ VRAM headroom จาก Ollama /api/ps
+   * ถ้าล้มเหลวจะคืนค่าด้วย safe default (available = 0)
   */
-  async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
-    const cached = await this.redis.get(VRAM_STATUS_CACHE_KEY);
-    if (cached) {
-      const parsed = JSON.parse(cached) as VramStatus;
-      parsed.hasCapacity = parsed.freeVramMb >= minRequiredMb;
-      return parsed;
-    }
-    return this.fetchAndCacheVramStatus(minRequiredMb);
-  }
-
-  /** ตรวจสอบว่า VRAM เพียงพอสำหรับโหลด model ที่ต้องการ */
-  async hasVramCapacity(requiredMb: number): Promise<boolean> {
-    const status = await this.getVramStatus(requiredMb);
-    return status.hasCapacity;
-  }
-
-  /** ดึงข้อมูล VRAM จาก Ollama และ cache ใน Redis */
-  private async fetchAndCacheVramStatus(
-    minRequiredMb: number
-  ): Promise<VramStatus> {
+  async getVramHeadroom(): Promise<VramHeadroom> {
    try {
-      const response = await axios.get<OllamaProcessStatus>(
-        `${this.ollamaUrl}/api/ps`,
-        { timeout: 5000 }
-      );
-      const models = response.data.models ?? [];
-      const loadedModels = models.map((m) => m.name);
-      // คำนวณ VRAM ที่ใช้จาก models ที่โหลดอยู่
-      const usedVramBytes = models.reduce(
-        (sum, m) => sum + (m.size_vram ?? 0),
-        0
-      );
-      const usedVramMb = Math.round(usedVramBytes / 1024 / 1024);
-      // จำกัด VRAM ไม่เกิน limit 90% ของ GPU ทั้งหมด
-      const maxAllowedMb = Math.floor(
-        GPU_TOTAL_VRAM_MB * VRAM_USAGE_LIMIT_PERCENT
-      );
-      const freeVramMb = Math.max(0, maxAllowedMb - usedVramMb);
-      const status: VramStatus = {
-        totalVramMb: GPU_TOTAL_VRAM_MB,
-        usedVramMb,
-        freeVramMb,
-        loadedModels,
-        hasCapacity: freeVramMb >= minRequiredMb,
+      const response = await axios.get<{
+        models?: Array<{
+          name: string;
+          size_vram: number;
+        }>;
+      }>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
+      const models = response.data?.models ?? [];
+      let totalUsedBytes = 0;
+      let mainModelUsedBytes = 0;
+      for (const model of models) {
+        totalUsedBytes += model.size_vram || 0;
+        if (
+          model.name.includes('np-dms-ai') ||
+          model.name.includes('typhoon2.5-np-dms')
+        ) {
+          mainModelUsedBytes += model.size_vram || 0;
+        }
+      }
+      const usedMb = Math.round(totalUsedBytes / (1024 * 1024));
+      const availableMb = Math.max(0, this.totalVramMb - usedMb);
+      const mainModelVramMb = Math.round(mainModelUsedBytes / (1024 * 1024));
+      return {
+        totalMb: this.totalVramMb,
+        usedMb,
+        availableMb,
+        querySuccess: true,
+        mainModelVramMb,
      };
-      await this.redis.setex(
-        VRAM_STATUS_CACHE_KEY,
-        VRAM_STATUS_TTL_SECONDS,
-        JSON.stringify(status)
-      );
-      return status;
    } catch (err: unknown) {
-      const msg = err instanceof Error ? err.message : String(err);
      this.logger.warn(
-        `VRAM status fetch failed: ${msg} — ใช้ค่า resilient fallback`
+        `Failed to query Ollama /api/ps: ${err instanceof Error ? err.message : String(err)}`
      );
      return {
-        totalVramMb: GPU_TOTAL_VRAM_MB,
-        usedVramMb: 0,
-        freeVramMb: GPU_TOTAL_VRAM_MB,
-        loadedModels: [],
-        hasCapacity: true,
+        totalMb: this.totalVramMb,
+        usedMb: this.totalVramMb, // บังคับให้ used = total เพื่อให้ available = 0
+        availableMb: 0,
+        querySuccess: false,
+        mainModelVramMb: 0,
      };
    }
  }

  /**
-   * ล้าง VRAM cache (เรียกหลังจาก model unload ด้วย keep_alive=0)
-   * เพื่อให้ status check ครั้งต่อไปดึงข้อมูลใหม่จาก Ollama
+   * ดึงสถานะ VRAM ปัจจุบันของระบบ
+   * เพื่อความเข้ากันได้ย้อนหลังกับ endpoint vram/status
+   */
+  async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
+    try {
+      // One /api/ps request feeds both the model list and the usage figures (same snapshot).
+      const response = await axios.get<{
+        models?: Array<{ name: string; size_vram: number }>;
+      }>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
+      const models = response.data?.models ?? [];
+      const usedBytes = models.reduce((sum, m) => sum + (m.size_vram || 0), 0);
+      const usedVramMb = Math.round(usedBytes / (1024 * 1024));
+      // Clamp at 0 in case the reported usage exceeds the configured total.
+      const freeVramMb = Math.max(0, this.totalVramMb - usedVramMb);
+      return {
+        totalVramMb: this.totalVramMb,
+        usedVramMb,
+        freeVramMb,
+        loadedModels: models.map((m) => m.name),
+        hasCapacity: freeVramMb >= minRequiredMb,
+      };
+    } catch (err: unknown) { // fail closed: report a full GPU with no loaded models
+      this.logger.warn(
+        `Failed to get VRAM status: ${err instanceof Error ? err.message : String(err)}`
+      );
+      return {
+        totalVramMb: this.totalVramMb,
+        usedVramMb: this.totalVramMb,
+        freeVramMb: 0,
+        loadedModels: [],
+        hasCapacity: false,
+      };
+    }
+  }
+
+  /**
+   * Returns true when the available VRAM reported by getVramHeadroom is at least requiredMb.
+   */
+  async hasVramCapacity(requiredMb: number): Promise<boolean> {
+    const headroom = await this.getVramHeadroom();
+    return headroom.availableMb >= requiredMb; // availableMb is 0 when the Ollama query failed
+  }
+
+  /**
+   * Clears the VRAM cache (the new implementation has no cache; kept as a no-op so existing callers keep working).
   */
   async invalidateCache(): Promise<void> {
-    await this.redis.del(VRAM_STATUS_CACHE_KEY);
+    await Promise.resolve(); // keeps an await in the async body; nothing is actually invalidated
+    this.logger.log('VRAM cache invalidation requested (no-op in new policy)');
   }
 }