feat(ai-runtime): complete ai runtime policy refactor (ADR-035)
This commit is contained in:
@@ -0,0 +1,183 @@
|
||||
// File: backend/src/modules/ai/services/ai-policy.service.ts
|
||||
// Change Log:
|
||||
// - 2026-06-11: Initial creation of AiPolicyService for managing execution profiles and policies
|
||||
// - 2026-06-11: แก้ไขข้อผิดพลาด TS2367 (เทียบ profile กับ ocr-extract) และลบบรรทัดว่างในฟังก์ชัน getProfileParameters
|
||||
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { InjectRedis } from '@nestjs-modules/ioredis';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import type Redis from 'ioredis';
|
||||
import { Repository } from 'typeorm';
|
||||
import { AiExecutionProfile } from '../entities/ai-execution-profile.entity';
|
||||
import {
|
||||
ExecutionProfile,
|
||||
InternalJobType,
|
||||
RuntimePolicy,
|
||||
AiJobPayload,
|
||||
} from '../interfaces/execution-policy.interface';
|
||||
|
||||
/**
 * Resolves which execution profile an AI job runs under and which runtime
 * parameters (temperature, context size, keep-alive, ...) that profile uses.
 *
 * Parameters are looked up in Redis first, then in the database, and finally
 * fall back to the built-in defaults declared in this class.
 */
@Injectable()
export class AiPolicyService {
  private readonly logger = new Logger(AiPolicyService.name);
  private readonly cachePrefix = 'ai_execution_profiles:';
  private readonly cacheTtlSeconds = 60;

  /** Built-in policies used when neither the cache nor the DB yields a profile. */
  private readonly defaultProfiles: Record<ExecutionProfile, RuntimePolicy> = {
    interactive: {
      canonicalModel: 'np-dms-ai',
      temperature: 0.7,
      topP: 0.9,
      maxTokens: 2048,
      numCtx: 4096,
      repeatPenalty: 1.15,
      keepAliveSeconds: 300,
    },
    standard: {
      canonicalModel: 'np-dms-ai',
      temperature: 0.5,
      topP: 0.8,
      maxTokens: 4096,
      numCtx: 8192,
      repeatPenalty: 1.15,
      keepAliveSeconds: 600,
    },
    quality: {
      canonicalModel: 'np-dms-ai',
      temperature: 0.1,
      topP: 0.95,
      maxTokens: 8192,
      numCtx: 8192,
      repeatPenalty: 1.15,
      keepAliveSeconds: 600,
    },
    'deep-analysis': {
      canonicalModel: 'np-dms-ai',
      temperature: 0.3,
      topP: 0.85,
      maxTokens: 8192,
      numCtx: 32768,
      repeatPenalty: 1.15,
      keepAliveSeconds: 0,
    },
  };

  constructor(
    @InjectRepository(AiExecutionProfile)
    private readonly profileRepo: Repository<AiExecutionProfile>,
    @InjectRedis() private readonly redis: Redis
  ) {}

  /**
   * Maps an Ollama model name or tag to its canonical name.
   *
   * @param modelName Model name or tag; matching is case-insensitive.
   * @returns `'np-dms-ocr'` when the name contains `ocr`, otherwise `'np-dms-ai'`.
   */
  getCanonicalModelName(modelName: string): 'np-dms-ai' | 'np-dms-ocr' {
    // 'typhoon-np-dms-ocr' already contains 'ocr', so one substring test suffices.
    return modelName.toLowerCase().includes('ocr') ? 'np-dms-ocr' : 'np-dms-ai';
  }

  /**
   * Maps an internal job type to the execution profile it runs under.
   *
   * @param jobType Internal job type.
   * @returns The matching profile; `'ocr-extract'` and any unlisted type use `'standard'`.
   */
  getProfileForJobType(jobType: InternalJobType): ExecutionProfile {
    switch (jobType) {
      case 'auto-fill-document':
      case 'migrate-document':
        return 'quality';
      case 'rag-query':
        return 'standard';
      case 'intent-classify':
      case 'tool-suggest':
        return 'interactive';
      case 'sandbox-analysis':
        return 'deep-analysis';
      case 'ocr-extract':
      default:
        return 'standard';
    }
  }

  /**
   * Fetches the runtime parameters for an execution profile.
   *
   * Lookup order: Redis cache, then the active DB row, then the built-in default.
   * Cache and DB failures are logged and never thrown, so the lookup degrades
   * to the next source.
   *
   * @param profile Execution profile to resolve.
   * @returns The runtime policy; the built-in default is returned as a copy so
   *          callers cannot mutate the shared defaults.
   */
  async getProfileParameters(
    profile: ExecutionProfile
  ): Promise<RuntimePolicy> {
    const cacheKey = `${this.cachePrefix}${profile}`;
    const fallback = this.defaultProfiles[profile];

    try {
      const cached = await this.redis.get(cacheKey);
      if (cached) {
        const parsed: unknown = JSON.parse(cached);
        // A cached "null", number or array is not a policy; fall through to the DB
        // instead of handing callers a value they will crash on.
        if (
          parsed !== null &&
          typeof parsed === 'object' &&
          !Array.isArray(parsed)
        ) {
          return parsed as RuntimePolicy;
        }
        this.logger.warn(
          `Ignoring malformed execution profile cache entry: ${cacheKey}`
        );
      }
    } catch (cacheErr) {
      this.logger.warn(
        `Failed to read execution profile cache: ${cacheErr instanceof Error ? cacheErr.message : String(cacheErr)}`
      );
    }

    try {
      const dbProfile = await this.profileRepo.findOne({
        where: { profileName: profile, isActive: true },
      });
      if (dbProfile) {
        const policy: RuntimePolicy = {
          canonicalModel: 'np-dms-ai',
          // These columns are coerced with Number(); a value that does not
          // coerce to a finite number falls back to the built-in default
          // rather than propagating NaN into dispatched jobs.
          temperature: this.toFiniteNumber(
            dbProfile.temperature,
            fallback.temperature
          ),
          topP: this.toFiniteNumber(dbProfile.topP, fallback.topP),
          maxTokens: dbProfile.maxTokens,
          numCtx: dbProfile.numCtx,
          repeatPenalty: this.toFiniteNumber(
            dbProfile.repeatPenalty,
            fallback.repeatPenalty
          ),
          keepAliveSeconds: dbProfile.keepAliveSeconds,
        };

        try {
          await this.redis.set(
            cacheKey,
            JSON.stringify(policy),
            'EX',
            this.cacheTtlSeconds
          );
        } catch (cacheSetErr) {
          // A failed cache write must not fail the lookup itself.
          this.logger.warn(
            `Failed to write execution profile cache: ${cacheSetErr instanceof Error ? cacheSetErr.message : String(cacheSetErr)}`
          );
        }
        return policy;
      }
    } catch (dbErr) {
      this.logger.error(
        `Failed to read execution profile from DB: ${dbErr instanceof Error ? dbErr.message : String(dbErr)}`
      );
    }

    // Copy so that callers mutating the result do not alter the shared defaults.
    return { ...fallback };
  }

  /**
   * Builds the queue job payload, snapshotting the profile parameters as they
   * are at dispatch time.
   *
   * @param jobType Internal job type; also selects the canonical model
   *                (`'np-dms-ocr'` for `'ocr-extract'`, otherwise `'np-dms-ai'`).
   * @param documentPublicId Optional public id of the related document.
   * @param attachmentPublicId Optional public id of the related attachment.
   * @returns The payload including the effective profile and parameter snapshot.
   */
  async createJobPayload(
    jobType: InternalJobType,
    documentPublicId?: string,
    attachmentPublicId?: string
  ): Promise<AiJobPayload> {
    const effectiveProfile = this.getProfileForJobType(jobType);
    const canonicalModel =
      jobType === 'ocr-extract' ? 'np-dms-ocr' : 'np-dms-ai';
    const policy = await this.getProfileParameters(effectiveProfile);

    return {
      jobType,
      documentPublicId,
      attachmentPublicId,
      effectiveProfile,
      canonicalModel,
      snapshotParams: {
        temperature: policy.temperature,
        topP: policy.topP,
        maxTokens: policy.maxTokens,
        numCtx: policy.numCtx,
        repeatPenalty: policy.repeatPenalty,
        keepAliveSeconds: policy.keepAliveSeconds,
      },
    };
  }

  /** Coerces `value` with `Number()`; returns `fallback` when the result is not finite. */
  private toFiniteNumber(value: unknown, fallback: number): number {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }
}
|
||||
@@ -2,6 +2,7 @@
|
||||
// Change Log
|
||||
// - 2026-05-15: เพิ่ม EmbeddingService สำหรับ full-document chunked embedding ตาม ADR-023A T021.
|
||||
// - 2026-06-05: ปรับปรุงเป็น Hybrid Embedding และเพิ่ม Semantic Chunking ผ่าน typhoon2.5 (T025-T027)
|
||||
// - 2026-06-11: US3 - เพิ่มการคืนค่า device (cpu/gpu) จาก embedding
|
||||
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
@@ -20,6 +21,7 @@ export interface EmbeddingResult {
|
||||
success: boolean;
|
||||
chunksEmbedded: number;
|
||||
error?: string;
|
||||
device?: string;
|
||||
}
|
||||
|
||||
/** บริการสร้าง embedding สำหรับ full-document RAG (ADR-023A) */
|
||||
@@ -75,19 +77,18 @@ export class EmbeddingService {
|
||||
error: 'No OCR text provided',
|
||||
};
|
||||
}
|
||||
|
||||
// 1. แบ่งข้อความออกเป็น Chunk ด้วย Semantic Chunking
|
||||
const chunks = await this.semanticChunkTextWithFallback(ocrText);
|
||||
this.logger.log(
|
||||
`Document ${documentPublicId} split into ${chunks.length} chunks`
|
||||
);
|
||||
|
||||
// 2. แปลงแต่ละ chunk เป็น Hybrid Vector และเตรียม points
|
||||
const points = [];
|
||||
let usedDevice = 'gpu';
|
||||
for (const [idx, chunk] of chunks.entries()) {
|
||||
try {
|
||||
// เรียก Sidecar /embed เพื่อแปลงข้อความของ chunk
|
||||
const embedResult = await this.ocrService.embedViaSidecar(chunk.text);
|
||||
if (embedResult.device === 'cpu') {
|
||||
usedDevice = 'cpu';
|
||||
}
|
||||
points.push({
|
||||
id: `${documentPublicId}-${idx}`,
|
||||
vector: {
|
||||
@@ -116,7 +117,6 @@ export class EmbeddingService {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (points.length === 0) {
|
||||
return {
|
||||
success: false,
|
||||
@@ -124,21 +124,19 @@ export class EmbeddingService {
|
||||
error: 'All chunks failed to embed',
|
||||
};
|
||||
}
|
||||
|
||||
// 3. ลบ points เก่าของเอกสาร (เพื่อความ idempotent และรองรับ revision ใหม่)
|
||||
await this.qdrantService.deleteByDocumentPublicId(
|
||||
projectPublicId,
|
||||
documentPublicId
|
||||
);
|
||||
|
||||
// 4. บันทึก points ใหม่ลง Qdrant
|
||||
await this.qdrantService.upsert(projectPublicId, points);
|
||||
|
||||
this.logger.log(
|
||||
`Successfully embedded ${points.length} chunks for document ${documentPublicId} in project ${projectPublicId}`
|
||||
);
|
||||
|
||||
return { success: true, chunksEmbedded: points.length };
|
||||
return {
|
||||
success: true,
|
||||
chunksEmbedded: points.length,
|
||||
device: usedDevice,
|
||||
};
|
||||
} catch (err) {
|
||||
const errorMsg = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// File: src/modules/ai/services/ocr.service.ts
|
||||
// File: backend/src/modules/ai/services/ocr.service.ts
|
||||
// Change Log
|
||||
// - 2026-05-15: เพิ่ม OCR auto-detection service สำหรับ ADR-023A.
|
||||
// - 2026-05-25: แก้ไข AggregateError (empty message) จาก axios โดย wrap เป็น Error พร้อม context ที่ชัดเจน.
|
||||
@@ -11,6 +11,7 @@
|
||||
// - 2026-06-01: เปลี่ยน processWithTesseract/processWithTyphoon ให้ส่ง file content ผ่าน multipart ไปยัง /ocr-upload แทนการส่ง path
|
||||
// - 2026-06-02: ส่งค่า X-API-Key ใน request headers ไปยัง ocr-sidecar เพื่อความมั่นคงปลอดภัยสูงสุด (ADR-033, Suggestion 2)
|
||||
// - 2026-06-04: ADR-034 — เปลี่ยน TYPHOON_ENGINE.engineName เป็น typhoon-np-dms-ocr:latest ตรงกับชื่อโมเดลใน Ollama
|
||||
// - 2026-06-11: US2 - คำนวณ OCR residency keep_alive แบบ dynamic ตาม VRAM headroom และ active profile
|
||||
|
||||
import { Injectable, Logger, NotFoundException } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
@@ -29,12 +30,16 @@ import { SystemSetting } from '../entities/system-setting.entity';
|
||||
import { AiAuditLog, AiAuditStatus } from '../entities/ai-audit-log.entity';
|
||||
import { OcrCacheService } from './ocr-cache.service';
|
||||
import { VramMonitorService } from './vram-monitor.service';
|
||||
import { AiPolicyService } from './ai-policy.service';
|
||||
import { ExecutionProfile } from '../interfaces/execution-policy.interface';
|
||||
import { OcrResidencyDecision } from '../interfaces/ocr-residency.interface';
|
||||
|
||||
export interface OcrDetectionInput {
|
||||
extractedText?: string;
|
||||
extractedChars?: number;
|
||||
pdfPath?: string;
|
||||
documentPublicId?: string; // เพิ่มเพื่อการทำ audit logs
|
||||
activeProfile?: ExecutionProfile;
|
||||
}
|
||||
|
||||
export interface OcrDetectionResult {
|
||||
@@ -101,6 +106,9 @@ export class OcrService {
|
||||
private readonly threshold: number;
|
||||
private readonly ocrApiUrl: string;
|
||||
private readonly ocrSidecarApiKey: string;
|
||||
private readonly vramHeadroomThresholdMb: number;
|
||||
private readonly ocrResidencyWindowSeconds: number;
|
||||
private readonly mainModelPressureThresholdMb: number;
|
||||
constructor(
|
||||
private readonly configService: ConfigService,
|
||||
@InjectRepository(SystemSetting)
|
||||
@@ -109,6 +117,7 @@ export class OcrService {
|
||||
private readonly auditLogRepo: Repository<AiAuditLog>,
|
||||
private readonly ocrCacheService: OcrCacheService,
|
||||
private readonly vramMonitorService: VramMonitorService,
|
||||
private readonly aiPolicyService: AiPolicyService,
|
||||
@InjectRedis() private readonly redis: Redis
|
||||
) {
|
||||
this.threshold = this.configService.get<number>('OCR_CHAR_THRESHOLD', 100);
|
||||
@@ -120,6 +129,82 @@ export class OcrService {
|
||||
'OCR_SIDECAR_API_KEY',
|
||||
'lcbp3-dms-ocr-sidecar-secure-token-2026'
|
||||
);
|
||||
this.vramHeadroomThresholdMb = this.configService.get<number>(
|
||||
'VRAM_HEADROOM_THRESHOLD_MB',
|
||||
this.configService.get<number>('AI_VRAM_HEADROOM_THRESHOLD_MB', 3000)
|
||||
);
|
||||
this.ocrResidencyWindowSeconds = this.configService.get<number>(
|
||||
'OCR_RESIDENCY_WINDOW_SECONDS',
|
||||
this.configService.get<number>('AI_OCR_RESIDENCY_WINDOW_SECONDS', 120)
|
||||
);
|
||||
this.mainModelPressureThresholdMb = this.configService.get<number>(
|
||||
'GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB',
|
||||
this.configService.get<number>(
|
||||
'AI_GPU_MAIN_MODEL_PRESSURE_THRESHOLD_MB',
|
||||
12000
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Decides the keep_alive (in seconds) to request for the OCR model, based on
 * the current VRAM headroom and the active execution profile.
 *
 * keep_alive is 0 when the headroom query fails, when `deep-analysis` is the
 * active profile, or when VRAM pressure is high; otherwise it is the
 * configured residency window. This method never throws: any error yields
 * the `query-failed` decision.
 *
 * @param activeProfile Currently active execution profile, if known.
 * @returns The residency decision together with the reason it was chosen.
 */
async calculateOcrResidency(
  activeProfile?: ExecutionProfile | null
): Promise<OcrResidencyDecision> {
  const profile = activeProfile ?? null;
  // Shared result for both "query reported failure" and "query threw".
  const queryFailed: OcrResidencyDecision = {
    keepAliveSeconds: 0,
    vramHeadroomMb: 0,
    activeProfile: profile,
    reason: 'query-failed',
  };

  try {
    const headroom = await this.vramMonitorService.getVramHeadroom();
    if (!headroom.querySuccess) {
      return queryFailed;
    }

    if (activeProfile === 'deep-analysis') {
      this.logger.log(`OCR Residency: deep-analysis active, keep_alive = 0`);
      return {
        keepAliveSeconds: 0,
        vramHeadroomMb: headroom.availableMb,
        activeProfile,
        reason: 'deep-analysis-active',
      };
    }

    // Normalise once so the comparison and the log message agree; the log
    // previously printed "undefinedMB" when the field was absent.
    const mainModelVramMb = headroom.mainModelVramMb ?? 0;
    const isHighPressure =
      mainModelVramMb > this.mainModelPressureThresholdMb ||
      headroom.availableMb < this.vramHeadroomThresholdMb;

    if (isHighPressure) {
      this.logger.log(
        `OCR Residency: VRAM pressure is high (main: ${mainModelVramMb}MB, avail: ${headroom.availableMb}MB), keep_alive = 0`
      );
      return {
        keepAliveSeconds: 0,
        vramHeadroomMb: headroom.availableMb,
        activeProfile: profile,
        reason: 'high-pressure',
      };
    }

    this.logger.log(
      `OCR Residency: VRAM headroom sufficient (${headroom.availableMb} MB), keep_alive = ${this.ocrResidencyWindowSeconds}`
    );
    return {
      keepAliveSeconds: this.ocrResidencyWindowSeconds,
      vramHeadroomMb: headroom.availableMb,
      activeProfile: profile,
      reason: 'headroom-sufficient',
    };
  } catch (err: unknown) {
    this.logger.warn(
      `Failed to calculate OCR residency: ${err instanceof Error ? err.message : String(err)}`
    );
    return queryFailed;
  }
}
|
||||
|
||||
/** ดึงรายการ OCR Engines ทั้งหมด พร้อมตรวจสอบตัวที่กำลัง Active */
|
||||
@@ -311,7 +396,6 @@ export class OcrService {
|
||||
): Promise<OcrDetectionResult> {
|
||||
const startTime = Date.now();
|
||||
try {
|
||||
// 1. ตรวจสอบ VRAM insufficiency guard
|
||||
const hasCapacity = await this.vramMonitorService.hasVramCapacity(
|
||||
TYPHOON_OCR_REQUIRED_VRAM_MB
|
||||
);
|
||||
@@ -321,7 +405,8 @@ export class OcrService {
|
||||
);
|
||||
return this.processWithTesseract(input);
|
||||
}
|
||||
|
||||
const residency = await this.calculateOcrResidency(input.activeProfile);
|
||||
const keepAlive = residency.keepAliveSeconds;
|
||||
this.logger.debug(`Typhoon OCR processing: ${input.pdfPath}`);
|
||||
const fileBuffer = fs.readFileSync(input.pdfPath!);
|
||||
const form = new FormData();
|
||||
@@ -331,6 +416,7 @@ export class OcrService {
|
||||
'upload.pdf'
|
||||
);
|
||||
form.append('engine', 'typhoon-np-dms-ocr');
|
||||
form.append('keep_alive', String(keepAlive));
|
||||
const response = await axios.post<OcrSidecarResponse>(
|
||||
`${this.ocrApiUrl}/ocr-upload`,
|
||||
form,
|
||||
@@ -339,10 +425,8 @@ export class OcrService {
|
||||
headers: { 'X-API-Key': this.ocrSidecarApiKey },
|
||||
}
|
||||
);
|
||||
|
||||
const text = response.data.text ?? '';
|
||||
const durationMs = Date.now() - startTime;
|
||||
|
||||
await this.writeAuditLog({
|
||||
documentPublicId: input.documentPublicId,
|
||||
aiModel: 'typhoon-ocr',
|
||||
@@ -352,7 +436,6 @@ export class OcrService {
|
||||
processingTimeMs: durationMs,
|
||||
cacheHit: false,
|
||||
});
|
||||
|
||||
return {
|
||||
text,
|
||||
ocrUsed: true,
|
||||
@@ -398,6 +481,7 @@ export class OcrService {
|
||||
async embedViaSidecar(text: string): Promise<{
|
||||
dense: number[];
|
||||
sparse: { indices: number[]; values: number[] };
|
||||
device?: string;
|
||||
}> {
|
||||
try {
|
||||
const response = await axios.post(
|
||||
@@ -412,6 +496,7 @@ export class OcrService {
|
||||
return response.data as {
|
||||
dense: number[];
|
||||
sparse: { indices: number[]; values: number[] };
|
||||
device?: string;
|
||||
};
|
||||
} catch (err: unknown) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
@@ -424,7 +509,7 @@ export class OcrService {
|
||||
async rerankViaSidecar(
|
||||
query: string,
|
||||
chunks: string[]
|
||||
): Promise<{ scores: number[]; ranked_indices: number[] }> {
|
||||
): Promise<{ scores: number[]; ranked_indices: number[]; device?: string }> {
|
||||
try {
|
||||
const response = await axios.post(
|
||||
`${this.ocrApiUrl}/rerank`,
|
||||
@@ -435,7 +520,11 @@ export class OcrService {
|
||||
},
|
||||
}
|
||||
);
|
||||
return response.data as { scores: number[]; ranked_indices: number[] };
|
||||
return response.data as {
|
||||
scores: number[];
|
||||
ranked_indices: number[];
|
||||
device?: string;
|
||||
};
|
||||
} catch (err: unknown) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
this.logger.error(`Failed to rerank via Sidecar: ${msg}`);
|
||||
|
||||
@@ -1,133 +1,143 @@
|
||||
// File: src/modules/ai/services/vram-monitor.service.ts
|
||||
// Change Log
|
||||
// - 2026-05-30: Initial implementation สำหรับ Typhoon OCR VRAM monitoring (T006, ADR-032)
|
||||
// File: backend/src/modules/ai/services/vram-monitor.service.ts
|
||||
// Change Log:
|
||||
// - 2026-06-11: Initial creation of VramMonitorService to monitor VRAM headroom from Ollama /api/ps
|
||||
// - 2026-06-11: เพิ่มการคำนวณ mainModelVramMb ใน getVramHeadroom
|
||||
// - 2026-06-11: เพิ่ม getVramStatus และ invalidateCache เพื่อความเข้ากันได้กับส่วนอื่น
|
||||
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import axios from 'axios';
|
||||
import { InjectRedis } from '@nestjs-modules/ioredis';
|
||||
import Redis from 'ioredis';
|
||||
import { VramHeadroom } from '../interfaces/execution-policy.interface';
|
||||
|
||||
/** ข้อมูล VRAM จาก Ollama PS API */
|
||||
export interface OllamaModelInfo {
|
||||
name: string;
|
||||
size_vram: number; // bytes
|
||||
}
|
||||
|
||||
/** ผลลัพธ์ VRAM status */
|
||||
/**
|
||||
* ผลลัพธ์ VRAM status สำหรับส่วนบริการภายนอก
|
||||
* ผลลัพธ์นี้มีวัตถุประสงค์เพื่อรักษาความเข้ากันได้ย้อนหลัง (Backward Compatibility)
|
||||
*/
|
||||
export interface VramStatus {
|
||||
totalVramMb: number;
|
||||
usedVramMb: number;
|
||||
freeVramMb: number;
|
||||
loadedModels: string[];
|
||||
hasCapacity: boolean; // true ถ้า free VRAM >= minRequiredMb
|
||||
hasCapacity: boolean;
|
||||
}
|
||||
|
||||
/** ผลลัพธ์ภายในจาก Ollama /api/ps */
|
||||
interface OllamaProcessStatus {
|
||||
models?: OllamaModelInfo[];
|
||||
}
|
||||
|
||||
// Redis key สำหรับ cache VRAM status
|
||||
const VRAM_STATUS_CACHE_KEY = 'ai:vram:status';
|
||||
// TTL 10 วินาที — refresh บ่อยพอสำหรับ real-time monitoring
|
||||
const VRAM_STATUS_TTL_SECONDS = 10;
|
||||
// VRAM limit สำหรับ RTX 2060 Super (8192 MB)
|
||||
const GPU_TOTAL_VRAM_MB = 8192;
|
||||
// Threshold: ไม่โหลด model ถ้า usage > 90%
|
||||
const VRAM_USAGE_LIMIT_PERCENT = 0.9;
|
||||
|
||||
/** บริการตรวจสอบ VRAM GPU ผ่าน Ollama API ตาม ADR-032 */
|
||||
@Injectable()
|
||||
export class VramMonitorService {
|
||||
private readonly logger = new Logger(VramMonitorService.name);
|
||||
private readonly ollamaUrl: string;
|
||||
private readonly totalVramMb: number;
|
||||
|
||||
constructor(
|
||||
private readonly configService: ConfigService,
|
||||
@InjectRedis() private readonly redis: Redis
|
||||
) {
|
||||
constructor(private readonly configService: ConfigService) {
|
||||
this.ollamaUrl = this.configService.get<string>(
|
||||
'OLLAMA_URL',
|
||||
this.configService.get<string>('AI_HOST_URL', 'http://localhost:11434')
|
||||
this.configService.get<string>(
|
||||
'AI_HOST_URL',
|
||||
'http://192.168.10.100:11434'
|
||||
)
|
||||
);
|
||||
this.totalVramMb = this.configService.get<number>(
|
||||
'GPU_TOTAL_VRAM_MB',
|
||||
16384 // Default to 16GB (RTX 5060 Ti)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* ดึงสถานะ VRAM ปัจจุบันจาก Ollama /api/ps
|
||||
* ใช้ Redis cache TTL 10 วินาทีเพื่อลด overhead
|
||||
* ดึงสถานะ VRAM headroom จาก Ollama /api/ps
|
||||
* ถ้าล้มเหลวจะคืนค่าด้วย safe default (available = 0)
|
||||
*/
|
||||
async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
|
||||
const cached = await this.redis.get(VRAM_STATUS_CACHE_KEY);
|
||||
if (cached) {
|
||||
const parsed = JSON.parse(cached) as VramStatus;
|
||||
parsed.hasCapacity = parsed.freeVramMb >= minRequiredMb;
|
||||
return parsed;
|
||||
}
|
||||
return this.fetchAndCacheVramStatus(minRequiredMb);
|
||||
}
|
||||
|
||||
/** ตรวจสอบว่า VRAM เพียงพอสำหรับโหลด model ที่ต้องการ */
|
||||
async hasVramCapacity(requiredMb: number): Promise<boolean> {
|
||||
const status = await this.getVramStatus(requiredMb);
|
||||
return status.hasCapacity;
|
||||
}
|
||||
|
||||
/** ดึงข้อมูล VRAM จาก Ollama และ cache ใน Redis */
|
||||
private async fetchAndCacheVramStatus(
|
||||
minRequiredMb: number
|
||||
): Promise<VramStatus> {
|
||||
async getVramHeadroom(): Promise<VramHeadroom> {
|
||||
try {
|
||||
const response = await axios.get<OllamaProcessStatus>(
|
||||
`${this.ollamaUrl}/api/ps`,
|
||||
{ timeout: 5000 }
|
||||
);
|
||||
const models = response.data.models ?? [];
|
||||
const loadedModels = models.map((m) => m.name);
|
||||
// คำนวณ VRAM ที่ใช้จาก models ที่โหลดอยู่
|
||||
const usedVramBytes = models.reduce(
|
||||
(sum, m) => sum + (m.size_vram ?? 0),
|
||||
0
|
||||
);
|
||||
const usedVramMb = Math.round(usedVramBytes / 1024 / 1024);
|
||||
// จำกัด VRAM ไม่เกิน limit 90% ของ GPU ทั้งหมด
|
||||
const maxAllowedMb = Math.floor(
|
||||
GPU_TOTAL_VRAM_MB * VRAM_USAGE_LIMIT_PERCENT
|
||||
);
|
||||
const freeVramMb = Math.max(0, maxAllowedMb - usedVramMb);
|
||||
const status: VramStatus = {
|
||||
totalVramMb: GPU_TOTAL_VRAM_MB,
|
||||
usedVramMb,
|
||||
freeVramMb,
|
||||
loadedModels,
|
||||
hasCapacity: freeVramMb >= minRequiredMb,
|
||||
const response = await axios.get<{
|
||||
models?: Array<{
|
||||
name: string;
|
||||
size_vram: number;
|
||||
}>;
|
||||
}>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
|
||||
const models = response.data?.models ?? [];
|
||||
let totalUsedBytes = 0;
|
||||
let mainModelUsedBytes = 0;
|
||||
for (const model of models) {
|
||||
totalUsedBytes += model.size_vram || 0;
|
||||
if (
|
||||
model.name.includes('np-dms-ai') ||
|
||||
model.name.includes('typhoon2.5-np-dms')
|
||||
) {
|
||||
mainModelUsedBytes += model.size_vram || 0;
|
||||
}
|
||||
}
|
||||
const usedMb = Math.round(totalUsedBytes / (1024 * 1024));
|
||||
const availableMb = Math.max(0, this.totalVramMb - usedMb);
|
||||
const mainModelVramMb = Math.round(mainModelUsedBytes / (1024 * 1024));
|
||||
return {
|
||||
totalMb: this.totalVramMb,
|
||||
usedMb,
|
||||
availableMb,
|
||||
querySuccess: true,
|
||||
mainModelVramMb,
|
||||
};
|
||||
await this.redis.setex(
|
||||
VRAM_STATUS_CACHE_KEY,
|
||||
VRAM_STATUS_TTL_SECONDS,
|
||||
JSON.stringify(status)
|
||||
);
|
||||
return status;
|
||||
} catch (err: unknown) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
this.logger.warn(
|
||||
`VRAM status fetch failed: ${msg} — ใช้ค่า resilient fallback`
|
||||
`Failed to query Ollama /api/ps: ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
return {
|
||||
totalVramMb: GPU_TOTAL_VRAM_MB,
|
||||
usedVramMb: 0,
|
||||
freeVramMb: GPU_TOTAL_VRAM_MB,
|
||||
loadedModels: [],
|
||||
hasCapacity: true,
|
||||
totalMb: this.totalVramMb,
|
||||
usedMb: this.totalVramMb, // บังคับให้ used = total เพื่อให้ available = 0
|
||||
availableMb: 0,
|
||||
querySuccess: false,
|
||||
mainModelVramMb: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the current VRAM status in the legacy `VramStatus` shape, kept for
 * backward compatibility with the vram/status endpoint.
 *
 * Loaded model names come from Ollama `/api/ps`; the VRAM figures come from
 * `getVramHeadroom()`. If the request fails, a "no capacity" status is
 * returned instead of throwing.
 *
 * @param minRequiredMb Free VRAM (MB) required for `hasCapacity` to be true.
 * @returns VRAM totals, loaded model names and the capacity flag.
 */
async getVramStatus(minRequiredMb = 4000): Promise<VramStatus> {
  try {
    const { data } = await axios.get<{
      models?: Array<{
        name: string;
        size_vram: number;
      }>;
    }>(`${this.ollamaUrl}/api/ps`, { timeout: 3000 });
    const loadedModels = (data?.models ?? []).map((entry) => entry.name);

    const { totalMb, usedMb, availableMb } = await this.getVramHeadroom();
    return {
      totalVramMb: totalMb,
      usedVramMb: usedMb,
      freeVramMb: availableMb,
      loadedModels,
      hasCapacity: availableMb >= minRequiredMb,
    };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    this.logger.warn(`Failed to get VRAM status: ${reason}`);
    // Report the GPU as fully used so callers do not try to load a model.
    return {
      totalVramMb: this.totalVramMb,
      usedVramMb: this.totalVramMb,
      freeVramMb: 0,
      loadedModels: [],
      hasCapacity: false,
    };
  }
}
|
||||
|
||||
/**
 * Checks whether the available VRAM covers a model's requirement.
 *
 * @param requiredMb VRAM (MB) the model needs.
 * @returns `true` when the `availableMb` reported by `getVramHeadroom()` is at least `requiredMb`.
 */
async hasVramCapacity(requiredMb: number): Promise<boolean> {
  const { availableMb } = await this.getVramHeadroom();
  return availableMb >= requiredMb;
}
|
||||
|
||||
/**
|
||||
* ล้าง cache VRAM (ไม่มี cache แล้วในระบบใหม่ แต่เก็บไว้เพื่อรองรับการเรียกใช้เดิม)
|
||||
*/
|
||||
async invalidateCache(): Promise<void> {
|
||||
await this.redis.del(VRAM_STATUS_CACHE_KEY);
|
||||
await Promise.resolve();
|
||||
this.logger.log('VRAM cache invalidation requested (no-op in new policy)');
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user