690605:2335 ADR-035-135 #1
This commit is contained in:
@@ -0,0 +1,137 @@
|
||||
// File: backend/src/modules/ai/services/embedding.service.spec.ts
|
||||
// Change Log:
|
||||
// - 2026-06-05: สร้าง unit test สำหรับ EmbeddingService เพื่อทดสอบกระบวนการ Semantic Chunking และ fixed-size fallback (T024)
|
||||
|
||||
import { Test, TestingModule } from '@nestjs/testing';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { EmbeddingService } from './embedding.service';
|
||||
import { OllamaService } from './ollama.service';
|
||||
import { AiQdrantService } from '../qdrant.service';
|
||||
import { OcrService } from './ocr.service';
|
||||
import { AiPromptsService } from '../prompts/ai-prompts.service';
|
||||
|
||||
/**
 * Unit tests for EmbeddingService.embedDocument() (US3 — Semantic Chunking).
 *
 * All collaborators (config, LLM, Qdrant, OCR sidecar, prompt store) are replaced
 * with jest mocks, so only the chunking / embedding orchestration is exercised.
 * Three paths are covered:
 *   1. the LLM answers with well-formed <chunk> tags (semantic chunking),
 *   2. the LLM answers without any <chunk> tag (fixed-size fallback),
 *   3. the LLM call itself rejects (fixed-size fallback).
 */
describe('EmbeddingService (US3 — Semantic Chunking)', () => {
  let service: EmbeddingService;
  let ollamaService: OllamaService;
  let qdrantService: AiQdrantService;
  let ocrService: OcrService;
  let aiPromptsService: AiPromptsService;

  const PROJECT_PUBLIC_ID = 'proj-uuid-456';
  const DOCUMENT_PUBLIC_ID = 'doc-uuid-123';

  // Serves the two chunking settings; any other key yields the caller's default.
  const mockConfigService = {
    get: jest.fn((key: string, defaultValue?: unknown): unknown => {
      const values: Record<string, unknown> = {
        EMBEDDING_CHUNK_SIZE: 512,
        EMBEDDING_CHUNK_OVERLAP: 64,
      };
      return values[key] ?? defaultValue;
    }),
  };

  const mockOllamaService = {
    generate: jest.fn(),
  };

  const mockQdrantService = {
    deleteByDocumentPublicId: jest.fn().mockResolvedValue(undefined),
    upsert: jest.fn().mockResolvedValue(undefined),
  };

  const mockOcrService = {
    embedViaSidecar: jest.fn(),
  };

  const mockAiPromptsService = {
    resolveActive: jest.fn(),
  };

  /** Makes the prompt store return a fixed resolved prompt for the next lookup. */
  const stubResolvedPrompt = (): void => {
    mockAiPromptsService.resolveActive.mockResolvedValueOnce({
      resolvedPrompt: 'mock resolved prompt',
      versionNumber: 1,
    });
  };

  /** Makes the sidecar return a constant 1024-dim dense vector plus a one-entry sparse vector. */
  const stubSidecarEmbedding = (
    denseValue: number,
    sparseIndex: number,
    sparseValue: number
  ): void => {
    mockOcrService.embedViaSidecar.mockImplementation(() =>
      Promise.resolve({
        dense: Array<number>(1024).fill(denseValue),
        sparse: { indices: [sparseIndex], values: [sparseValue] },
      })
    );
  };

  /** Calls embedDocument() with the shared document metadata and the given OCR text. */
  const embedWithOcrText = (ocrText: string) =>
    service.embedDocument(
      PROJECT_PUBLIC_ID,
      DOCUMENT_PUBLIC_ID,
      'CORR-001',
      'LETTER',
      'IN_REVIEW',
      1,
      'Test Subject',
      '2026-06-05',
      ocrText
    );

  /** Asserts the old points were deleted for this document and new points were upserted. */
  const expectQdrantRefreshed = (): void => {
    expect(qdrantService.deleteByDocumentPublicId).toHaveBeenCalledWith(
      PROJECT_PUBLIC_ID,
      DOCUMENT_PUBLIC_ID
    );
    expect(qdrantService.upsert).toHaveBeenCalled();
  };

  beforeEach(async () => {
    const module: TestingModule = await Test.createTestingModule({
      providers: [
        EmbeddingService,
        { provide: ConfigService, useValue: mockConfigService },
        { provide: OllamaService, useValue: mockOllamaService },
        { provide: AiQdrantService, useValue: mockQdrantService },
        { provide: OcrService, useValue: mockOcrService },
        { provide: AiPromptsService, useValue: mockAiPromptsService },
      ],
    }).compile();

    service = module.get<EmbeddingService>(EmbeddingService);
    ollamaService = module.get<OllamaService>(OllamaService);
    qdrantService = module.get<AiQdrantService>(AiQdrantService);
    ocrService = module.get<OcrService>(OcrService);
    aiPromptsService = module.get<AiPromptsService>(AiPromptsService);

    // Clear call history recorded while the module was being built and by earlier tests.
    jest.clearAllMocks();
  });

  describe('embedDocument()', () => {
    it('ควรเรียกใช้ Semantic Chunking เมื่อ LLM ตอบกลับถูกต้องตามแท็ก และบันทึกเข้า Qdrant สำเร็จ', async () => {
      const ocrText =
        'ข้อความทดสอบสำหรับการหั่นแบบ semantic chunking ซึ่งมีความยาวเกิน 50 ตัวอักษรอย่างแน่นอน';
      const mockLlmResponse = `
<chunk topic="การติดตั้งระบบ">ขั้นตอนการติดตั้งระบบมีดังนี้คือ 1. ตรวจสอบเครื่องมือ 2. เริ่มเชื่อมต่อ</chunk>
<chunk topic="การตั้งค่า">หลังจากติดตั้งให้ทำการตั้งค่าระบบผ่านหน้าจอควบคุมหลัก</chunk>
`;
      stubResolvedPrompt();
      mockOllamaService.generate.mockResolvedValueOnce(mockLlmResponse);
      stubSidecarEmbedding(0.1, 1, 0.5);

      const result = await embedWithOcrText(ocrText);

      expect(result.success).toBe(true);
      expect(result.chunksEmbedded).toBe(2);
      expect(aiPromptsService.resolveActive).toHaveBeenCalledWith(
        'rag_chunking',
        ocrText
      );
      expect(ollamaService.generate).toHaveBeenCalledWith(
        'mock resolved prompt'
      );
      // One sidecar call per parsed <chunk>.
      expect(ocrService.embedViaSidecar).toHaveBeenCalledTimes(2);
      expectQdrantRefreshed();
    });

    it('ควร fallback ไปใช้ fixed-size chunking เมื่อ LLM คืนข้อมูลที่ไม่มีแท็ก chunk', async () => {
      stubResolvedPrompt();
      mockOllamaService.generate.mockResolvedValueOnce(
        'ข้อความธรรมดาที่ไม่มีแท็ก chunk อะไรเลย'
      );
      stubSidecarEmbedding(0.2, 2, 0.8);

      const result = await embedWithOcrText(
        'ข้อความทดสอบแบบยาวเพื่อจำลองการทำ fixed size chunking สำหรับการ fallback เมื่อ LLM ทำงานไม่ได้ตามเงื่อนไขที่กำหนดไว้'
      );

      expect(result.success).toBe(true);
      expect(result.chunksEmbedded).toBeGreaterThan(0);
      expect(ollamaService.generate).toHaveBeenCalledTimes(1);
      expectQdrantRefreshed();
    });

    it('ควร fallback ไปใช้ fixed-size chunking เมื่อการเรียก LLM ล้มเหลว', async () => {
      stubResolvedPrompt();
      // The LLM call rejects, which must be absorbed by the chunking fallback.
      mockOllamaService.generate.mockRejectedValueOnce(
        new Error('LLM unavailable')
      );
      stubSidecarEmbedding(0.2, 2, 0.8);

      const result = await embedWithOcrText(
        'ข้อความทดสอบแบบยาวเพื่อจำลองการทำ fixed size chunking สำหรับการ fallback เมื่อ LLM ทำงานไม่ได้ตามเงื่อนไขที่กำหนดไว้'
      );

      expect(result.success).toBe(true);
      expect(result.chunksEmbedded).toBeGreaterThan(0);
      expect(ollamaService.generate).toHaveBeenCalledTimes(1);
      expectQdrantRefreshed();
    });
  });
});
|
||||
@@ -1,12 +1,14 @@
|
||||
// File: src/modules/ai/services/embedding.service.ts
|
||||
// File: backend/src/modules/ai/services/embedding.service.ts
|
||||
// Change Log
|
||||
// - 2026-05-15: เพิ่ม EmbeddingService สำหรับ full-document chunked embedding ตาม ADR-023A T021.
|
||||
// - 2026-06-05: ปรับปรุงเป็น Hybrid Embedding และเพิ่ม Semantic Chunking ผ่าน typhoon2.5 (T025-T027)
|
||||
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { OllamaService } from './ollama.service';
|
||||
import { AiQdrantService } from '../qdrant.service';
|
||||
import { OcrService } from './ocr.service';
|
||||
import { AiPromptsService } from '../prompts/ai-prompts.service';
|
||||
|
||||
export interface EmbeddingChunk {
|
||||
chunkIndex: number;
|
||||
@@ -31,7 +33,8 @@ export class EmbeddingService {
|
||||
private readonly configService: ConfigService,
|
||||
private readonly ollamaService: OllamaService,
|
||||
private readonly qdrantService: AiQdrantService,
|
||||
private readonly ocrService: OcrService
|
||||
private readonly ocrService: OcrService,
|
||||
private readonly aiPromptsService: AiPromptsService
|
||||
) {
|
||||
this.chunkSize = this.configService.get<number>(
|
||||
'EMBEDDING_CHUNK_SIZE',
|
||||
@@ -44,66 +47,71 @@ export class EmbeddingService {
|
||||
}
|
||||
|
||||
/**
|
||||
* สร้าง embedding สำหรับเอกสารทั้งฉบับ:
|
||||
* 1. ดึงข้อความ full-doc (ใช้ extractedText หรือ OCR)
|
||||
* 2. Chunk text 512 tokens / 64 overlap
|
||||
* 3. Generate embedding ต่อ chunk ด้วย nomic-embed-text
|
||||
* 4. Upsert ไป Qdrant พร้อม project isolation
|
||||
* สร้าง hybrid embedding สำหรับเอกสารทั้งฉบับ:
|
||||
* 1. ใช้ Semantic Chunking (ผ่าน LLM) เป็นหลัก พร้อม Fallback เป็นแบบ fixed-size
|
||||
* 2. เรียก Sidecar /embed เพื่อแปลงแต่ละ chunk เป็น Dense (1024 dims) + Sparse vector
|
||||
* 3. ลบ points เก่าของเอกสารใน Qdrant
|
||||
* 4. Upsert points ใหม่เก็บครบ 11 fields
|
||||
*/
|
||||
async embedDocument(
|
||||
pdfPath: string,
|
||||
documentPublicId: string,
|
||||
projectPublicId: string,
|
||||
extractedText?: string
|
||||
documentPublicId: string,
|
||||
correspondenceNumber: string,
|
||||
docType: string,
|
||||
statusCode: string,
|
||||
revisionNumber: number,
|
||||
subject: string,
|
||||
documentDate?: string,
|
||||
ocrText?: string
|
||||
): Promise<EmbeddingResult> {
|
||||
try {
|
||||
// 1. ดึงข้อความจาก PDF (ใช้ extractedText ถ้ามี หรือเรียก OCR)
|
||||
let fullText = extractedText;
|
||||
if (!fullText) {
|
||||
const ocrResult = await this.ocrService.detectAndExtract({
|
||||
pdfPath,
|
||||
extractedText: '',
|
||||
extractedChars: 0,
|
||||
});
|
||||
fullText = ocrResult.text;
|
||||
}
|
||||
|
||||
if (!fullText || fullText.trim().length === 0) {
|
||||
this.logger.warn(`No text extracted from document ${documentPublicId}`);
|
||||
if (!ocrText || ocrText.trim().length === 0) {
|
||||
this.logger.warn(
|
||||
`No OCR text provided for document ${documentPublicId}`
|
||||
);
|
||||
return {
|
||||
success: false,
|
||||
chunksEmbedded: 0,
|
||||
error: 'No text extracted',
|
||||
error: 'No OCR text provided',
|
||||
};
|
||||
}
|
||||
|
||||
// 2. Chunk text
|
||||
const chunks = this.chunkText(fullText);
|
||||
// 1. แบ่งข้อความออกเป็น Chunk ด้วย Semantic Chunking
|
||||
const chunks = await this.semanticChunkTextWithFallback(ocrText);
|
||||
this.logger.log(
|
||||
`Document ${documentPublicId} split into ${chunks.length} chunks`
|
||||
);
|
||||
|
||||
// 3. Generate embedding และ upsert ไป Qdrant
|
||||
// 2. แปลงแต่ละ chunk เป็น Hybrid Vector และเตรียม points
|
||||
const points = [];
|
||||
for (const chunk of chunks) {
|
||||
for (const [idx, chunk] of chunks.entries()) {
|
||||
try {
|
||||
const embedding = await this.ollamaService.generateEmbedding(
|
||||
chunk.text
|
||||
);
|
||||
// เรียก Sidecar /embed เพื่อแปลงข้อความของ chunk
|
||||
const embedResult = await this.ocrService.embedViaSidecar(chunk.text);
|
||||
points.push({
|
||||
id: `${documentPublicId}-${chunk.chunkIndex}`,
|
||||
vector: embedding,
|
||||
id: `${documentPublicId}-${idx}`,
|
||||
vector: {
|
||||
bge_dense: embedResult.dense,
|
||||
bge_sparse: embedResult.sparse,
|
||||
},
|
||||
payload: {
|
||||
document_public_id: documentPublicId,
|
||||
chunk_index: chunk.chunkIndex,
|
||||
page_number: chunk.pageNumber,
|
||||
doc_public_id: documentPublicId,
|
||||
project_public_id: projectPublicId,
|
||||
doc_number: correspondenceNumber,
|
||||
doc_type: docType,
|
||||
status_code: statusCode,
|
||||
revision_number: revisionNumber,
|
||||
subject: subject,
|
||||
document_date: documentDate || null,
|
||||
chunk_topic: chunk.topic,
|
||||
chunk_index: idx,
|
||||
chunk_text: chunk.text,
|
||||
embedded_at: new Date().toISOString(),
|
||||
},
|
||||
});
|
||||
} catch (err) {
|
||||
this.logger.error(
|
||||
`Failed to embed chunk ${chunk.chunkIndex} for document ${documentPublicId}`,
|
||||
`Failed to embed chunk ${idx} for document ${documentPublicId}`,
|
||||
err instanceof Error ? err.message : String(err)
|
||||
);
|
||||
}
|
||||
@@ -117,7 +125,13 @@ export class EmbeddingService {
|
||||
};
|
||||
}
|
||||
|
||||
// 4. Upsert ไป Qdrant พร้อม project isolation
|
||||
// 3. ลบ points เก่าของเอกสาร (เพื่อความ idempotent และรองรับ revision ใหม่)
|
||||
await this.qdrantService.deleteByDocumentPublicId(
|
||||
projectPublicId,
|
||||
documentPublicId
|
||||
);
|
||||
|
||||
// 4. บันทึก points ใหม่ลง Qdrant
|
||||
await this.qdrantService.upsert(projectPublicId, points);
|
||||
|
||||
this.logger.log(
|
||||
@@ -135,12 +149,53 @@ export class EmbeddingService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk text ด้วย overlap
|
||||
* - chunkSize: 512 characters (approximate token equivalent)
|
||||
* - overlap: 64 characters
|
||||
* แบ่งข้อความโดยใช้ typhoon2.5 และ Prompt 'rag_chunking' (T025, T026)
|
||||
* หากล้มเหลวหรือ LLM ไม่ตอบกลับในรูปแบบแท็ก <chunk> ให้ fallback เป็นแบบ fixed-size
|
||||
*/
|
||||
private chunkText(text: string): EmbeddingChunk[] {
|
||||
const chunks: EmbeddingChunk[] = [];
|
||||
/**
 * Splits OCR text into topic-labelled chunks, preferring LLM-driven semantic
 * chunking (T025, T026) and falling back to fixed-size chunking.
 *
 * The active 'rag_chunking' prompt is resolved with the OCR text, sent to the
 * LLM, and the reply is parsed for <chunk topic="..."> tags. The fixed-size
 * fallback is used when the reply yields no chunk or when any step throws.
 *
 * @param ocrText - Full document text to split.
 * @returns Chunks as `{ topic, text }` pairs.
 */
private async semanticChunkTextWithFallback(
  ocrText: string
): Promise<Array<{ topic: string; text: string }>> {
  try {
    this.logger.log('Attempting semantic chunking via typhoon2.5...');

    // Resolve the currently active prompt version from ai_prompts.
    const activePrompt = await this.aiPromptsService.resolveActive(
      'rag_chunking',
      ocrText
    );

    // Ask the LLM to segment the text.
    const completion = await this.ollamaService.generate(
      activePrompt.resolvedPrompt
    );

    // Pull out the content of each <chunk topic="..."> tag.
    const semanticChunks = this.parseChunkTags(completion);

    if (semanticChunks.length === 0) {
      this.logger.warn(
        'No valid <chunk> tags found in LLM output, falling back to fixed-size chunking.'
      );
    } else {
      this.logger.log(
        `Semantic chunking succeeded: split into ${semanticChunks.length} chunks.`
      );
      return semanticChunks;
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    this.logger.warn(
      `Semantic chunking failed, falling back to fixed-size chunking: ${reason}`
    );
  }

  // Fallback: fixed-size chunking with the configured size and overlap.
  return this.fixedSizeChunk(ocrText, this.chunkSize, this.overlap);
}
|
||||
|
||||
/** แบ่งข้อความตามขนาดคงที่ (Fixed-size Chunking) (FR-005) */
|
||||
private fixedSizeChunk(
|
||||
text: string,
|
||||
chunkSize: number,
|
||||
overlap: number
|
||||
): Array<{ topic: string; text: string }> {
|
||||
const chunks: Array<{ topic: string; text: string }> = [];
|
||||
const cleanText = text.replace(/\s+/g, ' ').trim();
|
||||
const textLength = cleanText.length;
|
||||
|
||||
@@ -148,19 +203,35 @@ export class EmbeddingService {
|
||||
let chunkIndex = 0;
|
||||
|
||||
while (startIndex < textLength) {
|
||||
const endIndex = Math.min(startIndex + this.chunkSize, textLength);
|
||||
const endIndex = Math.min(startIndex + chunkSize, textLength);
|
||||
const chunkText = cleanText.substring(startIndex, endIndex);
|
||||
|
||||
chunks.push({
|
||||
chunkIndex,
|
||||
topic: `ส่วนที่ ${chunkIndex + 1}`,
|
||||
text: chunkText,
|
||||
pageNumber: undefined, // TODO: Extract page numbers if available
|
||||
});
|
||||
|
||||
startIndex += this.chunkSize - this.overlap;
|
||||
startIndex += chunkSize - overlap;
|
||||
chunkIndex += 1;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * Extracts every `<chunk topic="...">...</chunk>` element from raw LLM output (T026).
 *
 * Matching is case-insensitive and non-greedy, and a chunk body may span
 * several lines. A blank topic is replaced by 'ทั่วไป' ("general"); elements
 * whose trimmed body is empty are dropped.
 *
 * @param llmOutput - Raw text returned by the LLM.
 * @returns Chunks in order of appearance; empty when nothing matches.
 */
private parseChunkTags(
  llmOutput: string
): Array<{ topic: string; text: string }> {
  const chunkTagPattern = /<chunk\s+topic="([^"]*)"\s*>([\s\S]*?)<\/chunk\s*>/gi;
  const extracted: Array<{ topic: string; text: string }> = [];

  // The global flag makes each exec() resume after the previous match.
  for (
    let hit = chunkTagPattern.exec(llmOutput);
    hit !== null;
    hit = chunkTagPattern.exec(llmOutput)
  ) {
    const body = hit[2]?.trim();
    if (!body) {
      continue;
    }
    const label = hit[1]?.trim() || 'ทั่วไป';
    extracted.push({ topic: label, text: body });
  }

  return extracted;
}
|
||||
}
|
||||
|
||||
@@ -393,4 +393,53 @@ export class OcrService {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Calls the sidecar `/embed` endpoint to obtain a BGE-M3 dense + sparse
 * embedding for a piece of text (T012).
 *
 * The request is a POST of `{ text }` authenticated with the `X-API-Key`
 * header. The response body is returned as-is; its shape is asserted by a
 * cast, not validated at runtime.
 *
 * @param text - Text to embed.
 * @returns The dense vector and the sparse vector (`indices` / `values`).
 * @throws Error prefixed with `AI_SIDECAR_EMBED_FAILED:` when the call fails.
 */
async embedViaSidecar(text: string): Promise<{
  dense: number[];
  sparse: { indices: number[]; values: number[] };
}> {
  try {
    const reply = await axios.post(
      `${this.ocrApiUrl}/embed`,
      { text },
      { headers: { 'X-API-Key': this.ocrSidecarApiKey } }
    );
    const embedding = reply.data as {
      dense: number[];
      sparse: { indices: number[]; values: number[] };
    };
    return embedding;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    this.logger.error(`Failed to embed via Sidecar: ${reason}`);
    throw new Error(`AI_SIDECAR_EMBED_FAILED: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Calls the sidecar `/rerank` endpoint to re-rank candidate chunks against a
 * query with BGE-Reranker-Large (T014).
 *
 * The request is a POST of `{ query, chunks }` authenticated with the
 * `X-API-Key` header. The response body is returned as-is; its shape is
 * asserted by a cast, not validated at runtime.
 *
 * @param query - Search query the chunks are scored against.
 * @param chunks - Candidate chunk texts.
 * @returns The sidecar's `scores` and `ranked_indices`.
 * @throws Error prefixed with `AI_SIDECAR_RERANK_FAILED:` when the call fails.
 */
async rerankViaSidecar(
  query: string,
  chunks: string[]
): Promise<{ scores: number[]; ranked_indices: number[] }> {
  try {
    const reply = await axios.post(
      `${this.ocrApiUrl}/rerank`,
      { query, chunks },
      { headers: { 'X-API-Key': this.ocrSidecarApiKey } }
    );
    const ranking = reply.data as {
      scores: number[];
      ranked_indices: number[];
    };
    return ranking;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    this.logger.error(`Failed to rerank via Sidecar: ${reason}`);
    throw new Error(`AI_SIDECAR_RERANK_FAILED: ${reason}`);
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user