690419:1831 feat: update CI/CD to use SSH key authentication #05
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
import { Processor, WorkerHost } from '@nestjs/bullmq';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository } from 'typeorm';
|
||||
import { Job } from 'bullmq';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
import { EmbeddingService } from '../embedding.service';
|
||||
import { QdrantService, VectorMetadata } from '../qdrant.service';
|
||||
import { DocumentChunk } from '../entities/document-chunk.entity';
|
||||
import { EmbeddingJobData } from './thai-preprocess.processor';
|
||||
|
||||
// Maximum number of whitespace-separated words that chunkText puts in one chunk.
const CHUNK_SIZE = 512;
|
||||
// Number of words shared between consecutive chunks: chunkText advances its
// window by CHUNK_SIZE - CHUNK_OVERLAP words per step.
const CHUNK_OVERLAP = 50;
|
||||
|
||||
/**
 * BullMQ worker for the `rag:embedding` queue.
 *
 * For every job it splits the normalized text into overlapping word chunks,
 * embeds each chunk, hands the resulting points to
 * `QdrantService.upsertBatch`, saves one `DocumentChunk` row per chunk and
 * finally sets `rag_status = 'INDEXED'` on the attachment.
 */
@Processor('rag:embedding')
export class EmbeddingProcessor extends WorkerHost {
  private readonly logger = new Logger(EmbeddingProcessor.name);

  constructor(
    private readonly embeddingService: EmbeddingService,
    private readonly qdrantService: QdrantService,
    @InjectRepository(DocumentChunk)
    private readonly chunkRepo: Repository<DocumentChunk>
  ) {
    super();
  }

  /**
   * Chunks, embeds and persists the text carried by one embedding job.
   *
   * Any error thrown by the embedding service, the vector store or the
   * database propagates to the caller.
   *
   * NOTE(review): chunk ids are random per run and the vector upsert happens
   * before the relational save, so a retry after a partial failure would
   * presumably leave orphaned vectors behind — confirm whether cleanup is
   * handled elsewhere.
   *
   * @param job Job whose data carries the normalized text and document metadata.
   */
  async process(job: Job<EmbeddingJobData>): Promise<void> {
    const {
      attachmentPublicId,
      normalizedText,
      docType,
      docNumber,
      revision,
      projectCode,
      projectPublicId,
      classification,
    } = job.data;

    const chunks = this.chunkText(normalizedText);
    const model = this.embeddingService.getModelName();

    const upsertPoints: Parameters<QdrantService['upsertBatch']>[0] = [];
    const chunkEntities: DocumentChunk[] = [];

    // Chunks are embedded one at a time, in document order.
    for (const [chunkIndex, chunk] of chunks.entries()) {
      // The same id is used for the vector point and the database row.
      const chunkId = uuidv4();
      const vector = await this.embeddingService.embed(chunk);

      const payload: VectorMetadata = {
        chunk_id: chunkId,
        public_id: attachmentPublicId,
        project_public_id: projectPublicId,
        doc_type: docType,
        doc_number: docNumber,
        revision,
        project_code: projectCode,
        classification,
        // Only the first 500 characters are stored alongside the vector.
        content_preview: chunk.slice(0, 500),
        embedding_model: model,
      };
      upsertPoints.push({ id: chunkId, vector, payload });

      chunkEntities.push(
        this.chunkRepo.create({
          id: chunkId,
          documentId: attachmentPublicId,
          chunkIndex,
          content: chunk,
          docType,
          docNumber,
          revision,
          projectCode,
          projectPublicId,
          classification,
          embeddingModel: model,
        })
      );
    }

    if (upsertPoints.length > 0) {
      await this.qdrantService.upsertBatch(upsertPoints);
      await this.chunkRepo.save(chunkEntities);
    } else {
      // Nothing to index; make the empty result visible instead of silent.
      this.logger.warn(
        `No embeddable text for ${attachmentPublicId}; marking as indexed with zero chunks`
      );
    }

    await this.chunkRepo.manager.query(
      `UPDATE attachments SET rag_status = 'INDEXED', rag_last_error = NULL WHERE public_id = ?`,
      [attachmentPublicId]
    );

    this.logger.log(
      `Embedded ${chunks.length} chunks for ${attachmentPublicId}`
    );
  }

  /**
   * Splits text into chunks of at most `CHUNK_SIZE` whitespace-separated
   * words, where consecutive chunks share `CHUNK_OVERLAP` words.
   *
   * Words inside a chunk are re-joined with single spaces. Blank or
   * whitespace-only input yields an empty array.
   *
   * @param text Text to split.
   * @returns The chunks in document order; none of them is empty.
   */
  private chunkText(text: string): string[] {
    // Dropping empty tokens keeps leading/trailing whitespace out of chunks.
    const words = text.split(/\s+/).filter((word) => word.length > 0);
    const step = CHUNK_SIZE - CHUNK_OVERLAP;
    const chunks: string[] = [];

    for (let start = 0; start < words.length; start += step) {
      const end = Math.min(start + CHUNK_SIZE, words.length);
      chunks.push(words.slice(start, end).join(' '));

      // Once a chunk reaches the last word, the next window would only repeat
      // words this chunk already contains, so stop here.
      if (end === words.length) {
        break;
      }
    }

    return chunks;
  }
}
|
||||
@@ -0,0 +1,68 @@
|
||||
import { Processor, WorkerHost } from '@nestjs/bullmq';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository } from 'typeorm';
|
||||
import { Job } from 'bullmq';
|
||||
import * as fs from 'fs';
|
||||
import { InjectQueue } from '@nestjs/bullmq';
|
||||
import { Queue } from 'bullmq';
|
||||
|
||||
import { DocumentChunk } from '../entities/document-chunk.entity';
|
||||
|
||||
/**
 * Payload of a job on the `rag:ocr` queue. The later pipeline stages extend
 * this shape with the text they produce.
 */
export interface OcrJobData {
  /** Public id of the attachment; used to look up and update its rows. */
  attachmentPublicId: string;

  /** Path of the file the OCR worker reads the text from. */
  filePath: string;

  /** Document type label carried through to the stored chunks. */
  docType: string;

  /** Document number, or `null` when there is none. */
  docNumber: string | null;

  /** Revision label, or `null` when there is none. */
  revision: string | null;

  /** Code of the project the document belongs to. */
  projectCode: string;

  /** Public id of the project the document belongs to. */
  projectPublicId: string;

  /** Classification level attached to the document. */
  classification: 'PUBLIC' | 'INTERNAL' | 'CONFIDENTIAL';
}
|
||||
|
||||
/**
 * BullMQ worker for the `rag:ocr` queue.
 *
 * The visible code performs no OCR: it reads the attachment file as UTF-8
 * text and forwards that text to the `rag:thai-preprocess` queue.
 */
@Processor('rag:ocr')
export class OcrProcessor extends WorkerHost {
  private readonly logger = new Logger(OcrProcessor.name);

  constructor(
    @InjectQueue('rag:thai-preprocess') private readonly thaiQueue: Queue,
    @InjectRepository(DocumentChunk)
    private readonly chunkRepo: Repository<DocumentChunk>
  ) {
    super();
  }

  /**
   * Reads the attachment's text and enqueues the Thai preprocessing step.
   *
   * The job is skipped when chunks already exist for the attachment.
   * Otherwise the attachment is flagged `PROCESSING`, the file is read, and a
   * `preprocess` job with the id `thai:<attachmentPublicId>` is added.
   *
   * @param job Job whose data identifies the attachment and its file path.
   */
  async process(job: Job<OcrJobData>): Promise<void> {
    const { attachmentPublicId, filePath } = job.data;

    // Existing chunks mean the document was indexed by an earlier run.
    const existing = await this.chunkRepo.count({
      where: { documentId: attachmentPublicId },
    });
    if (existing > 0) {
      this.logger.log(
        `rag:ocr job already indexed for ${attachmentPublicId}, skipping`
      );
      return;
    }

    await this.chunkRepo.manager.query(
      `UPDATE attachments SET rag_status = 'PROCESSING' WHERE public_id = ?`,
      [attachmentPublicId]
    );

    let rawText: string;
    try {
      // Asynchronous read so a large file does not block the worker's event loop.
      rawText = await fs.promises.readFile(filePath, 'utf-8');
    } catch (err: unknown) {
      // Best-effort fallback is kept, but the failure is no longer silent.
      this.logger.warn(
        `Could not read ${filePath} for ${attachmentPublicId}, using placeholder text: ${err instanceof Error ? err.message : String(err)}`
      );
      rawText = `[binary:${attachmentPublicId}]`;
    }

    await this.thaiQueue.add(
      'preprocess',
      { ...job.data, rawText },
      { jobId: `thai:${attachmentPublicId}` }
    );

    this.logger.log(`OCR enqueued thai-preprocess for ${attachmentPublicId}`);
  }
}
|
||||
@@ -0,0 +1,56 @@
|
||||
import { Processor, WorkerHost, InjectQueue } from '@nestjs/bullmq';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { Queue, Job } from 'bullmq';
|
||||
import axios from 'axios';
|
||||
|
||||
import { OcrJobData } from './ocr.processor';
|
||||
|
||||
/**
 * Payload of a job on the `rag:thai-preprocess` queue: the OCR job data plus
 * the text read from the attachment.
 */
export interface ThaiPreprocessJobData extends OcrJobData {
  /** Text sent to the normalization service; also the fallback when it fails. */
  rawText: string;
}
|
||||
|
||||
/**
 * Payload of a job on the `rag:embedding` queue: the preprocessing job data
 * plus the text that will be chunked and embedded.
 */
export interface EmbeddingJobData extends ThaiPreprocessJobData {
  /** Output of the normalization step, or the raw text when that step failed. */
  normalizedText: string;
}
|
||||
|
||||
/**
 * BullMQ worker for the `rag:thai-preprocess` queue.
 *
 * It posts the raw text to the normalization service configured via
 * `THAI_PREPROCESS_URL` and forwards the result to the `rag:embedding` queue.
 * Normalization is best-effort: on any failure the raw text is forwarded
 * unchanged.
 */
@Processor('rag:thai-preprocess')
export class ThaiPreprocessProcessor extends WorkerHost {
  private readonly logger = new Logger(ThaiPreprocessProcessor.name);
  private readonly thaiUrl: string;

  constructor(
    private readonly configService: ConfigService,
    @InjectQueue('rag:embedding') private readonly embeddingQueue: Queue
  ) {
    super();
    // Trailing slashes are stripped so the endpoint path never becomes "//normalize".
    this.thaiUrl = this.configService
      .get<string>('THAI_PREPROCESS_URL', 'http://localhost:8765')
      .replace(/\/+$/, '');
  }

  /**
   * Normalizes the job's raw text and enqueues the embedding step.
   *
   * An `embed` job with the id `embed:<attachmentPublicId>` is always added,
   * whether or not normalization succeeded.
   *
   * @param job Job whose data carries the raw text and document metadata.
   */
  async process(job: Job<ThaiPreprocessJobData>): Promise<void> {
    const { rawText, attachmentPublicId } = job.data;

    let normalizedText = rawText;
    try {
      // The body is typed loosely because it comes from an external service.
      const response = await axios.post<{ normalized?: unknown } | null>(
        `${this.thaiUrl}/normalize`,
        { text: rawText },
        { timeout: 30000 }
      );

      const normalized = response.data?.normalized;
      if (typeof normalized === 'string') {
        normalizedText = normalized;
      } else {
        // A missing or non-string field must not reach the embedding worker.
        this.logger.warn(
          `Thai preprocess returned no normalized text for ${attachmentPublicId}, using raw text`
        );
      }
    } catch (err: unknown) {
      this.logger.warn(
        `Thai preprocess failed for ${attachmentPublicId}, using raw text: ${err instanceof Error ? err.message : String(err)}`
      );
    }

    // Declared with its type so a missing field is a compile error.
    const embeddingJob: EmbeddingJobData = { ...job.data, normalizedText };

    await this.embeddingQueue.add('embed', embeddingJob, {
      jobId: `embed:${attachmentPublicId}`,
    });
  }
}
|
||||
Reference in New Issue
Block a user