Files
lcbp3/backend/src/modules/rag/processors/ocr.processor.ts
T
admin 3143dd7263
CI / CD Pipeline / build (push) Successful in 4m53s
CI / CD Pipeline / deploy (push) Failing after 5m7s
690421:1628 Update RAG Module #01
2026-04-21 16:28:23 +07:00

69 lines
1.8 KiB
TypeScript

import { Processor, WorkerHost } from '@nestjs/bullmq';
import { Logger } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm';
import { Repository } from 'typeorm';
import { Job } from 'bullmq';
import * as fs from 'fs';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { DocumentChunk } from '../entities/document-chunk.entity';
export interface OcrJobData {
attachmentPublicId: string;
filePath: string;
docType: string;
docNumber: string | null;
revision: string | null;
projectCode: string;
projectPublicId: string;
classification: 'PUBLIC' | 'INTERNAL' | 'CONFIDENTIAL';
}
@Processor('rag-ocr')
export class OcrProcessor extends WorkerHost {
private readonly logger = new Logger(OcrProcessor.name);
constructor(
@InjectQueue('rag-thai-preprocess') private readonly thaiQueue: Queue,
@InjectRepository(DocumentChunk)
private readonly chunkRepo: Repository<DocumentChunk>
) {
super();
}
async process(job: Job<OcrJobData>): Promise<void> {
const { attachmentPublicId, filePath } = job.data;
const existing = await this.chunkRepo.count({
where: { documentId: attachmentPublicId },
});
if (existing > 0) {
this.logger.log(
`rag-ocr job already indexed for ${attachmentPublicId}, skipping`
);
return;
}
await this.chunkRepo.manager.query(
`UPDATE attachments SET rag_status = 'PROCESSING' WHERE public_id = ?`,
[attachmentPublicId]
);
let rawText: string;
try {
rawText = fs.readFileSync(filePath, 'utf-8');
} catch {
rawText = `[binary:${attachmentPublicId}]`;
}
await this.thaiQueue.add(
'preprocess',
{ ...job.data, rawText },
{ jobId: `thai:${attachmentPublicId}` }
);
this.logger.log(`OCR enqueued thai-preprocess for ${attachmentPublicId}`);
}
}