690601:1929 ADR-032-232 #08
This commit is contained in:
@@ -8,6 +8,8 @@
|
|||||||
// - 2026-05-30: เพิ่ม VRAM insufficiency guard สำหรับ Typhoon OCR engine (T016a, ADR-032)
|
// - 2026-05-30: เพิ่ม VRAM insufficiency guard สำหรับ Typhoon OCR engine (T016a, ADR-032)
|
||||||
// - 2026-05-30: ปรับปรุงสำหรับ Dynamic OCR Engine selection, Caching, และ Graceful Fallback (T013, T014, T016, T022, T023, US1)
|
// - 2026-05-30: ปรับปรุงสำหรับ Dynamic OCR Engine selection, Caching, และ Graceful Fallback (T013, T014, T016, T022, T023, US1)
|
||||||
// - 2026-06-01: ปรับปรุง remapPath ให้รองรับ Windows absolute และ relative path ได้แม่นยำ 100%
|
// - 2026-06-01: ปรับปรุง remapPath ให้รองรับ Windows absolute และ relative path ได้แม่นยำ 100%
|
||||||
|
// - 2026-06-01: เปลี่ยน processWithTesseract/processWithTyphoon ให้ส่ง file content ผ่าน multipart
|
||||||
|
// ไปยัง /ocr-upload แทนการส่ง path (แก้ปัญหา Docker WSL2 mount ไม่ได้)
|
||||||
|
|
||||||
import { Injectable, Logger, NotFoundException } from '@nestjs/common';
|
import { Injectable, Logger, NotFoundException } from '@nestjs/common';
|
||||||
import { ConfigService } from '@nestjs/config';
|
import { ConfigService } from '@nestjs/config';
|
||||||
@@ -16,6 +18,7 @@ import Redis from 'ioredis';
|
|||||||
import { InjectRepository } from '@nestjs/typeorm';
|
import { InjectRepository } from '@nestjs/typeorm';
|
||||||
import { Repository, EntityManager } from 'typeorm';
|
import { Repository, EntityManager } from 'typeorm';
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
|
import * as fs from 'fs';
|
||||||
import {
|
import {
|
||||||
OcrEngineConfiguration,
|
OcrEngineConfiguration,
|
||||||
OcrEngineType,
|
OcrEngineType,
|
||||||
@@ -96,8 +99,6 @@ export class OcrService {
|
|||||||
private readonly logger = new Logger(OcrService.name);
|
private readonly logger = new Logger(OcrService.name);
|
||||||
private readonly threshold: number;
|
private readonly threshold: number;
|
||||||
private readonly ocrApiUrl: string;
|
private readonly ocrApiUrl: string;
|
||||||
private readonly localUploadBase: string;
|
|
||||||
private readonly sidecarUploadBase: string;
|
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
private readonly configService: ConfigService,
|
private readonly configService: ConfigService,
|
||||||
@@ -114,13 +115,6 @@ export class OcrService {
|
|||||||
'OCR_API_URL',
|
'OCR_API_URL',
|
||||||
'http://localhost:8765'
|
'http://localhost:8765'
|
||||||
);
|
);
|
||||||
this.localUploadBase = this.configService
|
|
||||||
.get<string>('UPLOAD_PERMANENT_DIR', '/app/uploads/permanent')
|
|
||||||
.replace(/\/permanent$/, '');
|
|
||||||
this.sidecarUploadBase = this.configService.get<string>(
|
|
||||||
'OCR_SIDECAR_UPLOAD_BASE',
|
|
||||||
'/mnt/uploads'
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** ดึงรายการ OCR Engines ทั้งหมด พร้อมตรวจสอบตัวที่กำลัง Active */
|
/** ดึงรายการ OCR Engines ทั้งหมด พร้อมตรวจสอบตัวที่กำลัง Active */
|
||||||
@@ -198,57 +192,6 @@ export class OcrService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Translates a backend-local upload path into the location under
 * `sidecarUploadBase` that is handed to the OCR sidecar.
 *
 * Resolution order (first hit wins):
 *  1. everything after the leftmost `/uploads/` (case-insensitive);
 *  2. a relative path that begins with `uploads/`;
 *  3. everything after the first occurrence of `localUploadBase`.
 * When nothing applies, the slash-normalized input is returned unchanged.
 *
 * @param localPath Path as known to the backend; backslashes are accepted.
 * @returns The remapped path, or `localPath` itself when it is empty.
 */
private remapPath(localPath: string): string {
  if (!localPath) {
    return localPath;
  }

  // Backslashes become forward slashes, then runs of slashes collapse to one.
  const toForwardSlashes = (raw: string): string =>
    raw.replace(/\\/g, '/').replace(/\/+/g, '/');
  const unixPath = toForwardSlashes(localPath);
  const sidecarRoot = this.sidecarUploadBase.replace(/\/+$/, '');
  // Joins a tail onto the sidecar root without producing a double slash.
  const underSidecar = (tail: string): string =>
    `${sidecarRoot}/${tail.replace(/^\/+/, '')}`;

  // 1. Absolute path (Windows or POSIX) containing an /uploads/ segment.
  const uploadsTail = /\/uploads\/(.+)$/i.exec(unixPath)?.[1];
  if (uploadsTail) {
    const target = underSidecar(uploadsTail);
    this.logger.debug(
      `Mapped Windows path "${localPath}" to Sidecar path "${target}"`
    );
    return target;
  }

  // 2. Relative path such as "uploads/temp/xxx.pdf".
  const relativePrefix = 'uploads/';
  if (unixPath.startsWith(relativePrefix)) {
    const target = underSidecar(unixPath.slice(relativePrefix.length));
    this.logger.debug(`Mapped relative path "${localPath}" to "${target}"`);
    return target;
  }

  // 3. Fallback: strip everything up to and including the configured local base.
  const localRoot = toForwardSlashes(this.localUploadBase);
  const rootAt = localRoot ? unixPath.indexOf(localRoot) : -1;
  if (rootAt !== -1) {
    const target = underSidecar(unixPath.slice(rootAt + localRoot.length));
    this.logger.debug(`Mapped fallback path "${localPath}" to "${target}"`);
    return target;
  }

  return unixPath;
}
|
|
||||||
|
|
||||||
/** ตรวจสอบสุขภาพและ latency ของ OCR sidecar (Tesseract) ผ่าน GET /health */
|
|
||||||
async checkHealth(): Promise<OcrHealthResult> {
|
async checkHealth(): Promise<OcrHealthResult> {
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
try {
|
try {
|
||||||
@@ -295,26 +238,28 @@ export class OcrService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** ประมวลผลผ่าน Tesseract OCR */
|
/** ประมวลผลผ่าน Tesseract OCR โดยส่ง file content ผ่าน multipart */
|
||||||
private async processWithTesseract(
|
private async processWithTesseract(
|
||||||
input: OcrDetectionInput
|
input: OcrDetectionInput
|
||||||
): Promise<OcrDetectionResult> {
|
): Promise<OcrDetectionResult> {
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const sidecarPath = this.remapPath(input.pdfPath!);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
this.logger.debug(
|
this.logger.debug(`Tesseract OCR processing: ${input.pdfPath}`);
|
||||||
`Tesseract OCR processing: ${input.pdfPath} → ${sidecarPath}`
|
const fileBuffer = fs.readFileSync(input.pdfPath!);
|
||||||
|
const form = new FormData();
|
||||||
|
form.append(
|
||||||
|
'file',
|
||||||
|
new Blob([fileBuffer], { type: 'application/pdf' }),
|
||||||
|
'upload.pdf'
|
||||||
);
|
);
|
||||||
|
form.append('engine', 'auto');
|
||||||
const response = await axios.post<OcrSidecarResponse>(
|
const response = await axios.post<OcrSidecarResponse>(
|
||||||
`${this.ocrApiUrl}/ocr`,
|
`${this.ocrApiUrl}/ocr-upload`,
|
||||||
{ pdfPath: sidecarPath },
|
form,
|
||||||
{ timeout: 90000 }
|
{ timeout: 90000 }
|
||||||
);
|
);
|
||||||
|
|
||||||
const text = response.data.text ?? '';
|
const text = response.data.text ?? '';
|
||||||
const durationMs = Date.now() - startTime;
|
const durationMs = Date.now() - startTime;
|
||||||
|
|
||||||
await this.writeAuditLog({
|
await this.writeAuditLog({
|
||||||
documentPublicId: input.documentPublicId,
|
documentPublicId: input.documentPublicId,
|
||||||
aiModel: 'tesseract',
|
aiModel: 'tesseract',
|
||||||
@@ -324,26 +269,9 @@ export class OcrService {
|
|||||||
processingTimeMs: durationMs,
|
processingTimeMs: durationMs,
|
||||||
cacheHit: false,
|
cacheHit: false,
|
||||||
});
|
});
|
||||||
|
return { text, ocrUsed: true };
|
||||||
return {
|
|
||||||
text,
|
|
||||||
ocrUsed: true,
|
|
||||||
};
|
|
||||||
} catch (err: unknown) {
|
} catch (err: unknown) {
|
||||||
const durationMs = Date.now() - startTime;
|
const durationMs = Date.now() - startTime;
|
||||||
// ดึง axios response body detail ออกมาด้วย (เช่น ไม่พบไฟล์: /mnt/uploads/...)
|
|
||||||
const axiosDetail =
|
|
||||||
err !== null &&
|
|
||||||
typeof err === 'object' &&
|
|
||||||
'response' in err &&
|
|
||||||
err.response !== null &&
|
|
||||||
typeof err.response === 'object' &&
|
|
||||||
'data' in err.response &&
|
|
||||||
err.response.data !== null &&
|
|
||||||
typeof err.response.data === 'object' &&
|
|
||||||
'detail' in err.response.data
|
|
||||||
? String((err.response.data as { detail: unknown }).detail)
|
|
||||||
: null;
|
|
||||||
const cause =
|
const cause =
|
||||||
err instanceof AggregateError && err.errors?.length
|
err instanceof AggregateError && err.errors?.length
|
||||||
? err.errors
|
? err.errors
|
||||||
@@ -352,10 +280,6 @@ export class OcrService {
|
|||||||
: err instanceof Error
|
: err instanceof Error
|
||||||
? err.message
|
? err.message
|
||||||
: String(err);
|
: String(err);
|
||||||
const fullCause = axiosDetail
|
|
||||||
? `${cause} — sidecar detail: ${axiosDetail} (sidecarPath: ${sidecarPath})`
|
|
||||||
: `${cause} (sidecarPath: ${sidecarPath})`;
|
|
||||||
|
|
||||||
await this.writeAuditLog({
|
await this.writeAuditLog({
|
||||||
documentPublicId: input.documentPublicId,
|
documentPublicId: input.documentPublicId,
|
||||||
aiModel: 'tesseract',
|
aiModel: 'tesseract',
|
||||||
@@ -363,11 +287,10 @@ export class OcrService {
|
|||||||
modelType: 'tesseract',
|
modelType: 'tesseract',
|
||||||
status: AiAuditStatus.FAILED,
|
status: AiAuditStatus.FAILED,
|
||||||
processingTimeMs: durationMs,
|
processingTimeMs: durationMs,
|
||||||
errorMessage: fullCause,
|
errorMessage: cause,
|
||||||
cacheHit: false,
|
cacheHit: false,
|
||||||
});
|
});
|
||||||
|
throw new Error(`Tesseract OCR Sidecar failed: ${cause}`);
|
||||||
throw new Error(`Tesseract OCR Sidecar failed: ${fullCause}`);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -376,8 +299,6 @@ export class OcrService {
|
|||||||
input: OcrDetectionInput
|
input: OcrDetectionInput
|
||||||
): Promise<OcrDetectionResult> {
|
): Promise<OcrDetectionResult> {
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const sidecarPath = this.remapPath(input.pdfPath!);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// 1. ตรวจสอบ VRAM insufficiency guard
|
// 1. ตรวจสอบ VRAM insufficiency guard
|
||||||
const hasCapacity = await this.vramMonitorService.hasVramCapacity(
|
const hasCapacity = await this.vramMonitorService.hasVramCapacity(
|
||||||
@@ -390,15 +311,18 @@ export class OcrService {
|
|||||||
return this.processWithTesseract(input);
|
return this.processWithTesseract(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.logger.debug(
|
this.logger.debug(`Typhoon OCR processing: ${input.pdfPath}`);
|
||||||
`Typhoon OCR processing: ${input.pdfPath} → ${sidecarPath}`
|
const fileBuffer = fs.readFileSync(input.pdfPath!);
|
||||||
|
const form = new FormData();
|
||||||
|
form.append(
|
||||||
|
'file',
|
||||||
|
new Blob([fileBuffer], { type: 'application/pdf' }),
|
||||||
|
'upload.pdf'
|
||||||
);
|
);
|
||||||
|
form.append('engine', 'typhoon-ocr-3b');
|
||||||
const response = await axios.post<OcrSidecarResponse>(
|
const response = await axios.post<OcrSidecarResponse>(
|
||||||
`${this.ocrApiUrl}/ocr`,
|
`${this.ocrApiUrl}/ocr-upload`,
|
||||||
{
|
form,
|
||||||
pdfPath: sidecarPath,
|
|
||||||
engine: 'typhoon-ocr-3b',
|
|
||||||
},
|
|
||||||
{ timeout: 120000 }
|
{ timeout: 120000 }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
// File: src/modules/ai/services/sandbox-ocr-engine.service.ts
|
// File: src/modules/ai/services/sandbox-ocr-engine.service.ts
|
||||||
// Change Log
|
// Change Log
|
||||||
// - 2026-05-30: แยก SandboxOcrEngineService ออกจาก OcrService เพื่อรองรับการเลือก Typhoon OCR เฉพาะ sandbox โดยไม่กระทบ core OCR flow
|
// - 2026-05-30: แยก SandboxOcrEngineService ออกจาก OcrService เพื่อรองรับการเลือก Typhoon OCR เฉพาะ sandbox โดยไม่กระทบ core OCR flow
|
||||||
// - 2026-06-01: ปรับปรุง remapPath ให้รองรับ Windows absolute และ relative path ได้แม่นยำ 100%
|
// - 2026-06-01: เปลี่ยนจาก remapPath + pdfPath ไปเป็น multipart file upload ไปยัง /ocr-upload (แก้ปัญหา Docker WSL2 mount)
|
||||||
|
|
||||||
import { Injectable, Logger } from '@nestjs/common';
|
import { Injectable, Logger } from '@nestjs/common';
|
||||||
import { ConfigService } from '@nestjs/config';
|
import { ConfigService } from '@nestjs/config';
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
|
import * as fs from 'fs';
|
||||||
import { OcrService } from './ocr.service';
|
import { OcrService } from './ocr.service';
|
||||||
|
|
||||||
export type SandboxOcrEngineType = 'auto' | 'tesseract' | 'typhoon-ocr-3b';
|
export type SandboxOcrEngineType = 'auto' | 'tesseract' | 'typhoon-ocr-3b';
|
||||||
@@ -28,8 +29,6 @@ export interface SandboxOcrResult {
|
|||||||
export class SandboxOcrEngineService {
|
export class SandboxOcrEngineService {
|
||||||
private readonly logger = new Logger(SandboxOcrEngineService.name);
|
private readonly logger = new Logger(SandboxOcrEngineService.name);
|
||||||
private readonly ocrApiUrl: string;
|
private readonly ocrApiUrl: string;
|
||||||
private readonly localUploadBase: string;
|
|
||||||
private readonly sidecarUploadBase: string;
|
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
private readonly configService: ConfigService,
|
private readonly configService: ConfigService,
|
||||||
@@ -39,63 +38,6 @@ export class SandboxOcrEngineService {
|
|||||||
'OCR_API_URL',
|
'OCR_API_URL',
|
||||||
'http://localhost:8765'
|
'http://localhost:8765'
|
||||||
);
|
);
|
||||||
this.localUploadBase = this.configService
|
|
||||||
.get<string>('UPLOAD_PERMANENT_DIR', '/app/uploads/permanent')
|
|
||||||
.replace(/\/permanent$/, '');
|
|
||||||
this.sidecarUploadBase = this.configService.get<string>(
|
|
||||||
'OCR_SIDECAR_UPLOAD_BASE',
|
|
||||||
'/mnt/uploads'
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Translates a backend-local upload path into the location under
 * `sidecarUploadBase` that is handed to the OCR sidecar.
 *
 * Resolution order (first hit wins):
 *  1. everything after the leftmost `/uploads/` (case-insensitive);
 *  2. a relative path that begins with `uploads/`;
 *  3. everything after the first occurrence of `localUploadBase`.
 * When nothing applies, the slash-normalized input is returned unchanged.
 *
 * @param localPath Path as known to the backend; backslashes are accepted.
 * @returns The remapped path, or `localPath` itself when it is empty.
 */
private remapPath(localPath: string): string {
  if (!localPath) {
    return localPath;
  }

  // Backslashes become forward slashes, then runs of slashes collapse to one.
  const toForwardSlashes = (raw: string): string =>
    raw.replace(/\\/g, '/').replace(/\/+/g, '/');
  const unixPath = toForwardSlashes(localPath);
  const sidecarRoot = this.sidecarUploadBase.replace(/\/+$/, '');
  // Joins a tail onto the sidecar root without producing a double slash.
  const underSidecar = (tail: string): string =>
    `${sidecarRoot}/${tail.replace(/^\/+/, '')}`;

  // 1. Absolute path (Windows or POSIX) containing an /uploads/ segment.
  const uploadsTail = /\/uploads\/(.+)$/i.exec(unixPath)?.[1];
  if (uploadsTail) {
    const target = underSidecar(uploadsTail);
    this.logger.debug(
      `Mapped Windows path "${localPath}" to Sidecar path "${target}"`
    );
    return target;
  }

  // 2. Relative path such as "uploads/temp/xxx.pdf".
  const relativePrefix = 'uploads/';
  if (unixPath.startsWith(relativePrefix)) {
    const target = underSidecar(unixPath.slice(relativePrefix.length));
    this.logger.debug(`Mapped relative path "${localPath}" to "${target}"`);
    return target;
  }

  // 3. Fallback: strip everything up to and including the configured local base.
  const localRoot = toForwardSlashes(this.localUploadBase);
  const rootAt = localRoot ? unixPath.indexOf(localRoot) : -1;
  if (rootAt !== -1) {
    const target = underSidecar(unixPath.slice(rootAt + localRoot.length));
    this.logger.debug(`Mapped fallback path "${localPath}" to "${target}"`);
    return target;
  }

  return unixPath;
}
|
}
|
||||||
|
|
||||||
/** รัน OCR ตาม engine ที่เลือก โดย fallback กลับไป Tesseract baseline เมื่อ Typhoon ล้มเหลว */
|
/** รัน OCR ตาม engine ที่เลือก โดย fallback กลับไป Tesseract baseline เมื่อ Typhoon ล้มเหลว */
|
||||||
@@ -114,12 +56,17 @@ export class SandboxOcrEngineService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const fileBuffer = fs.readFileSync(pdfPath);
|
||||||
|
const form = new FormData();
|
||||||
|
form.append(
|
||||||
|
'file',
|
||||||
|
new Blob([fileBuffer], { type: 'application/pdf' }),
|
||||||
|
'upload.pdf'
|
||||||
|
);
|
||||||
|
form.append('engine', engineType);
|
||||||
const response = await axios.post<SandboxOcrSidecarResponse>(
|
const response = await axios.post<SandboxOcrSidecarResponse>(
|
||||||
`${this.ocrApiUrl}/ocr`,
|
`${this.ocrApiUrl}/ocr-upload`,
|
||||||
{
|
form,
|
||||||
pdfPath: this.remapPath(pdfPath),
|
|
||||||
engine: engineType,
|
|
||||||
},
|
|
||||||
{ timeout: 120000 }
|
{ timeout: 120000 }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
# - 2026-05-30: เปลี่ยน lang='en' เป็น lang='ch' (CTJK) เพื่อรองรับภาษาไทย
|
# - 2026-05-30: เปลี่ยน lang='en' เป็น lang='ch' (CTJK) เพื่อรองรับภาษาไทย
|
||||||
# - 2026-05-30: เปลี่ยนจาก PaddleOCR เป็น Tesseract OCR เพื่อความเข้ากันได้กับ CPU เก่า
|
# - 2026-05-30: เปลี่ยนจาก PaddleOCR เป็น Tesseract OCR เพื่อความเข้ากันได้กับ CPU เก่า
|
||||||
# - 2026-05-30: เพิ่ม OpenCV preprocessing (threshold, denoise) และ DPI 300 เพื่อเพิ่มความแม่นยำ
|
# - 2026-05-30: เพิ่ม OpenCV preprocessing (threshold, denoise) และ DPI 300 เพื่อเพิ่มความแม่นยำ
|
||||||
|
# - 2026-06-01: เพิ่ม POST /ocr-upload รับ multipart file โดยตรง ไม่ต้องพึ่ง shared volume mount
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
@@ -21,7 +22,7 @@ import io
|
|||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from pythainlp.tokenize import word_tokenize
|
from pythainlp.tokenize import word_tokenize
|
||||||
from pythainlp.util import normalize as thai_normalize
|
from pythainlp.util import normalize as thai_normalize
|
||||||
@@ -122,6 +123,71 @@ def health():
|
|||||||
return {"status": "ok", "engine": "tesseract"}
|
return {"status": "ok", "engine": "tesseract"}
|
||||||
|
|
||||||
|
|
||||||
|
def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int) -> OcrResponse:
    """Extract text from an opened PDF with the selected engine.

    Shared by the /ocr and /ocr-upload endpoints.

    Engine selection:
      * "auto": read the embedded text layer first; if it yields more than
        OCR_CHAR_THRESHOLD characters, return it without OCR
        (engineUsed="fast-path"), otherwise fall through to Tesseract.
      * "typhoon-ocr-3b": render each page and pass it to
        process_with_typhoon_ocr.
      * anything else (including "tesseract"): render each page and run
        pytesseract on it.

    Args:
        doc: Opened PyMuPDF document. This function takes ownership of it and
            closes it before returning or raising; both visible callers hand
            it over and return immediately.
        selected_engine: Engine name; callers pass it stripped and lower-cased.
        max_pages: Upper bound on pages processed; a value <= 0 means all pages.

    Returns:
        OcrResponse with the extracted text, page/char counts and engine used.
    """
    try:
        total_pages = len(doc)
        page_limit = min(total_pages, max_pages) if max_pages > 0 else total_pages
        pages_to_process = range(page_limit)
        page_count = len(pages_to_process)

        total_chars = 0
        if selected_engine == "auto":
            # Fast path: try the embedded text layer before paying for OCR.
            fast_text = "\n".join(doc[i].get_text() for i in pages_to_process).strip()
            total_chars = len(fast_text)
            if total_chars > OCR_CHAR_THRESHOLD:
                logger.info(f"Fast path: {total_chars} chars extracted")
                return OcrResponse(
                    text=fast_text,
                    ocrUsed=False,
                    pageCount=page_count,
                    charCount=total_chars,
                    engineUsed="fast-path",
                )

        if selected_engine == "typhoon-ocr-3b":
            typhoon_text_parts = [
                process_with_typhoon_ocr(_render_page_for_ocr(doc[i]))
                for i in pages_to_process
            ]
            typhoon_text = filter_ocr_noise("\n".join(typhoon_text_parts).strip())
            return OcrResponse(
                text=typhoon_text,
                ocrUsed=True,
                pageCount=page_count,
                charCount=len(typhoon_text),
                engineUsed="typhoon-ocr-3b",
            )

        if selected_engine == "auto":
            logger.info(f"Slow path (Tesseract): {total_chars} chars too few")
        else:
            # No text-layer probe ran here, so "chars too few" would be misleading.
            logger.info(f"Tesseract path: engine={selected_engine}")

        ocr_text_parts = []
        for i in pages_to_process:
            text = pytesseract.image_to_string(
                _render_page_for_ocr(doc[i]), lang=OCR_LANG, config=TESSERACT_CONFIG
            )
            ocr_text_parts.append(text.strip())

        ocr_text = filter_ocr_noise("\n".join(ocr_text_parts).strip())
        logger.info(f"Tesseract extracted {len(ocr_text)} chars")
        return OcrResponse(
            text=ocr_text,
            ocrUsed=True,
            pageCount=page_count,
            charCount=len(ocr_text),
            engineUsed="tesseract",
        )
    finally:
        # Release the document even when an OCR step raises; the guard avoids
        # calling close() on a document that is already closed.
        if not doc.is_closed:
            doc.close()


def _render_page_for_ocr(page):
    """Render one PDF page at 300 DPI, then crop and preprocess it for OCR.

    Returns whatever preprocess_image returns for the cropped page image.
    """
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
    return preprocess_image(cropped_img)
|
||||||
|
|
||||||
|
|
||||||
def process_with_typhoon_ocr(pil_image: Image.Image) -> str:
|
def process_with_typhoon_ocr(pil_image: Image.Image) -> str:
|
||||||
"""เรียก Typhoon OCR ผ่าน Ollama สำหรับ sandbox option โดยไม่แตะ backend DB/storage"""
|
"""เรียก Typhoon OCR ผ่าน Ollama สำหรับ sandbox option โดยไม่แตะ backend DB/storage"""
|
||||||
img_buffer = io.BytesIO()
|
img_buffer = io.BytesIO()
|
||||||
@@ -148,92 +214,35 @@ def process_with_typhoon_ocr(pil_image: Image.Image) -> str:
|
|||||||
|
|
||||||
@app.post("/ocr", response_model=OcrResponse)
|
@app.post("/ocr", response_model=OcrResponse)
|
||||||
def ocr_extract(req: OcrRequest):
|
def ocr_extract(req: OcrRequest):
|
||||||
|
"""OCR จาก path (legacy — ใช้เมื่อ sidecar และ backend เข้าถึง storage เดียวกัน)"""
|
||||||
pdf_path = Path(req.pdfPath)
|
pdf_path = Path(req.pdfPath)
|
||||||
if not pdf_path.exists():
|
if not pdf_path.exists():
|
||||||
raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
|
raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
|
||||||
|
|
||||||
selected_engine = (req.engine or "auto").strip().lower()
|
selected_engine = (req.engine or "auto").strip().lower()
|
||||||
max_pages = req.maxPages or MAX_PAGES
|
max_pages = req.maxPages or MAX_PAGES
|
||||||
|
|
||||||
try:
|
try:
|
||||||
doc = fitz.open(str(pdf_path))
|
doc = fitz.open(str(pdf_path))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
|
raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
|
||||||
|
return _process_pdf_doc(doc, selected_engine, max_pages)
|
||||||
|
|
||||||
pages_to_process = list(range(min(len(doc), max_pages) if max_pages > 0 else len(doc)))
|
|
||||||
page_count = len(pages_to_process)
|
|
||||||
|
|
||||||
fast_text_parts = []
|
@app.post("/ocr-upload", response_model=OcrResponse)
|
||||||
total_chars = 0
|
def ocr_upload(
|
||||||
if selected_engine == "auto":
|
file: UploadFile = File(...),
|
||||||
# Fast path: ลอง extract text layer ก่อน
|
engine: str = Form(default="auto"),
|
||||||
for i in pages_to_process:
|
maxPages: int = Form(default=0),
|
||||||
page = doc[i]
|
):
|
||||||
fast_text_parts.append(page.get_text())
|
"""OCR จาก multipart file upload — ไม่ต้องการ shared volume mount"""
|
||||||
fast_text = "\n".join(fast_text_parts).strip()
|
selected_engine = engine.strip().lower()
|
||||||
total_chars = len(fast_text)
|
max_pages = maxPages or MAX_PAGES
|
||||||
if total_chars > OCR_CHAR_THRESHOLD:
|
pdf_bytes = file.file.read()
|
||||||
logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
|
try:
|
||||||
return OcrResponse(
|
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
||||||
text=fast_text,
|
except Exception as e:
|
||||||
ocrUsed=False,
|
raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
|
||||||
pageCount=page_count,
|
logger.info(f"OCR upload: {file.filename} engine={selected_engine}")
|
||||||
charCount=total_chars,
|
return _process_pdf_doc(doc, selected_engine, max_pages)
|
||||||
engineUsed="fast-path",
|
|
||||||
)
|
|
||||||
|
|
||||||
if selected_engine == "typhoon-ocr-3b":
|
|
||||||
logger.info(f"Typhoon OCR path: {pdf_path.name}")
|
|
||||||
typhoon_text_parts = []
|
|
||||||
for i in pages_to_process:
|
|
||||||
page = doc[i]
|
|
||||||
pix = page.get_pixmap(dpi=300)
|
|
||||||
img_bytes = pix.tobytes("png")
|
|
||||||
img = Image.open(io.BytesIO(img_bytes))
|
|
||||||
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
|
||||||
processed_img = preprocess_image(cropped_img)
|
|
||||||
typhoon_text_parts.append(process_with_typhoon_ocr(processed_img))
|
|
||||||
typhoon_text = filter_ocr_noise("\n".join(typhoon_text_parts).strip())
|
|
||||||
return OcrResponse(
|
|
||||||
text=typhoon_text,
|
|
||||||
ocrUsed=True,
|
|
||||||
pageCount=page_count,
|
|
||||||
charCount=len(typhoon_text),
|
|
||||||
engineUsed="typhoon-ocr-3b",
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"Slow path (Tesseract): {total_chars} chars too few for {pdf_path.name}")
|
|
||||||
ocr_text_parts = []
|
|
||||||
for i in pages_to_process:
|
|
||||||
page = doc[i]
|
|
||||||
pix = page.get_pixmap(dpi=300) # เพิ่ม DPI เป็น 300 เพื่อความชัด
|
|
||||||
img_bytes = pix.tobytes("png")
|
|
||||||
img = Image.open(io.BytesIO(img_bytes))
|
|
||||||
|
|
||||||
# Crop header/footer ก่อนเพื่อลบข้อความที่ไม่จำเป็น
|
|
||||||
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
|
||||||
|
|
||||||
# Preprocess ด้วย OpenCV เพื่อเพิ่มความแม่นยำ
|
|
||||||
processed_img = preprocess_image(cropped_img)
|
|
||||||
|
|
||||||
# OCR ด้วย Tesseract โดยใช้ PSM 6 และ OEM 1
|
|
||||||
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
|
|
||||||
ocr_text_parts.append(text.strip())
|
|
||||||
|
|
||||||
ocr_text = "\n".join(ocr_text_parts).strip()
|
|
||||||
|
|
||||||
# Filter ขยะ OCR หลังจากสกัดข้อความแล้ว
|
|
||||||
ocr_text = filter_ocr_noise(ocr_text)
|
|
||||||
|
|
||||||
logger.info(f"Tesseract extracted {len(ocr_text)} chars from {pdf_path.name}")
|
|
||||||
|
|
||||||
return OcrResponse(
|
|
||||||
text=ocr_text,
|
|
||||||
ocrUsed=True,
|
|
||||||
pageCount=page_count,
|
|
||||||
charCount=len(ocr_text),
|
|
||||||
engineUsed="tesseract",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class NormalizeRequest(BaseModel):
|
class NormalizeRequest(BaseModel):
|
||||||
|
|||||||
+2
-10
@@ -7,9 +7,8 @@
|
|||||||
# - 2026-05-30: เพิ่ม Typhoon OCR environment variables (T009b, ADR-032)
|
# - 2026-05-30: เพิ่ม Typhoon OCR environment variables (T009b, ADR-032)
|
||||||
# OLLAMA_API_URL ชี้ไปที่ http://192.168.10.100:11434 (Admin Desktop LAN IP)
|
# OLLAMA_API_URL ชี้ไปที่ http://192.168.10.100:11434 (Admin Desktop LAN IP)
|
||||||
# - 2026-05-30: Revert volumes กลับไปใช้ Windows Z: drive bind mount (แทน CIFS volume driver ที่พัง)
|
# - 2026-05-30: Revert volumes กลับไปใช้ Windows Z: drive bind mount (แทน CIFS volume driver ที่พัง)
|
||||||
# - 2026-06-01: แก้ volumes เปลี่ยนจาก Z: drive bind mount (ไม่ทำงานบน WSL2)
|
# - 2026-06-01: ลบ volumes ออกทั้งหมด — backend ส่ง file content ผ่าน multipart /ocr-upload แทน
|
||||||
# เป็น CIFS named volume ชี้ตรงไปที่ UNC path \\192.168.10.8\np-dms-as\data\uploads
|
# ไม่ต้องการ shared storage อีกต่อไป
|
||||||
# ต้องสร้างไฟล์ .env ที่ Desk-5439 (ดูตัวอย่างใน .env.example)
|
|
||||||
#
|
#
|
||||||
# วิธีรัน:
|
# วิธีรัน:
|
||||||
# docker compose up -d --build
|
# docker compose up -d --build
|
||||||
@@ -40,13 +39,6 @@ services:
|
|||||||
TYPHOON_OCR_MODEL: "scb10x/typhoon-ocr-3b"
|
TYPHOON_OCR_MODEL: "scb10x/typhoon-ocr-3b"
|
||||||
# Timeout 120 วินาที/หน้า (budget สำหรับ 3B model บน RTX 2060 Super)
|
# Timeout 120 วินาที/หน้า (budget สำหรับ 3B model บน RTX 2060 Super)
|
||||||
TYPHOON_OCR_TIMEOUT: "120"
|
TYPHOON_OCR_TIMEOUT: "120"
|
||||||
volumes:
|
|
||||||
# Uploads จาก QNAP NAS ผ่าน WSL2 mount path
|
|
||||||
# Z: = \\192.168.10.8\np-dms-as → WSL2 เห็นเป็น /mnt/z
|
|
||||||
# Docker Desktop bind mount จาก Windows path ใช้ //wsl.localhost/ ไม่ได้
|
|
||||||
# แต่ใช้ Windows absolute path ของ Z: ได้ผ่าน Docker Desktop settings
|
|
||||||
# วิธีที่ใช้งานได้: ระบุ source เป็น Windows UNC path โดยตรง
|
|
||||||
- //192.168.10.8/np-dms-as/data/uploads:/mnt/uploads:ro
|
|
||||||
logging:
|
logging:
|
||||||
driver: "json-file"
|
driver: "json-file"
|
||||||
options:
|
options:
|
||||||
|
|||||||
Reference in New Issue
Block a user