690525:1418 ADR-028-228-migration-OCR #01
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
// File: src/modules/ai/services/ocr.service.ts
|
// File: src/modules/ai/services/ocr.service.ts
|
||||||
// Change Log
|
// Change Log
|
||||||
// - 2026-05-15: เพิ่ม OCR auto-detection service สำหรับ ADR-023A.
|
// - 2026-05-15: เพิ่ม OCR auto-detection service สำหรับ ADR-023A.
|
||||||
|
// - 2026-05-25: แก้ไข AggregateError (empty message) จาก axios โดย wrap เป็น Error พร้อม context ที่ชัดเจน.
|
||||||
|
|
||||||
import { Injectable, Logger } from '@nestjs/common';
|
import { Injectable, Logger } from '@nestjs/common';
|
||||||
import { ConfigService } from '@nestjs/config';
|
import { ConfigService } from '@nestjs/config';
|
||||||
@@ -52,15 +53,28 @@ export class OcrService {
|
|||||||
return { text: extractedText, ocrUsed: false };
|
return { text: extractedText, ocrUsed: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
const response = await axios.post<PaddleOcrResponse>(
|
try {
|
||||||
`${this.ocrApiUrl}/ocr`,
|
const response = await axios.post<PaddleOcrResponse>(
|
||||||
{ pdfPath: input.pdfPath },
|
`${this.ocrApiUrl}/ocr`,
|
||||||
{ timeout: 90000 }
|
{ pdfPath: input.pdfPath },
|
||||||
);
|
{ timeout: 90000 }
|
||||||
|
);
|
||||||
return {
|
return {
|
||||||
text: response.data.text ?? '',
|
text: response.data.text ?? '',
|
||||||
ocrUsed: true,
|
ocrUsed: true,
|
||||||
};
|
};
|
||||||
|
} catch (err: unknown) {
|
||||||
|
const cause =
|
||||||
|
err instanceof AggregateError && err.errors?.length
|
||||||
|
? err.errors
|
||||||
|
.map((e: unknown) => (e instanceof Error ? e.message : String(e)))
|
||||||
|
.join('; ')
|
||||||
|
: err instanceof Error
|
||||||
|
? err.message
|
||||||
|
: String(err);
|
||||||
|
throw new Error(
|
||||||
|
`PaddleOCR sidecar unreachable at ${this.ocrApiUrl} — ${cause}`
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,37 @@
|
|||||||
|
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile
# PaddleOCR Sidecar — HTTP API server that extracts text from PDFs/images.
# Runs on Desk-5439 (GPU RTX 2060 Super 8GB) per ADR-023A.
# Change Log:
# - 2026-05-25: Initial Dockerfile for the PaddleOCR sidecar (port 8765)

FROM python:3.10-slim

# System dependencies for PDF processing and the image libraries.
# curl is needed by the HEALTHCHECK below; the slim base image does not ship it,
# so without it the container would always be reported as unhealthy.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libgl1 \
    libgomp1 \
    poppler-utils \
    curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python dependencies.
# Intended to use paddlepaddle-gpu for GPU acceleration (RTX 2060 Super — CUDA 11.x);
# switch to paddlepaddle (CPU only) as a fallback.
# NOTE(review): the requirements.txt added in this commit pins the CPU-only
# `paddlepaddle` wheel — confirm which variant is intended.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download the PaddleOCR models at build time to reduce cold-start latency.
# This runs before app.py is copied so that editing the application code does
# not invalidate the (large) model layer.
# NOTE(review): ADR-023A calls for Thai (th) + English (en) models, but only
# lang='en' is downloaded here — confirm the intended language set.
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)"

# Application source
COPY app.py .

EXPOSE 8765

HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8765/health || exit 1

CMD ["python", "app.py"]
|
||||||
@@ -0,0 +1,144 @@
|
|||||||
|
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
|
||||||
|
# PaddleOCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
|
||||||
|
# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else PaddleOCR)
|
||||||
|
# Change Log:
|
||||||
|
# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
|
||||||
|
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
from pythainlp.tokenize import word_tokenize
|
||||||
|
from pythainlp.util import normalize as thai_normalize
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ocr-sidecar")

app = FastAPI(title="PaddleOCR Sidecar", version="1.0.0")

# Runtime configuration, read once from the environment at import time.
# Minimum number of text-layer characters required to skip OCR.
OCR_CHAR_THRESHOLD = int(os.environ.get("OCR_CHAR_THRESHOLD", "100"))
# Only the exact string "true" (case-insensitive) enables the GPU.
USE_GPU = os.environ.get("USE_GPU", "false").lower() == "true"
# Default page cap per request; 0 means every page.
MAX_PAGES = int(os.environ.get("OCR_MAX_PAGES", "0"))

# Load the PaddleOCR model a single time at startup so individual requests
# do not pay the model-loading cost.
logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
ocr_engine = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=USE_GPU, show_log=False)
logger.info("PaddleOCR model loaded.")
|
||||||
|
|
||||||
|
|
||||||
|
class OcrRequest(BaseModel):
    """Request body for POST /ocr."""

    # Path of the document to read, resolved on this service's filesystem.
    pdfPath: str
    # Optional per-request page cap; when omitted (or 0) the handler falls
    # back to the MAX_PAGES setting.
    maxPages: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class OcrResponse(BaseModel):
    """Response body for POST /ocr."""

    # Extracted text, pages joined with newlines and stripped.
    text: str
    # True when the text came from PaddleOCR, False when the embedded text
    # layer was used.
    ocrUsed: bool
    # Number of pages that were actually processed (after the page cap).
    pageCount: int
    # Length of `text` in characters.
    charCount: int
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Liveness probe: always reports a static OK payload."""
    payload = {"status": "ok", "engine": "paddleocr"}
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr", response_model=OcrResponse)
def ocr_extract(req: OcrRequest):
    """Extract text from the document at ``req.pdfPath``.

    The embedded text layer is read first via PyMuPDF. If it yields more than
    ``OCR_CHAR_THRESHOLD`` characters it is returned directly
    (``ocrUsed=False``). Otherwise every selected page is rendered at 200 dpi
    and passed through PaddleOCR (``ocrUsed=True``).

    Only the first ``req.maxPages`` pages are read, falling back to
    ``MAX_PAGES`` when the request omits the field or passes 0. A non-positive
    limit means all pages.

    Raises:
        HTTPException: 404 when the path does not exist, 422 when PyMuPDF
            cannot open it.
    """
    # NOTE(review): pdfPath is caller-controlled and is opened as-is; consider
    # restricting it to the mounted uploads directory.
    pdf_path = Path(req.pdfPath)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")

    max_pages = req.maxPages or MAX_PAGES

    try:
        doc = fitz.open(str(pdf_path))
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}") from e

    # The document holds an open file handle; always release it, including
    # when OCR fails part-way through.
    try:
        total_pages = len(doc)
        page_count = min(total_pages, max_pages) if max_pages > 0 else total_pages
        pages_to_process = range(page_count)

        # Fast path: try the embedded text layer first.
        fast_text = "\n".join(doc[i].get_text() for i in pages_to_process).strip()
        total_chars = len(fast_text)

        if total_chars > OCR_CHAR_THRESHOLD:
            logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
            return OcrResponse(
                text=fast_text,
                ocrUsed=False,
                pageCount=page_count,
                charCount=total_chars,
            )

        # Slow path: rasterise each page and run PaddleOCR on it.
        logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
        ocr_text_parts = []
        for i in pages_to_process:
            img_bytes = doc[i].get_pixmap(dpi=200).tobytes("png")
            result = ocr_engine.ocr(img_bytes, cls=True)
            ocr_text_parts.extend(_collect_ocr_text(result))
    finally:
        doc.close()

    ocr_text = "\n".join(ocr_text_parts).strip()
    logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")

    return OcrResponse(
        text=ocr_text,
        ocrUsed=True,
        pageCount=page_count,
        charCount=len(ocr_text),
    )


def _collect_ocr_text(result):
    """Flatten one PaddleOCR result into its recognised text strings, in order.

    Empty or ``None`` entries at any nesting level are skipped. An entry is
    used only when its second element is a non-empty list/tuple, whose first
    item is taken as the text.
    """
    texts = []
    for line in result or []:
        for word_info in line or []:
            if not word_info or len(word_info) < 2:
                continue
            text_part = word_info[1]
            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
                texts.append(str(text_part[0]))
    return texts
|
||||||
|
|
||||||
|
|
||||||
|
class NormalizeRequest(BaseModel):
    """Request body for POST /normalize."""

    # Raw text to normalize.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
class NormalizeResponse(BaseModel):
    """Response body for POST /normalize."""

    # Space-joined tokens of the normalized text, or the unmodified input
    # when normalization failed.
    normalized: str
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/normalize", response_model=NormalizeResponse)
def normalize_text(req: NormalizeRequest):
    """Normalize Thai text with PyThaiNLP for the rag-thai-preprocess queue.

    The text is normalized, tokenized with the "newmm" engine (whitespace
    tokens dropped) and re-joined with single spaces for embedding. This is
    best-effort: on any failure a warning is logged and the input text is
    returned unchanged.
    """
    raw = req.text
    try:
        cleaned = thai_normalize(raw)
        joined = " ".join(
            word_tokenize(cleaned, engine="newmm", keep_whitespace=False)
        )
    except Exception as e:
        logger.warning(f"Thai normalize failed, returning raw text: {e}")
        return NormalizeResponse(normalized=raw)
    return NormalizeResponse(normalized=joined)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Imported lazily so that importing this module as an ASGI app does not
    # itself import uvicorn.
    import uvicorn

    # Listen on all interfaces; the port comes from OCR_PORT (default 8765).
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("OCR_PORT", "8765")))
|
||||||
+43
@@ -0,0 +1,43 @@
|
|||||||
|
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml
|
||||||
|
# PaddleOCR Sidecar — รันบน Desk-5439 (AI Isolation Host) ตาม ADR-023A
|
||||||
|
# Change Log:
|
||||||
|
# - 2026-05-25: Initial compose file สำหรับ PaddleOCR HTTP sidecar
|
||||||
|
#
|
||||||
|
# วิธีรัน:
|
||||||
|
# docker compose up -d --build
|
||||||
|
#
|
||||||
|
# ทดสอบ:
|
||||||
|
# curl http://localhost:8765/health
|
||||||
|
|
||||||
|
name: lcbp3-ocr

services:
  ocr-sidecar:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: ocr-sidecar
    restart: unless-stopped
    ports:
      - "8765:8765"
    environment:
      OCR_CHAR_THRESHOLD: "100"
      OCR_PORT: "8765"
      OCR_MAX_PAGES: "0"
      # Set USE_GPU=true to use the RTX 2060 Super (requires nvidia-container-toolkit)
      USE_GPU: "false"
    volumes:
      # Mount the same path the backend sees (permanent uploads).
      # Must match the UPLOAD_PERMANENT_DIR the backend uses via the network share.
      - /share/np-dms-as/data/uploads:/mnt/uploads:ro
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    healthcheck:
      # Probe with the Python interpreter that the image already provides:
      # python:3.10-slim has no curl. urlopen raises on connection errors and
      # on HTTP 4xx/5xx, so the probe exits non-zero exactly when curl -f would.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8765/health', timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
|
||||||
+9
@@ -0,0 +1,9 @@
|
|||||||
|
paddlepaddle==2.6.2
|
||||||
|
paddleocr==2.7.3
|
||||||
|
PyMuPDF==1.24.0
|
||||||
|
pytesseract==0.3.13
|
||||||
|
fastapi==0.111.0
|
||||||
|
uvicorn[standard]==0.30.1
|
||||||
|
python-multipart==0.0.9
|
||||||
|
pythainlp==5.0.4
|
||||||
|
httpx==0.27.0
|
||||||
@@ -80,13 +80,17 @@ CLAMAV_PORT=3310
|
|||||||
QDRANT_HOST=qdrant
|
QDRANT_HOST=qdrant
|
||||||
QDRANT_PORT=6333
|
QDRANT_PORT=6333
|
||||||
|
|
||||||
# Ollama (Admin Desktop Desk-5439 — ADR-018 AI boundary)
|
# Ollama (Admin Desktop Desk-5439 — ADR-023A AI boundary)
|
||||||
OLLAMA_EMBED_MODEL=nomic-embed-text
|
OLLAMA_EMBED_MODEL=nomic-embed-text
|
||||||
OLLAMA_RAG_MODEL=gemma3:12b
|
OLLAMA_MODEL_MAIN=gemma4:e2b
|
||||||
OLLAMA_URL=http://192.168.20.200:11434
|
OLLAMA_URL=http://192.168.10.100:11434
|
||||||
|
|
||||||
|
# PaddleOCR Sidecar (Admin Desktop Desk-5439 — ADR-023A)
|
||||||
|
OCR_API_URL=http://192.168.10.100:8765
|
||||||
|
OCR_CHAR_THRESHOLD=100
|
||||||
|
|
||||||
# Thai preprocessing microservice (PyThaiNLP — Admin Desktop)
|
# Thai preprocessing microservice (PyThaiNLP — Admin Desktop)
|
||||||
THAI_PREPROCESS_URL=http://192.168.20.200:8765
|
THAI_PREPROCESS_URL=http://192.168.10.100:8765
|
||||||
|
|
||||||
# Typhoon API (cloud LLM — PUBLIC/INTERNAL only, never CONFIDENTIAL)
|
# Typhoon API (cloud LLM — PUBLIC/INTERNAL only, never CONFIDENTIAL)
|
||||||
TYPHOON_API_KEY=your-typhoon-api-key-here
|
TYPHOON_API_KEY=your-typhoon-api-key-here
|
||||||
|
|||||||
@@ -95,6 +95,9 @@ services:
|
|||||||
QDRANT_HOST: 'qdrant'
|
QDRANT_HOST: 'qdrant'
|
||||||
QDRANT_PORT: '6333'
|
QDRANT_PORT: '6333'
|
||||||
QDRANT_URL: 'http://qdrant:6333'
|
QDRANT_URL: 'http://qdrant:6333'
|
||||||
|
# --- PaddleOCR Sidecar (Desk-5439 — ADR-023A) ---
|
||||||
|
OCR_API_URL: ${OCR_API_URL:?OCR_API_URL required}
|
||||||
|
OCR_CHAR_THRESHOLD: ${OCR_CHAR_THRESHOLD:-100}
|
||||||
# --- Numbering ---
|
# --- Numbering ---
|
||||||
NUMBERING_LOCK_TIMEOUT: '5000'
|
NUMBERING_LOCK_TIMEOUT: '5000'
|
||||||
NUMBERING_RESERVATION_TTL: '300'
|
NUMBERING_RESERVATION_TTL: '300'
|
||||||
|
|||||||
Reference in New Issue
Block a user