690525:1418 ADR-028-228-migration-OCR #01
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
|
||||
# PaddleOCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
|
||||
# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else PaddleOCR)
|
||||
# Change Log:
|
||||
# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
|
||||
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
import fitz # PyMuPDF
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from paddleocr import PaddleOCR
|
||||
from pythainlp.tokenize import word_tokenize
|
||||
from pythainlp.util import normalize as thai_normalize
|
||||
|
||||
# Configure root logging once, then grab the service-specific logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ocr-sidecar")

# ASGI application object; the route decorators below register on it.
app = FastAPI(
    title="PaddleOCR Sidecar",
    version="1.0.0",
)
|
||||
|
||||
# Runtime configuration, read from environment variables at import time.
# Text-layer character count that must be exceeded for the fast path.
OCR_CHAR_THRESHOLD = int(os.environ.get("OCR_CHAR_THRESHOLD", "100"))
# GPU inference is enabled only by the value "true" (any letter case).
USE_GPU = os.environ.get("USE_GPU", "false").lower() == "true"
# Default per-request page cap; 0 means every page.
MAX_PAGES = int(os.environ.get("OCR_MAX_PAGES", "0"))
|
||||
|
||||
# Load the PaddleOCR model a single time at startup so that individual
# requests do not pay the model-loading latency.
# NOTE(review): lang="en" is used even though this service also normalizes
# Thai text — confirm the recognition language is intentional.
logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
ocr_engine = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=USE_GPU, show_log=False)
logger.info("PaddleOCR model loaded.")
|
||||
|
||||
|
||||
# Request body for POST /ocr.
class OcrRequest(BaseModel):
    # Filesystem path of the document to read, as seen by this process.
    pdfPath: str
    # Optional per-request page cap; when missing or 0 the handler falls
    # back to the MAX_PAGES setting.
    maxPages: Optional[int] = None
|
||||
|
||||
|
||||
# Response body for POST /ocr.
class OcrResponse(BaseModel):
    # Extracted text (text layer on the fast path, OCR output otherwise).
    text: str
    # True when PaddleOCR produced the text, False for the text-layer path.
    ocrUsed: bool
    # Number of pages that were processed.
    pageCount: int
    # Length of `text` in characters.
    charCount: int
|
||||
|
||||
|
||||
# Health endpoint: returns a fixed payload naming the OCR engine.
@app.get("/health")
def health():
    return dict(status="ok", engine="paddleocr")
|
||||
|
||||
|
||||
def _iter_ocr_texts(result):
    """Yield the recognized strings contained in one ``ocr_engine.ocr`` result.

    Entries that are empty, shorter than two items, or whose second item is
    not a non-empty list/tuple are skipped.
    """
    for line in result or ():
        for word_info in line or ():
            if not word_info or len(word_info) < 2:
                continue
            text_part = word_info[1]
            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
                yield str(text_part[0])


@app.post("/ocr", response_model=OcrResponse)
def ocr_extract(req: OcrRequest):
    """Extract text from the document at ``req.pdfPath``.

    The embedded text layer of the selected pages is read first. When it
    holds more than ``OCR_CHAR_THRESHOLD`` characters it is returned directly
    (fast path). Otherwise every selected page is rendered at 200 DPI and
    passed to PaddleOCR (slow path).

    Args:
        req: Request with the file path and an optional page cap. A missing
            or zero ``maxPages`` falls back to ``MAX_PAGES``; a non-positive
            effective cap means all pages are processed.

    Returns:
        OcrResponse with the text, whether OCR was used, the number of pages
        processed and the character count of the returned text.

    Raises:
        HTTPException: 404 when the path does not exist, 422 when PyMuPDF
            cannot open the file.
    """
    pdf_path = Path(req.pdfPath)
    # NOTE(review): the path comes straight from the request body and is
    # opened as-is — confirm callers are trusted or restrict it to a root.
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")

    max_pages = req.maxPages or MAX_PAGES

    try:
        doc = fitz.open(str(pdf_path))
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}") from e

    # The document is closed on every exit path (fast path, slow path, or an
    # exception while rendering/recognizing) so it is never left open.
    try:
        total_pages = len(doc)
        page_limit = min(total_pages, max_pages) if max_pages > 0 else total_pages
        pages_to_process = range(page_limit)
        page_count = len(pages_to_process)

        # Fast path: try the embedded text layer first.
        fast_text = "\n".join(doc[i].get_text() for i in pages_to_process).strip()
        total_chars = len(fast_text)

        if total_chars > OCR_CHAR_THRESHOLD:
            logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
            return OcrResponse(
                text=fast_text,
                ocrUsed=False,
                pageCount=page_count,
                charCount=total_chars,
            )

        # Slow path: run PaddleOCR on a rendered image of every selected page.
        logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
        ocr_text_parts = []
        for i in pages_to_process:
            img_bytes = doc[i].get_pixmap(dpi=200).tobytes("png")
            result = ocr_engine.ocr(img_bytes, cls=True)
            ocr_text_parts.extend(_iter_ocr_texts(result))

        ocr_text = "\n".join(ocr_text_parts).strip()
        logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")

        return OcrResponse(
            text=ocr_text,
            ocrUsed=True,
            pageCount=page_count,
            charCount=len(ocr_text),
        )
    finally:
        doc.close()
|
||||
|
||||
|
||||
# Request body for POST /normalize.
class NormalizeRequest(BaseModel):
    # Raw text to normalize.
    text: str
|
||||
|
||||
|
||||
# Response body for POST /normalize.
class NormalizeResponse(BaseModel):
    # Normalized, space-joined tokens; the raw input when normalization fails.
    normalized: str
|
||||
|
||||
|
||||
@app.post("/normalize", response_model=NormalizeResponse)
def normalize_text(req: NormalizeRequest):
    """Normalize Thai text with PyThaiNLP for the rag-thai-preprocess queue."""
    try:
        # Normalize the text, tokenize it with the "newmm" engine, and
        # re-join the tokens with single spaces for embedding.
        cleaned = thai_normalize(req.text)
        pieces = word_tokenize(cleaned, engine="newmm", keep_whitespace=False)
        joined = " ".join(pieces)
    except Exception as e:
        # Best effort: on any failure, fall back to the unmodified input.
        logger.warning(f"Thai normalize failed, returning raw text: {e}")
        return NormalizeResponse(normalized=req.text)
    return NormalizeResponse(normalized=joined)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn

    # Serve on all interfaces; the port comes from OCR_PORT (default 8765).
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("OCR_PORT", "8765")))
|
||||
Reference in New Issue
Block a user