lcbp3/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py

# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
# PaddleOCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else PaddleOCR)
# Change Log:
# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar

import os
import logging
import re
import fitz  # PyMuPDF
from pathlib import Path
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from paddleocr import PaddleOCR
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize as thai_normalize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ocr-sidecar")

app = FastAPI(title="PaddleOCR Sidecar", version="1.0.0")

# อ่านค่า config จาก environment
OCR_CHAR_THRESHOLD = int(os.getenv("OCR_CHAR_THRESHOLD", "100"))
USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
MAX_PAGES = int(os.getenv("OCR_MAX_PAGES", "0"))  # 0 = ทุกหน้า

# โหลด PaddleOCR model ครั้งเดียวตอน startup (ลด latency ต่อ request)
logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
ocr_engine = PaddleOCR(
    use_angle_cls=True,
    lang="en",
    use_gpu=USE_GPU,
    show_log=False,
)
logger.info("PaddleOCR model loaded.")


class OcrRequest(BaseModel):
    pdfPath: str
    maxPages: Optional[int] = None


class OcrResponse(BaseModel):
    text: str
    ocrUsed: bool
    pageCount: int
    charCount: int


@app.get("/health")
def health():
    return {"status": "ok", "engine": "paddleocr"}


@app.post("/ocr", response_model=OcrResponse)
def ocr_extract(req: OcrRequest):
    pdf_path = Path(req.pdfPath)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")

    max_pages = req.maxPages or MAX_PAGES

    try:
        doc = fitz.open(str(pdf_path))
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")

    pages_to_process = list(range(min(len(doc), max_pages) if max_pages > 0 else len(doc)))
    page_count = len(pages_to_process)

    # Fast path: ลอง extract text layer ก่อน
    fast_text_parts = []
    for i in pages_to_process:
        page = doc[i]
        fast_text_parts.append(page.get_text())
    fast_text = "\n".join(fast_text_parts).strip()
    total_chars = len(fast_text)

    if total_chars > OCR_CHAR_THRESHOLD:
        logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
        return OcrResponse(
            text=fast_text,
            ocrUsed=False,
            pageCount=page_count,
            charCount=total_chars,
        )

    # Slow path: ใช้ PaddleOCR กับทุกหน้า
    logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
    ocr_text_parts = []
    for i in pages_to_process:
        page = doc[i]
        pix = page.get_pixmap(dpi=200)
        img_bytes = pix.tobytes("png")
        result = ocr_engine.ocr(img_bytes, cls=True)
        if result:
            for line in result:
                if line:
                    for word_info in line:
                        if word_info and len(word_info) >= 2:
                            text_part = word_info[1]
                            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
                                ocr_text_parts.append(str(text_part[0]))

    ocr_text = "\n".join(ocr_text_parts).strip()
    logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")

    return OcrResponse(
        text=ocr_text,
        ocrUsed=True,
        pageCount=page_count,
        charCount=len(ocr_text),
    )


class NormalizeRequest(BaseModel):
    text: str


class NormalizeResponse(BaseModel):
    normalized: str


@app.post("/normalize", response_model=NormalizeResponse)
def normalize_text(req: NormalizeRequest):
    """Normalize Thai text ด้วย PyThaiNLP สำหรับ rag-thai-preprocess queue"""
    try:
        # normalize unicode + ตัดคำแล้วต่อกลับด้วย space เพื่อ embedding
        normalized = thai_normalize(req.text)
        tokens = word_tokenize(normalized, engine="newmm", keep_whitespace=False)
        result = " ".join(tokens)
        return NormalizeResponse(normalized=result)
    except Exception as e:
        logger.warning(f"Thai normalize failed, returning raw text: {e}")
        return NormalizeResponse(normalized=req.text)


if __name__ == "__main__":
    import uvicorn
    port = int(os.getenv("OCR_PORT", "8765"))
    uvicorn.run(app, host="0.0.0.0", port=port)