690525:1418 ADR-028-228-migration-OCR #01

2026-05-25 14:18:02 +07:00
parent 001237ea35
commit 256a31b38c
7 changed files with 268 additions and 14 deletions
@@ -0,0 +1,144 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
+# PaddleOCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
+# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else PaddleOCR)
+# Change Log:
+# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
+
+import os
+import logging
+import re
+import fitz  # PyMuPDF
+from pathlib import Path
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from paddleocr import PaddleOCR
+from pythainlp.tokenize import word_tokenize
+from pythainlp.util import normalize as thai_normalize
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("ocr-sidecar")
+
+app = FastAPI(title="PaddleOCR Sidecar", version="1.0.0")
+
+# อ่านค่า config จาก environment
+OCR_CHAR_THRESHOLD = int(os.getenv("OCR_CHAR_THRESHOLD", "100"))
+USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
+MAX_PAGES = int(os.getenv("OCR_MAX_PAGES", "0"))  # 0 = ทุกหน้า
+
+# โหลด PaddleOCR model ครั้งเดียวตอน startup (ลด latency ต่อ request)
+logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
+ocr_engine = PaddleOCR(
+    use_angle_cls=True,
+    lang="en",
+    use_gpu=USE_GPU,
+    show_log=False,
+)
+logger.info("PaddleOCR model loaded.")
+
+
+class OcrRequest(BaseModel):
+    """Request body for ``POST /ocr``."""
+
+    # Path of the document to read; the handler opens it directly with PyMuPDF.
+    pdfPath: str
+    # Optional per-request page limit. When omitted (or 0) the handler falls
+    # back to the MAX_PAGES default.
+    maxPages: Optional[int] = None
+
+
+class OcrResponse(BaseModel):
+    """Response body for ``POST /ocr``."""
+
+    # Extracted text (text layer on the fast path, OCR output on the slow path).
+    text: str
+    # True when the text came from PaddleOCR, False when the text layer was used.
+    ocrUsed: bool
+    # Number of pages that were processed (after applying the page limit).
+    pageCount: int
+    # Length of ``text`` in characters.
+    charCount: int
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok", "engine": "paddleocr"}
+
+
+@app.post("/ocr", response_model=OcrResponse)
+def ocr_extract(req: OcrRequest):
+    pdf_path = Path(req.pdfPath)
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
+
+    max_pages = req.maxPages or MAX_PAGES
+
+    try:
+        doc = fitz.open(str(pdf_path))
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
+
+    pages_to_process = list(range(min(len(doc), max_pages) if max_pages > 0 else len(doc)))
+    page_count = len(pages_to_process)
+
+    # Fast path: ลอง extract text layer ก่อน
+    fast_text_parts = []
+    for i in pages_to_process:
+        page = doc[i]
+        fast_text_parts.append(page.get_text())
+    fast_text = "\n".join(fast_text_parts).strip()
+    total_chars = len(fast_text)
+
+    if total_chars > OCR_CHAR_THRESHOLD:
+        logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
+        return OcrResponse(
+            text=fast_text,
+            ocrUsed=False,
+            pageCount=page_count,
+            charCount=total_chars,
+        )
+
+    # Slow path: ใช้ PaddleOCR กับทุกหน้า
+    logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
+    ocr_text_parts = []
+    for i in pages_to_process:
+        page = doc[i]
+        pix = page.get_pixmap(dpi=200)
+        img_bytes = pix.tobytes("png")
+        result = ocr_engine.ocr(img_bytes, cls=True)
+        if result:
+            for line in result:
+                if line:
+                    for word_info in line:
+                        if word_info and len(word_info) >= 2:
+                            text_part = word_info[1]
+                            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
+                                ocr_text_parts.append(str(text_part[0]))
+
+    ocr_text = "\n".join(ocr_text_parts).strip()
+    logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")
+
+    return OcrResponse(
+        text=ocr_text,
+        ocrUsed=True,
+        pageCount=page_count,
+        charCount=len(ocr_text),
+    )
+
+
+class NormalizeRequest(BaseModel):
+    """Request body for ``POST /normalize``."""
+
+    # Raw text to normalize and tokenize.
+    text: str
+
+
+class NormalizeResponse(BaseModel):
+    """Response body for ``POST /normalize``."""
+
+    # Normalized text with tokens joined by single spaces, or the unchanged
+    # input text when normalization failed.
+    normalized: str
+
+
+@app.post("/normalize", response_model=NormalizeResponse)
+def normalize_text(req: NormalizeRequest):
+    """Normalize Thai text ด้วย PyThaiNLP สำหรับ rag-thai-preprocess queue"""
+    try:
+        # normalize unicode + ตัดคำแล้วต่อกลับด้วย space เพื่อ embedding
+        normalized = thai_normalize(req.text)
+        tokens = word_tokenize(normalized, engine="newmm", keep_whitespace=False)
+        result = " ".join(tokens)
+        return NormalizeResponse(normalized=result)
+    except Exception as e:
+        logger.warning(f"Thai normalize failed, returning raw text: {e}")
+        return NormalizeResponse(normalized=req.text)
+
+
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("OCR_PORT", "8765"))
+    uvicorn.run(app, host="0.0.0.0", port=port)