690525:1418 ADR-028-228-migration-OCR #01

2026-05-25 14:18:02 +07:00
parent 001237ea35
commit 256a31b38c
7 changed files with 268 additions and 14 deletions
@@ -0,0 +1,37 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile
+# PaddleOCR sidecar — HTTP API server that extracts text from PDFs/images.
+# Runs on Desk-5439 (GPU RTX 2060 Super 8GB) per ADR-023A.
+# Change Log:
+# - 2026-05-25: Initial Dockerfile for the PaddleOCR sidecar (port 8765)
+
+FROM python:3.10-slim
+
+# System dependencies for PDF processing and the image libraries.
+# curl is needed by the HEALTHCHECK below: the slim base image does not ship
+# it, and without it the probe fails on every run and the container is
+# reported unhealthy.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libglib2.0-0 \
+    libgl1 \
+    libgomp1 \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies.
+# Intended: paddlepaddle-gpu for GPU acceleration (RTX 2060 Super — CUDA 11.x),
+# with paddlepaddle (CPU only) as the fallback.
+# NOTE(review): requirements.txt as committed pins the CPU wheel
+# (paddlepaddle==2.6.2) — confirm which one is wanted.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy source code
+COPY app.py .
+
+# Pre-download the PaddleOCR models at build time (reduces cold-start time).
+# ADR-023A calls for Thai (th) + English (en) models.
+# NOTE(review): the command below only fetches the lang='en' models — confirm
+# whether a Thai recognition model must be added.
+RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)"
+
+EXPOSE 8765
+
+# Mark the container unhealthy when GET /health fails or returns an error status.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8765/health || exit 1
+
+CMD ["python", "app.py"]
@@ -0,0 +1,144 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
+# PaddleOCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
+# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else PaddleOCR)
+# Change Log:
+# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
+
+import os
+import logging
+import re
+import fitz  # PyMuPDF
+from pathlib import Path
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from paddleocr import PaddleOCR
+from pythainlp.tokenize import word_tokenize
+from pythainlp.util import normalize as thai_normalize
+
+# Configure root logging at INFO level; this module logs under "ocr-sidecar".
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("ocr-sidecar")
+
+app = FastAPI(title="PaddleOCR Sidecar", version="1.0.0")
+
+# Read configuration from the environment
+# Text-layer length that must be exceeded for /ocr to skip PaddleOCR.
+OCR_CHAR_THRESHOLD = int(os.getenv("OCR_CHAR_THRESHOLD", "100"))
+# Only the string "true" (case-insensitive) enables the GPU.
+USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
+MAX_PAGES = int(os.getenv("OCR_MAX_PAGES", "0"))  # 0 = all pages
+
+# Load the PaddleOCR model once at startup (reduces per-request latency)
+# NOTE(review): lang="en" is the only language loaded here, while the sidecar's
+# Dockerfile comment calls for Thai + English models — confirm.
+logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
+ocr_engine = PaddleOCR(
+    use_angle_cls=True,
+    lang="en",
+    use_gpu=USE_GPU,
+    show_log=False,
+)
+logger.info("PaddleOCR model loaded.")
+
+
+# Request body for POST /ocr.
+class OcrRequest(BaseModel):
+    # Path of the document to read, as seen from the sidecar's own filesystem.
+    pdfPath: str
+    # Optional per-request page cap; when omitted (or 0) the handler falls back
+    # to the OCR_MAX_PAGES setting.
+    maxPages: Optional[int] = None
+
+
+# Response body for POST /ocr.
+class OcrResponse(BaseModel):
+    # Extracted text, pages joined by newlines.
+    text: str
+    # False when the embedded text layer was returned, True when PaddleOCR ran.
+    ocrUsed: bool
+    # Number of pages that were processed (after applying the page cap).
+    pageCount: int
+    # Length of `text` in characters.
+    charCount: int
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok", "engine": "paddleocr"}
+
+
+@app.post("/ocr", response_model=OcrResponse)
+def ocr_extract(req: OcrRequest):
+    pdf_path = Path(req.pdfPath)
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
+
+    max_pages = req.maxPages or MAX_PAGES
+
+    try:
+        doc = fitz.open(str(pdf_path))
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
+
+    pages_to_process = list(range(min(len(doc), max_pages) if max_pages > 0 else len(doc)))
+    page_count = len(pages_to_process)
+
+    # Fast path: ลอง extract text layer ก่อน
+    fast_text_parts = []
+    for i in pages_to_process:
+        page = doc[i]
+        fast_text_parts.append(page.get_text())
+    fast_text = "\n".join(fast_text_parts).strip()
+    total_chars = len(fast_text)
+
+    if total_chars > OCR_CHAR_THRESHOLD:
+        logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
+        return OcrResponse(
+            text=fast_text,
+            ocrUsed=False,
+            pageCount=page_count,
+            charCount=total_chars,
+        )
+
+    # Slow path: ใช้ PaddleOCR กับทุกหน้า
+    logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
+    ocr_text_parts = []
+    for i in pages_to_process:
+        page = doc[i]
+        pix = page.get_pixmap(dpi=200)
+        img_bytes = pix.tobytes("png")
+        result = ocr_engine.ocr(img_bytes, cls=True)
+        if result:
+            for line in result:
+                if line:
+                    for word_info in line:
+                        if word_info and len(word_info) >= 2:
+                            text_part = word_info[1]
+                            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
+                                ocr_text_parts.append(str(text_part[0]))
+
+    ocr_text = "\n".join(ocr_text_parts).strip()
+    logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")
+
+    return OcrResponse(
+        text=ocr_text,
+        ocrUsed=True,
+        pageCount=page_count,
+        charCount=len(ocr_text),
+    )
+
+
+# Request body for POST /normalize.
+class NormalizeRequest(BaseModel):
+    # Raw text to normalize and tokenize.
+    text: str
+
+
+# Response body for POST /normalize.
+class NormalizeResponse(BaseModel):
+    # Space-joined tokens, or the unchanged input when normalization failed.
+    normalized: str
+
+
+@app.post("/normalize", response_model=NormalizeResponse)
+def normalize_text(req: NormalizeRequest):
+    """Normalize Thai text ด้วย PyThaiNLP สำหรับ rag-thai-preprocess queue"""
+    try:
+        # normalize unicode + ตัดคำแล้วต่อกลับด้วย space เพื่อ embedding
+        normalized = thai_normalize(req.text)
+        tokens = word_tokenize(normalized, engine="newmm", keep_whitespace=False)
+        result = " ".join(tokens)
+        return NormalizeResponse(normalized=result)
+    except Exception as e:
+        logger.warning(f"Thai normalize failed, returning raw text: {e}")
+        return NormalizeResponse(normalized=req.text)
+
+
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("OCR_PORT", "8765"))
+    uvicorn.run(app, host="0.0.0.0", port=port)
@@ -0,0 +1,43 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml
+# PaddleOCR sidecar — runs on Desk-5439 (AI Isolation Host) per ADR-023A
+# Change Log:
+# - 2026-05-25: Initial compose file for the PaddleOCR HTTP sidecar
+#
+# How to run:
+#   docker compose up -d --build
+#
+# Smoke test (from the host):
+#   curl http://localhost:8765/health
+
+name: lcbp3-ocr
+
+services:
+  ocr-sidecar:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: ocr-sidecar
+    restart: unless-stopped
+    ports:
+      - "8765:8765"
+    environment:
+      OCR_CHAR_THRESHOLD: "100"
+      OCR_PORT: "8765"
+      OCR_MAX_PAGES: "0"
+      # Set USE_GPU=true to use the RTX 2060 Super (requires nvidia-container-toolkit)
+      USE_GPU: "false"
+    volumes:
+      # Mount the same path the backend sees (permanent uploads).
+      # Must match the UPLOAD_PERMANENT_DIR the backend uses via the network share.
+      - /share/np-dms-as/data/uploads:/mnt/uploads:ro
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
+    healthcheck:
+      # Probe with the Python interpreter so the check does not depend on curl
+      # being installed in the image. urlopen raises (non-zero exit) on a
+      # connection failure or an HTTP error status.
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8765/health', timeout=5)"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
@@ -0,0 +1,9 @@
+# Python dependencies for the OCR sidecar image (installed via `pip install -r requirements.txt`).
+# NOTE(review): `paddlepaddle` is the CPU wheel; the Dockerfile comment mentions
+# paddlepaddle-gpu for GPU acceleration — confirm which one is intended.
+# NOTE(review): pytesseract, python-multipart and httpx are not imported by
+# app.py as committed — confirm they are needed.
+paddlepaddle==2.6.2
+paddleocr==2.7.3
+PyMuPDF==1.24.0
+pytesseract==0.3.13
+fastapi==0.111.0
+uvicorn[standard]==0.30.1
+python-multipart==0.0.9
+pythainlp==5.0.4
+httpx==0.27.0
@@ -80,13 +80,17 @@ CLAMAV_PORT=3310
 QDRANT_HOST=qdrant
 QDRANT_PORT=6333

-# Ollama (Admin Desktop Desk-5439 — ADR-018 AI boundary)
+# Ollama (Admin Desktop Desk-5439 — ADR-023A AI boundary)
 OLLAMA_EMBED_MODEL=nomic-embed-text
-OLLAMA_RAG_MODEL=gemma3:12b
-OLLAMA_URL=http://192.168.20.200:11434
+OLLAMA_MODEL_MAIN=gemma4:e2b
+OLLAMA_URL=http://192.168.10.100:11434
+
+# PaddleOCR Sidecar (Admin Desktop Desk-5439 — ADR-023A)
+OCR_API_URL=http://192.168.10.100:8765
+OCR_CHAR_THRESHOLD=100

 # Thai preprocessing microservice (PyThaiNLP — Admin Desktop)
-THAI_PREPROCESS_URL=http://192.168.20.200:8765
+THAI_PREPROCESS_URL=http://192.168.10.100:8765

 # Typhoon API (cloud LLM — PUBLIC/INTERNAL only, never CONFIDENTIAL)
 TYPHOON_API_KEY=your-typhoon-api-key-here
@@ -95,6 +95,9 @@ services:
      QDRANT_HOST: 'qdrant'
      QDRANT_PORT: '6333'
      QDRANT_URL: 'http://qdrant:6333'
+      # --- PaddleOCR Sidecar (Desk-5439 — ADR-023A) ---
+      OCR_API_URL: ${OCR_API_URL:?OCR_API_URL required}
+      OCR_CHAR_THRESHOLD: ${OCR_CHAR_THRESHOLD:-100}
      # --- Numbering ---
      NUMBERING_LOCK_TIMEOUT: '5000'
      NUMBERING_RESERVATION_TTL: '300'