690530:1345 ADR-030-231-ocr-sandbox-two-step-flow #05

2026-05-30 13:45:18 +07:00
parent c9edd62a0b
commit 30d9d721fb
6 changed files with 118 additions and 32 deletions
@@ -5,6 +5,7 @@
 # - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
 # - 2026-05-30: เปลี่ยน lang='en' เป็น lang='ch' (CTJK) เพื่อรองรับภาษาไทย
 # - 2026-05-30: เปลี่ยนจาก PaddleOCR เป็น Tesseract OCR เพื่อความเข้ากันได้กับ CPU เก่า
+# - 2026-05-30: เพิ่ม OpenCV preprocessing (threshold, denoise) และ DPI 300 เพื่อเพิ่มความแม่นยำ

 import os
 import logging
@@ -15,6 +16,8 @@ from typing import Optional
 from PIL import Image
 import pytesseract
 import io
+import cv2
+import numpy as np

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
@@ -34,6 +37,26 @@ OCR_LANG = os.getenv("OCR_LANG", "tha+eng")  # Tesseract language code (tha+eng
 logger.info(f"Tesseract OCR Sidecar initialized (lang={OCR_LANG})")


+def preprocess_image(pil_image: Image.Image) -> Image.Image:
+    """Preprocess image ด้วย OpenCV เพื่อเพิ่มความแม่นยำ OCR"""
+    # แปลง PIL Image เป็น numpy array (OpenCV format)
+    img_array = np.array(pil_image)
+
+    # แปลงเป็น grayscale
+    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+
+    # Denoise ด้วย median blur
+    denoised = cv2.medianBlur(gray, 3)
+
+    # Adaptive threshold เพื่อแยก background จาก text
+    thresh = cv2.adaptiveThreshold(
+        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+    )
+
+    # แปลงกลับเป็น PIL Image
+    return Image.fromarray(thresh)
+
+
 class OcrRequest(BaseModel):
    pdfPath: str
    maxPages: Optional[int] = None
@@ -89,10 +112,14 @@ def ocr_extract(req: OcrRequest):
    ocr_text_parts = []
    for i in pages_to_process:
        page = doc[i]
-        pix = page.get_pixmap(dpi=200)
+        pix = page.get_pixmap(dpi=300)  # เพิ่ม DPI เป็น 300 เพื่อความชัด
        img_bytes = pix.tobytes("png")
        img = Image.open(io.BytesIO(img_bytes))
-        text = pytesseract.image_to_string(img, lang=OCR_LANG)
+
+        # Preprocess ด้วย OpenCV เพื่อเพิ่มความแม่นยำ
+        processed_img = preprocess_image(img)
+
+        text = pytesseract.image_to_string(processed_img, lang=OCR_LANG)
        ocr_text_parts.append(text.strip())

    ocr_text = "\n".join(ocr_text_parts).strip()