# lcbp3/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py

# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
# Tesseract OCR HTTP Sidecar API — รับ POST /ocr แล้วคืนข้อความที่สกัดจาก PDF/Image
# ตาม ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → Fast path, else Tesseract)
# Change Log:
# - 2026-05-25: Initial FastAPI server สำหรับ PaddleOCR sidecar
# - 2026-05-30: เปลี่ยน lang='en' เป็น lang='ch' (CTJK) เพื่อรองรับภาษาไทย
# - 2026-05-30: เปลี่ยนจาก PaddleOCR เป็น Tesseract OCR เพื่อความเข้ากันได้กับ CPU เก่า
# - 2026-05-30: เพิ่ม OpenCV preprocessing (threshold, denoise) และ DPI 300 เพื่อเพิ่มความแม่นยำ
# - 2026-06-01: เพิ่ม POST /ocr-upload รับ multipart file โดยตรง ไม่ต้องพึ่ง shared volume mount

import os
import logging
import re
import base64
import fitz  # PyMuPDF
import httpx
from pathlib import Path
from typing import Optional
from PIL import Image
import pytesseract
import io
import cv2
import numpy as np

from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from pydantic import BaseModel
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize as thai_normalize

# Configure root logging at INFO level and create the logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ocr-sidecar")

# FastAPI application instance; the route decorators in this module register handlers on it.
app = FastAPI(title="Tesseract OCR Sidecar", version="1.0.0")

# Runtime configuration, read from environment variables when the module is loaded.
# In "auto" mode, embedded PDF text longer than this many characters skips OCR.
OCR_CHAR_THRESHOLD = int(os.getenv("OCR_CHAR_THRESHOLD", "100"))
MAX_PAGES = int(os.getenv("OCR_MAX_PAGES", "0"))  # 0 = process every page
OCR_LANG = os.getenv("OCR_LANG", "tha+eng")  # Tesseract language code (tha+eng = Thai + English)
OLLAMA_API_URL = os.getenv("OLLAMA_API_URL", "http://host.docker.internal:11434")
TYPHOON_OCR_MODEL = os.getenv("TYPHOON_OCR_MODEL", "scb10x/typhoon-ocr-3b")
# Passed as the httpx client timeout when calling the Ollama API.
TYPHOON_OCR_TIMEOUT = int(os.getenv("TYPHOON_OCR_TIMEOUT", "120"))
# PSM 3 = fully automatic page segmentation (suits documents whose layout has
#         several parts, e.g. date / reference-number fields)
# OEM 1 = LSTM only (better than the legacy engine)
# Plain string literal: there is nothing to interpolate, so no f-prefix.
TESSERACT_CONFIG = "--psm 3 --oem 1"
# Crop margins used to cut off header/footer (top 5%, bottom 2%)
CROP_TOP_RATIO = 0.05
CROP_BOTTOM_RATIO = 0.02

# Emitted once when the module is loaded, recording the effective Tesseract settings.
logger.info(f"Tesseract OCR Sidecar initialized (lang={OCR_LANG}, config={TESSERACT_CONFIG})")


def filter_ocr_noise(text: str) -> str:
    """Drop OCR garbage lines (very short lines, symbol/digit-only lines).

    The input is split on newlines and every line is stripped. A line is kept
    only when it has at least 3 characters and at least 20% of them are Thai
    (U+0E00..U+0E7F) or ASCII alphabetic characters. The surviving lines are
    returned joined with newlines.
    """

    def _is_meaningful(candidate: str) -> bool:
        # Blank and very short lines carry no usable content.
        if len(candidate) < 3:
            return False
        # Count characters that are either Thai or plain English letters.
        letters = 0
        for ch in candidate:
            if "\u0E00" <= ch <= "\u0E7F" or (ch.isascii() and ch.isalpha()):
                letters += 1
        # Lines made mostly of symbols/digits are treated as noise.
        return letters / len(candidate) >= 0.2

    stripped_lines = (raw.strip() for raw in text.split("\n"))
    return "\n".join(entry for entry in stripped_lines if _is_meaningful(entry))


def crop_header_footer(pil_image: Image.Image, top_ratio: float = 0.10, bottom_ratio: float = 0.10) -> Image.Image:
    """Cut a strip off the top and bottom of an image to remove header/footer text.

    Args:
        pil_image: Image to crop.
        top_ratio: Fraction of the image height removed from the top.
        bottom_ratio: Fraction of the image height removed from the bottom.

    Returns:
        The result of ``pil_image.crop`` over the full width and the remaining
        vertical band.
    """
    full_width, full_height = pil_image.size
    upper_edge = int(full_height * top_ratio)
    lower_edge = full_height - int(full_height * bottom_ratio)
    # PIL crop box order: (left, upper, right, lower)
    return pil_image.crop((0, upper_edge, full_width, lower_edge))


def preprocess_image(pil_image: Image.Image) -> Image.Image:
    """Preprocess an image with OpenCV to improve OCR accuracy.

    The image is converted to grayscale and lightly denoised with a 3x3
    median blur. No thresholding is applied.

    Args:
        pil_image: Source image in any PIL mode; non-RGB images are converted
            to RGB first.

    Returns:
        A grayscale PIL image built from the denoised array.
    """
    # COLOR_RGB2GRAY needs a multi-channel array. Grayscale ("L"), palette ("P")
    # or bilevel ("1") images become 2-D arrays and would make cvtColor raise,
    # so normalize the mode up front.
    if pil_image.mode != "RGB":
        pil_image = pil_image.convert("RGB")

    # PIL image -> numpy array (the format OpenCV works on)
    img_array = np.array(pil_image)

    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

    # Light median blur: removes speckle noise without destroying glyph shapes.
    denoised = cv2.medianBlur(gray, 3)

    # Grayscale only — adaptive thresholding is deliberately not used because
    # it deformed the characters.
    return Image.fromarray(denoised)


class OcrRequest(BaseModel):
    # JSON request body for POST /ocr.
    # Filesystem path of the document; the /ocr handler checks that it exists
    # and opens it with PyMuPDF.
    pdfPath: str
    # Optional page limit; when omitted (or 0) the handler falls back to MAX_PAGES.
    maxPages: Optional[int] = None
    # Optional engine name; the handler treats a missing/empty value as "auto".
    engine: Optional[str] = None


class OcrResponse(BaseModel):
    # Response body shared by POST /ocr and POST /ocr-upload.
    # Extracted text (embedded PDF text on the fast path, otherwise OCR output).
    text: str
    # False when the embedded-text fast path was used, True when an OCR engine ran.
    ocrUsed: bool
    # Number of pages that were processed (after applying the page limit).
    pageCount: int
    # Length of `text` in characters.
    charCount: int
    # One of "fast-path", "typhoon-ocr-3b" or "tesseract".
    engineUsed: str


@app.get("/health")
def health():
    """Report that the sidecar is up, together with the OCR engine name."""
    status_payload = dict(status="ok", engine="tesseract")
    return status_payload


def _render_page_for_ocr(page: fitz.Page) -> Image.Image:
    """Rasterize one PDF page at 300 DPI, crop its header/footer and denoise it."""
    pix = page.get_pixmap(dpi=300)
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
    return preprocess_image(cropped_img)


def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int) -> OcrResponse:
    """Extract text from an open document with the selected engine.

    Shared logic for the /ocr and /ocr-upload endpoints.

    Args:
        doc: Open PyMuPDF document. It is not closed here; the caller owns it.
        selected_engine: "auto" tries the embedded-text fast path first and
            falls back to Tesseract; "typhoon-ocr-3b" uses Typhoon OCR through
            Ollama; any other value goes straight to Tesseract.
        max_pages: Maximum number of leading pages to process; values <= 0
            mean all pages.

    Returns:
        OcrResponse with the extracted text and which engine produced it.
    """
    page_count = min(len(doc), max_pages) if max_pages > 0 else len(doc)
    pages_to_process = range(page_count)

    total_chars = 0
    if selected_engine == "auto":
        # Fast path: use the text layer embedded in the PDF when it is long enough.
        fast_text = "\n".join(doc[i].get_text() for i in pages_to_process).strip()
        total_chars = len(fast_text)
        if total_chars > OCR_CHAR_THRESHOLD:
            logger.info(f"Fast path: {total_chars} chars extracted")
            return OcrResponse(
                text=fast_text,
                ocrUsed=False,
                pageCount=page_count,
                charCount=total_chars,
                engineUsed="fast-path",
            )

    if selected_engine == "typhoon-ocr-3b":
        typhoon_pages = [
            process_with_typhoon_ocr(_render_page_for_ocr(doc[i]))
            for i in pages_to_process
        ]
        typhoon_text = filter_ocr_noise("\n".join(typhoon_pages).strip())
        return OcrResponse(
            text=typhoon_text,
            ocrUsed=True,
            pageCount=page_count,
            charCount=len(typhoon_text),
            engineUsed="typhoon-ocr-3b",
        )

    # Only the "auto" probe measures embedded text, so the "too few chars"
    # message is only accurate there; other engine values skip the probe.
    if selected_engine == "auto":
        logger.info(f"Slow path (Tesseract): {total_chars} chars too few")
    else:
        logger.info(f"Tesseract path (engine={selected_engine})")

    ocr_pages = [
        pytesseract.image_to_string(
            _render_page_for_ocr(doc[i]), lang=OCR_LANG, config=TESSERACT_CONFIG
        ).strip()
        for i in pages_to_process
    ]

    ocr_text = filter_ocr_noise("\n".join(ocr_pages).strip())
    logger.info(f"Tesseract extracted {len(ocr_text)} chars")
    return OcrResponse(
        text=ocr_text,
        ocrUsed=True,
        pageCount=page_count,
        charCount=len(ocr_text),
        engineUsed="tesseract",
    )


def process_with_typhoon_ocr(pil_image: Image.Image) -> str:
    """Run Typhoon OCR on one image through the Ollama generate API.

    Sandbox option: it only talks to Ollama and does not touch the backend
    DB/storage. The image is sent as a base64-encoded PNG and the stripped
    "response" field of the JSON reply is returned. HTTP error statuses
    raise via ``raise_for_status``.
    """
    png_stream = io.BytesIO()
    pil_image.save(png_stream, format="PNG")
    encoded_png = base64.b64encode(png_stream.getvalue()).decode("utf-8")

    generation_options = {
        "temperature": 0.0,
        "top_p": 0.9,
        "repeat_penalty": 1.0,
    }
    request_body = {
        "model": TYPHOON_OCR_MODEL,
        "prompt": "สกัดข้อความภาษาไทยและอังกฤษทั้งหมดจากภาพนี้อย่างถูกต้อง รักษาโครงสร้างบรรทัดและการเว้นวรรคให้ใกล้เคียงต้นฉบับมากที่สุด ห้ามเพิ่มคำอธิบายใดๆ",
        "images": [encoded_png],
        "stream": False,
        "options": generation_options,
        "keep_alive": 0,
    }
    endpoint = f"{OLLAMA_API_URL}/api/generate"

    with httpx.Client(timeout=TYPHOON_OCR_TIMEOUT) as client:
        reply = client.post(endpoint, json=request_body)
        reply.raise_for_status()
        answer = reply.json().get("response", "")
    return str(answer).strip()


@app.post("/ocr", response_model=OcrResponse)
def ocr_extract(req: OcrRequest):
    """Run OCR on a file addressed by path (legacy endpoint).

    Used when the sidecar and the backend can reach the same storage.

    Raises:
        HTTPException: 404 when the path does not exist, 422 when PyMuPDF
            cannot open the file.
    """
    # NOTE(review): pdfPath comes straight from the request body and is opened
    # as-is — confirm this endpoint is only reachable by trusted callers.
    pdf_path = Path(req.pdfPath)
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
    selected_engine = (req.engine or "auto").strip().lower()
    max_pages = req.maxPages or MAX_PAGES
    try:
        doc = fitz.open(str(pdf_path))
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}") from e
    # Always release the document, even when OCR raises; the response holds
    # only plain strings/ints, so it stays valid after the close.
    try:
        return _process_pdf_doc(doc, selected_engine, max_pages)
    finally:
        doc.close()


@app.post("/ocr-upload", response_model=OcrResponse)
def ocr_upload(
    file: UploadFile = File(...),
    engine: str = Form(default="auto"),
    maxPages: int = Form(default=0),
):
    """Run OCR on a multipart file upload — no shared volume mount required.

    Args:
        file: Uploaded PDF; its bytes are read fully into memory.
        engine: Engine name, compared case-insensitively after trimming.
        maxPages: Page limit; 0 falls back to MAX_PAGES.

    Raises:
        HTTPException: 422 when PyMuPDF cannot open the uploaded bytes as a PDF.
    """
    selected_engine = engine.strip().lower()
    max_pages = maxPages or MAX_PAGES
    pdf_bytes = file.file.read()
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}") from e
    # Always release the document, even when OCR raises; the response holds
    # only plain strings/ints, so it stays valid after the close.
    try:
        logger.info(f"OCR upload: {file.filename} engine={selected_engine}")
        return _process_pdf_doc(doc, selected_engine, max_pages)
    finally:
        doc.close()


class NormalizeRequest(BaseModel):
    # JSON request body for POST /normalize.
    # Raw text to normalize and tokenize.
    text: str


class NormalizeResponse(BaseModel):
    # Response body for POST /normalize.
    # Space-joined tokens of the normalized text, or the unchanged input text
    # when normalization failed.
    normalized: str


@app.post("/normalize", response_model=NormalizeResponse)
def normalize_text(req: NormalizeRequest):
    """Normalize Thai text with PyThaiNLP for the rag-thai-preprocess queue.

    The text is normalized, segmented with the "newmm" engine (whitespace
    tokens dropped) and re-joined with single spaces for embedding. This is
    best-effort: on any failure a warning is logged and the raw input text is
    returned unchanged.
    """
    try:
        cleaned = thai_normalize(req.text)
        spaced = " ".join(word_tokenize(cleaned, engine="newmm", keep_whitespace=False))
        return NormalizeResponse(normalized=spaced)
    except Exception as exc:
        logger.warning(f"Thai normalize failed, returning raw text: {exc}")
        return NormalizeResponse(normalized=req.text)


if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    # The port comes from OCR_PORT (default 8765).
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("OCR_PORT", "8765")))