diff --git a/backend/src/modules/ai/services/ocr.service.ts b/backend/src/modules/ai/services/ocr.service.ts
index 9967dacc..8b3324c3 100644
--- a/backend/src/modules/ai/services/ocr.service.ts
+++ b/backend/src/modules/ai/services/ocr.service.ts
@@ -1,6 +1,7 @@
 // File: src/modules/ai/services/ocr.service.ts
 // Change Log
 // - 2026-05-15: เพิ่ม OCR auto-detection service สำหรับ ADR-023A.
+// - 2026-05-25: Wrap axios failures (incl. AggregateError with an empty message) in an Error that carries clear context.
 
 import { Injectable, Logger } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
@@ -52,15 +53,44 @@ export class OcrService {
       return { text: extractedText, ocrUsed: false };
     }
 
-    const response = await axios.post(
-      `${this.ocrApiUrl}/ocr`,
-      { pdfPath: input.pdfPath },
-      { timeout: 90000 }
-    );
-
-    return {
-      text: response.data.text ?? '',
-      ocrUsed: true,
-    };
+    try {
+      const response = await axios.post(
+        `${this.ocrApiUrl}/ocr`,
+        { pdfPath: input.pdfPath },
+        { timeout: 90000 }
+      );
+      return {
+        text: response.data.text ?? '',
+        ocrUsed: true,
+      };
+    } catch (err: unknown) {
+      // The sidecar answered with an HTTP error (e.g. 404/422 from POST /ocr), so it is
+      // reachable: report the status instead of mislabelling it as "unreachable".
+      if (axios.isAxiosError(err) && err.response) {
+        throw new Error(
+          `PaddleOCR sidecar at ${this.ocrApiUrl} responded with HTTP ${err.response.status} — ${err.message}`
+        );
+      }
+      // Connection-level failure. Node can raise an AggregateError with an empty message
+      // (one sub-error per attempted address). Depending on the axios version it may arrive
+      // as-is, on `cause`, or as a wrapper carrying a copied `errors` array, so duck-type
+      // all three instead of relying on `instanceof AggregateError` alone.
+      const nested = [err, (err as { cause?: unknown } | null)?.cause]
+        .map((e) => (e as { errors?: unknown } | null | undefined)?.errors)
+        .find(
+          (errors): errors is unknown[] =>
+            Array.isArray(errors) && errors.length > 0
+        );
+      const cause = nested
+        ? nested
+            .map((e) => (e instanceof Error ? e.message : String(e)))
+            .join('; ')
+        : err instanceof Error
+          ? err.message || err.name
+          : String(err);
+      throw new Error(
+        `PaddleOCR sidecar unreachable at ${this.ocrApiUrl} — ${cause}`
+      );
+    }
   }
 }
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile
new file mode 100644
index 00000000..f2537a83
--- /dev/null
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile
@@ -0,0 +1,38 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/Dockerfile
+# PaddleOCR Sidecar — HTTP API server that extracts text from PDF/image files
+# Runs on Desk-5439 (GPU RTX 2060 Super 8GB) per ADR-023A
+# Change Log:
+# - 2026-05-25: Initial Dockerfile for the PaddleOCR sidecar (port 8765)
+
+FROM python:3.10-slim
+
+# Install system dependencies for PDF processing and image libraries, plus curl for the container health checks
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libglib2.0-0 \
+    libgl1 \
+    libgomp1 \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies
+# requirements.txt pins the CPU build (paddlepaddle); for GPU acceleration on the
+# RTX 2060 Super (CUDA 11.x) replace it with a matching paddlepaddle-gpu build
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy source code
+COPY app.py .
+
+# Pre-download the PaddleOCR models at build time (reduces cold-start time)
+# NOTE(review): only the English (lang='en') models are fetched; ADR-023A mentions Thai + English — confirm
+RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)"
+
+EXPOSE 8765
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8765/health || exit 1
+
+CMD ["python", "app.py"]
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
new file mode 100644
index 00000000..f18922e1
--- /dev/null
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
@@ -0,0 +1,156 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/app.py
+# PaddleOCR HTTP Sidecar API — accepts POST /ocr and returns the text extracted from a PDF/image
+# Per ADR-023A: OCR auto-detect (PyMuPDF chars > 100 → fast path, else PaddleOCR)
+# Change Log:
+# - 2026-05-25: Initial FastAPI server for the PaddleOCR sidecar
+
+import os
+import logging
+import re
+import fitz  # PyMuPDF
+from pathlib import Path
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from paddleocr import PaddleOCR
+from pythainlp.tokenize import word_tokenize
+from pythainlp.util import normalize as thai_normalize
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("ocr-sidecar")
+
+app = FastAPI(title="PaddleOCR Sidecar", version="1.0.0")
+
+# Read configuration from the environment
+OCR_CHAR_THRESHOLD = int(os.getenv("OCR_CHAR_THRESHOLD", "100"))
+USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
+MAX_PAGES = int(os.getenv("OCR_MAX_PAGES", "0"))  # 0 = all pages
+
+# Load the PaddleOCR model once at startup (reduces per-request latency)
+# NOTE(review): lang="en" selects the English models; confirm this is sufficient for Thai documents (ADR-023A)
+logger.info(f"Loading PaddleOCR model (use_gpu={USE_GPU})...")
+ocr_engine = PaddleOCR(
+    use_angle_cls=True,
+    lang="en",
+    use_gpu=USE_GPU,
+    show_log=False,
+)
+logger.info("PaddleOCR model loaded.")
+
+
+class OcrRequest(BaseModel):
+    pdfPath: str
+    maxPages: Optional[int] = None
+
+
+class OcrResponse(BaseModel):
+    text: str
+    ocrUsed: bool
+    pageCount: int
+    charCount: int
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok", "engine": "paddleocr"}
+
+
+@app.post("/ocr", response_model=OcrResponse)
+def ocr_extract(req: OcrRequest):
+    """Extract text from the PDF at ``req.pdfPath``.
+
+    Tries the embedded text layer first (fast path); if that yields no more than
+    OCR_CHAR_THRESHOLD characters, renders each page at 200 dpi and runs PaddleOCR on it.
+
+    Raises:
+        HTTPException: 404 if the path does not exist, 422 if PyMuPDF cannot open it.
+    """
+    pdf_path = Path(req.pdfPath)
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail=f"ไม่พบไฟล์: {req.pdfPath}")
+
+    # A missing, None or 0 maxPages falls back to the OCR_MAX_PAGES default (0 = all pages)
+    max_pages = req.maxPages or MAX_PAGES
+
+    try:
+        doc = fitz.open(str(pdf_path))
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
+
+    # Close the document on every exit path (fast-path return, slow-path completion or an
+    # exception) so its resources are released instead of accumulating across requests.
+    with doc:
+        pages_to_process = list(range(min(len(doc), max_pages) if max_pages > 0 else len(doc)))
+        page_count = len(pages_to_process)
+
+        # Fast path: try the embedded text layer first
+        fast_text_parts = []
+        for i in pages_to_process:
+            page = doc[i]
+            fast_text_parts.append(page.get_text())
+        fast_text = "\n".join(fast_text_parts).strip()
+        total_chars = len(fast_text)
+
+        if total_chars > OCR_CHAR_THRESHOLD:
+            logger.info(f"Fast path: {total_chars} chars extracted from {pdf_path.name}")
+            return OcrResponse(
+                text=fast_text,
+                ocrUsed=False,
+                pageCount=page_count,
+                charCount=total_chars,
+            )
+
+        # Slow path: run PaddleOCR on every selected page
+        logger.info(f"Slow path (PaddleOCR): {total_chars} chars too few for {pdf_path.name}")
+        ocr_text_parts = []
+        for i in pages_to_process:
+            page = doc[i]
+            pix = page.get_pixmap(dpi=200)
+            img_bytes = pix.tobytes("png")
+            result = ocr_engine.ocr(img_bytes, cls=True)
+            # The result and its entries may be empty/None, so fall back to [] before iterating
+            for line in result or []:
+                for word_info in line or []:
+                    if word_info and len(word_info) >= 2:
+                        text_part = word_info[1]
+                        if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
+                            ocr_text_parts.append(str(text_part[0]))
+
+    ocr_text = "\n".join(ocr_text_parts).strip()
+    logger.info(f"PaddleOCR extracted {len(ocr_text)} chars from {pdf_path.name}")
+
+    return OcrResponse(
+        text=ocr_text,
+        ocrUsed=True,
+        pageCount=page_count,
+        charCount=len(ocr_text),
+    )
+
+
+class NormalizeRequest(BaseModel):
+    text: str
+
+
+class NormalizeResponse(BaseModel):
+    normalized: str
+
+
+@app.post("/normalize", response_model=NormalizeResponse)
+def normalize_text(req: NormalizeRequest):
+    """Normalize Thai text with PyThaiNLP for the rag-thai-preprocess queue."""
+    try:
+        # Normalize unicode, tokenize, then re-join with spaces for embedding
+        normalized = thai_normalize(req.text)
+        tokens = word_tokenize(normalized, engine="newmm", keep_whitespace=False)
+        result = " ".join(tokens)
+        return NormalizeResponse(normalized=result)
+    except Exception as e:
+        logger.warning(f"Thai normalize failed, returning raw text: {e}")
+        return NormalizeResponse(normalized=req.text)
+
+
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("OCR_PORT", "8765"))
+    uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml
new file mode 100644
index 00000000..243532b0
--- /dev/null
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml
@@ -0,0 +1,43 @@
+# File: specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/docker-compose.yml
+# PaddleOCR Sidecar — runs on Desk-5439 (AI Isolation Host) per ADR-023A
+# Change Log:
+# - 2026-05-25: Initial compose file for the PaddleOCR HTTP sidecar
+#
+# How to run:
+#   docker compose up -d --build
+#
+# Test:
+#   curl http://localhost:8765/health
+
+name: lcbp3-ocr
+
+services:
+  ocr-sidecar:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: ocr-sidecar
+    restart: unless-stopped
+    ports:
+      - "8765:8765"
+    environment:
+      OCR_CHAR_THRESHOLD: "100"
+      OCR_PORT: "8765"
+      OCR_MAX_PAGES: "0"
+      # Set USE_GPU=true to use the RTX 2060 Super (requires nvidia-container-toolkit)
+      USE_GPU: "false"
+    volumes:
+      # Mount the same path the backend sees (permanent uploads)
+      # Must match the UPLOAD_PERMANENT_DIR the backend uses via the network share
+      - /share/np-dms-as/data/uploads:/mnt/uploads:ro
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8765/health', timeout=5)"]  # uses the image's Python; no curl dependency
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/requirements.txt b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/requirements.txt
new file mode 100644
index 00000000..bea30b64
--- /dev/null
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/Desk-5439/ocr-sidecar/requirements.txt
@@ -0,0 +1,9 @@
+paddlepaddle==2.6.2
+paddleocr==2.7.3
+PyMuPDF==1.24.0
+pytesseract==0.3.13
+fastapi==0.111.0
+uvicorn[standard]==0.30.1
+python-multipart==0.0.9
+pythainlp==5.0.4
+httpx==0.27.0
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/.env.example b/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/.env.example
index 5741e27a..0a220482 100644
--- a/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/.env.example
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/.env.example
@@ -80,13 +80,17 @@ CLAMAV_PORT=3310
 QDRANT_HOST=qdrant
 QDRANT_PORT=6333
 
-# Ollama (Admin Desktop Desk-5439 — ADR-018 AI boundary)
+# Ollama (Admin Desktop Desk-5439 — ADR-023A AI boundary)
 OLLAMA_EMBED_MODEL=nomic-embed-text
-OLLAMA_RAG_MODEL=gemma3:12b
-OLLAMA_URL=http://192.168.20.200:11434
+OLLAMA_MODEL_MAIN=gemma4:e2b
+OLLAMA_URL=http://192.168.10.100:11434
+
+# PaddleOCR Sidecar (Admin Desktop Desk-5439 — ADR-023A)
+OCR_API_URL=http://192.168.10.100:8765
+OCR_CHAR_THRESHOLD=100
 
 # Thai preprocessing microservice (PyThaiNLP — Admin Desktop)
-THAI_PREPROCESS_URL=http://192.168.20.200:8765
+THAI_PREPROCESS_URL=http://192.168.10.100:8765
 
 # Typhoon API (cloud LLM — PUBLIC/INTERNAL only, never CONFIDENTIAL)
 TYPHOON_API_KEY=your-typhoon-api-key-here
diff --git a/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/docker-compose-app.yml b/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/docker-compose-app.yml
index 99b00bc5..b26f6552 100644
--- a/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/docker-compose-app.yml
+++ b/specs/04-Infrastructure-OPS/04-00-docker-compose/QNAP/app/docker-compose-app.yml
@@ -95,6 +95,9 @@ services:
       QDRANT_HOST: 'qdrant'
       QDRANT_PORT: '6333'
       QDRANT_URL: 'http://qdrant:6333'
+      # --- PaddleOCR Sidecar (Desk-5439 — ADR-023A) ---
+      OCR_API_URL: ${OCR_API_URL:?OCR_API_URL required}
+      OCR_CHAR_THRESHOLD: ${OCR_CHAR_THRESHOLD:-100}
       # --- Numbering ---
       NUMBERING_LOCK_TIMEOUT: '5000'
       NUMBERING_RESERVATION_TTL: '300'