refactor(ai): OCR sidecar canonical naming cleanup — typhoon→np-dms, remove hardcoded keys, asyncio.to_thread, ADR-040/041
CI / CD Pipeline / build (push) Successful in 7m37s
CI / CD Pipeline / deploy (push) Failing after 20m15s

This commit is contained in:
2026-06-20 16:37:04 +07:00
parent d418d791a4
commit a80ebef285
70 changed files with 5762 additions and 452 deletions
@@ -0,0 +1,42 @@
# File: tests/unit/ocr-sidecar/test_api_key_validation.py
# Change Log:
# - 2026-06-20: Added ADR-040 API key startup and request validation tests.
import importlib
import os
import sys
from pathlib import Path
import pytest
from fastapi.testclient import TestClient
from test_path_traversal import SIDECAR_DIR, install_import_stubs, load_app
def test_sidecar_fails_fast_when_api_key_missing(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Importing the sidecar ``app`` module without an API key must raise.

    The test removes ``OCR_SIDECAR_API_KEY`` from the environment, forces a
    fresh import of ``app`` and expects a ``RuntimeError`` whose message
    contains ``OCR_SIDECAR_API_KEY is required``.
    """
    install_import_stubs()
    monkeypatch.delenv("OCR_SIDECAR_API_KEY", raising=False)
    monkeypatch.setenv("OCR_SIDECAR_UPLOAD_BASE", str(tmp_path))
    # Route the sys.path / sys.modules changes through monkeypatch as well so
    # they are undone at teardown instead of leaking into later tests.
    monkeypatch.syspath_prepend(str(SIDECAR_DIR))
    # Drop any cached copy so the import below re-runs module-level code.
    monkeypatch.delitem(sys.modules, "app", raising=False)
    with pytest.raises(RuntimeError, match="OCR_SIDECAR_API_KEY is required"):
        importlib.import_module("app")
def test_sidecar_rejects_invalid_api_key(tmp_path: Path) -> None:
    """POST /embed with a wrong ``X-API-Key`` header is expected to return 401."""
    sidecar = load_app(tmp_path)
    bad_headers = {"X-API-Key": "wrong-key"}
    body = {"text": "hello"}
    reply = TestClient(sidecar.app).post("/embed", json=body, headers=bad_headers)
    assert reply.status_code == 401
def test_sidecar_rejects_missing_api_key(tmp_path: Path) -> None:
    """POST /embed without any ``X-API-Key`` header is expected to return 401."""
    sidecar = load_app(tmp_path)
    body = {"text": "hello"}
    # No headers argument: the request carries no API key at all.
    reply = TestClient(sidecar.app).post("/embed", json=body)
    assert reply.status_code == 401
@@ -0,0 +1,114 @@
# File: tests/unit/ocr-sidecar/test_path_traversal.py
# Change Log:
# - 2026-06-20: Added ADR-040 path traversal tests for OCR sidecar.
import importlib
import os
import sys
import types
from pathlib import Path
from unittest.mock import patch
from fastapi.testclient import TestClient
# Directory holding the OCR sidecar sources that the tests import ``app`` from.
# parents[3] is three levels above the folder containing this file
# (presumably the repository root -- confirm if the test tree is moved).
SIDECAR_DIR = Path(__file__).resolve().parents[3].joinpath(
    "specs",
    "04-Infrastructure-OPS",
    "04-00-docker-compose",
    "Desk-5439",
    "ocr-sidecar",
)
def install_import_stubs() -> None:
    """Install stubs for heavy dependencies so unit tests can import ``app`` quickly.

    Lightweight placeholder modules are registered in ``sys.modules`` for
    ``fitz``, ``typhoon_ocr``, ``FlagEmbedding``, ``PIL`` / ``PIL.Image`` and
    ``pythainlp`` (with its ``tokenize`` and ``util`` submodules). Existing
    entries under those names are overwritten and are not restored.
    """
    fitz_module = types.ModuleType("fitz")
    fitz_module.Document = object
    fitz_module.open = lambda *args, **kwargs: None
    sys.modules["fitz"] = fitz_module

    typhoon_module = types.ModuleType("typhoon_ocr")
    typhoon_module.prepare_ocr_messages = lambda *args, **kwargs: [{"content": []}]
    sys.modules["typhoon_ocr"] = typhoon_module

    flag_module = types.ModuleType("FlagEmbedding")
    flag_module.BGEM3FlagModel = lambda *args, **kwargs: None
    flag_module.FlagReranker = lambda *args, **kwargs: None
    sys.modules["FlagEmbedding"] = flag_module

    pil_module = types.ModuleType("PIL")
    pil_image_module = types.ModuleType("PIL.Image")
    pil_module.Image = pil_image_module
    sys.modules["PIL"] = pil_module
    sys.modules["PIL.Image"] = pil_image_module

    pythainlp_module = types.ModuleType("pythainlp")
    tokenize_module = types.ModuleType("pythainlp.tokenize")
    tokenize_module.word_tokenize = lambda text, **kwargs: text.split()
    util_module = types.ModuleType("pythainlp.util")
    util_module.normalize = lambda text: text
    # Bind the submodules on the parent, as done for PIL above. The import
    # system only sets these attributes when it loads a submodule itself, so
    # without this `pythainlp.tokenize.word_tokenize(...)` and
    # `from pythainlp import util` would fail against the stubs.
    pythainlp_module.tokenize = tokenize_module
    pythainlp_module.util = util_module
    sys.modules["pythainlp"] = pythainlp_module
    sys.modules["pythainlp.tokenize"] = tokenize_module
    sys.modules["pythainlp.util"] = util_module
def load_app(upload_base: Path):
    """Import and return a fresh copy of the sidecar ``app`` module for tests.

    Installs the dependency stubs, sets ``OCR_SIDECAR_API_KEY`` to
    ``"test-key"`` and ``OCR_SIDECAR_UPLOAD_BASE`` to *upload_base* in the
    process environment, makes sure ``SIDECAR_DIR`` is importable, and
    re-imports ``app`` so its module-level code runs again.

    Args:
        upload_base: Directory exported as ``OCR_SIDECAR_UPLOAD_BASE``.

    Returns:
        The newly imported ``app`` module.
    """
    install_import_stubs()
    os.environ.update(
        {
            "OCR_SIDECAR_API_KEY": "test-key",
            "OCR_SIDECAR_UPLOAD_BASE": str(upload_base),
        }
    )
    sidecar_path = str(SIDECAR_DIR)
    if sidecar_path not in sys.path:
        sys.path.insert(0, sidecar_path)
    # Forget any previously imported copy so the import below is a fresh one.
    sys.modules.pop("app", None)
    return importlib.import_module("app")
class FakePage:
    """Minimal stand-in for a PDF page object that yields fixed text."""

    def get_text(self, *args, **kwargs) -> str:
        """Return 120 ``"A"`` characters regardless of the arguments.

        Positional and keyword arguments are accepted and ignored so the fake
        keeps working when the caller passes extraction options (the real
        PyMuPDF ``Page.get_text`` takes an option string and keyword flags).
        """
        return "A" * 120
class FakeDocument:
    """Minimal stand-in for an opened PDF document containing one page."""

    # File name reported by the fake document.
    name = "fake.pdf"

    def __len__(self) -> int:
        """Return the page count, which is always 1."""
        return 1

    def __getitem__(self, index: int) -> "FakePage":
        """Return a ``FakePage`` for a valid page index.

        Raises:
            IndexError: If *index* is outside ``[-len(self), len(self))``.
                Without this, ``for page in document`` (which falls back to
                ``__getitem__`` with 0, 1, 2, ...) would never terminate.
        """
        page_count = len(self)
        if not -page_count <= index < page_count:
            raise IndexError(f"page index {index} out of range")
        return FakePage()
def test_ocr_rejects_parent_traversal_outside_upload_base(tmp_path: Path) -> None:
    """POST /ocr with a ``..`` path that escapes the upload base expects 403."""
    base_dir = tmp_path / "uploads"
    base_dir.mkdir()
    sidecar = load_app(base_dir)
    # "<base>/../outside.pdf" points at a sibling of the upload directory.
    escaping_path = base_dir / ".." / "outside.pdf"
    payload = {"pdfPath": str(escaping_path)}
    auth = {"X-API-Key": "test-key"}
    reply = TestClient(sidecar.app).post("/ocr", json=payload, headers=auth)
    assert reply.status_code == 403
def test_ocr_rejects_prefix_sibling_path(tmp_path: Path) -> None:
    """POST /ocr with a path in a sibling directory sharing the base's name prefix expects 403.

    ``uploads_evil`` starts with the same characters as ``uploads``, so a
    plain string-prefix comparison would wrongly accept it.
    """
    base_dir = tmp_path / "uploads"
    lookalike_dir = tmp_path / "uploads_evil"
    for directory in (base_dir, lookalike_dir):
        directory.mkdir()
    sidecar = load_app(base_dir)
    payload = {"pdfPath": str(lookalike_dir / "document.pdf")}
    auth = {"X-API-Key": "test-key"}
    reply = TestClient(sidecar.app).post("/ocr", json=payload, headers=auth)
    assert reply.status_code == 403
def test_ocr_accepts_canonical_path_inside_upload_base(tmp_path: Path) -> None:
    """POST /ocr with a file inside the upload base expects 200 and the fast-path engine.

    ``fitz.open`` is patched to return a ``FakeDocument`` so no real PDF
    parsing takes place.
    """
    base_dir = tmp_path / "uploads"
    base_dir.mkdir()
    target_pdf = base_dir / "document.pdf"
    target_pdf.write_bytes(b"%PDF-1.4\n")
    sidecar = load_app(base_dir)
    http = TestClient(sidecar.app)
    payload = {"pdfPath": str(target_pdf)}
    auth = {"X-API-Key": "test-key"}
    fake_open = patch.object(sidecar.fitz, "open", return_value=FakeDocument())
    with fake_open:
        reply = http.post("/ocr", json=payload, headers=auth)
    assert reply.status_code == 200
    assert reply.json()["engineUsed"] == "fast-path"
@@ -0,0 +1,81 @@
# File: tests/unit/ocr-sidecar/test_residency_wiring.py
# Change Log:
# - 2026-06-20: Added ADR-040 residency wiring tests for process_ocr.
# - 2026-06-20: Updated for async process_ocr (Phase 6 — async I/O refactor).
import asyncio
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import pytest
from test_path_traversal import load_app
class FakeAsyncResponse:
    """Simulates an httpx.AsyncClient response for the async process_ocr."""

    def raise_for_status(self) -> None:
        """Do nothing: the fake response never signals an HTTP error."""
        return None

    def json(self) -> dict:
        """Return a fixed body with a single choice whose content is a JSON string."""
        message = {"content": "{\"natural_text\": \"ok\"}"}
        only_choice = {"message": message}
        return {"choices": [only_choice]}
class FakeAsyncClient:
    """Simulates httpx.AsyncClient for the async process_ocr.

    The JSON body of the most recent ``post`` call is kept on the instance
    (``payload``) and on the class (``last_payload``).
    """

    # Body of the latest POST made through any instance; reset by __init__.
    last_payload = None

    def __init__(self, *args, **kwargs) -> None:
        """Accept and ignore any constructor arguments; clear recorded payloads."""
        FakeAsyncClient.last_payload = None
        self.payload = None

    async def post(self, url: str, json: dict, headers: dict) -> FakeAsyncResponse:
        """Record *json* as the latest payload and return a canned response."""
        FakeAsyncClient.last_payload = json
        self.payload = json
        return FakeAsyncResponse()

    async def aclose(self) -> None:
        """No-op close; the fake holds no resources."""
        return None
def test_process_ocr_uses_calculated_residency_keep_alive(tmp_path: Path) -> None:
    """T019: process_ocr must call calculate_ocr_residency and use the computed keep_alive."""
    app_module = load_app(tmp_path)
    decision = SimpleNamespace(
        keep_alive_seconds=120,
        reason="headroom-sufficient",
        vram_headroom_mb=9000.0,
    )
    fake_client = FakeAsyncClient()
    # Build the three patchers up front, then enter them together.
    residency_patch = patch.object(app_module, "calculate_ocr_residency", return_value=decision)
    messages_patch = patch.object(app_module, "prepare_ocr_messages", return_value=[{"content": []}])
    client_patch = patch.object(app_module, "ollama_client", fake_client)
    with residency_patch as calculate, messages_patch, client_patch:
        result = asyncio.run(app_module.process_ocr("/tmp/test.pdf", page_num=1))
    assert result == "ok"
    calculate.assert_called_once_with(app_module.OCR_ACTIVE_PROFILE)
    assert FakeAsyncClient.last_payload["keep_alive"] == 120
def test_process_ocr_rejects_backend_keep_alive_override(tmp_path: Path) -> None:
    """T021: process_ocr must reject a keep_alive supplied by the backend."""
    app_module = load_app(tmp_path)

    async def _expect_rejection() -> None:
        # The ValueError is expected from inside the awaited coroutine.
        override = {"keep_alive": 0}
        with pytest.raises(ValueError, match="keep_alive must be calculated"):
            await app_module.process_ocr("/tmp/test.pdf", options_override=override)

    asyncio.run(_expect_rejection())
def test_ocr_endpoint_rejects_keep_alive_override(tmp_path: Path) -> None:
    """T021: the /ocr endpoint must reject keep_alive in the request body."""
    app_module = load_app(tmp_path)
    from fastapi.testclient import TestClient

    payload = {"pdfPath": str(tmp_path / "document.pdf"), "keep_alive": 0}
    auth = {"X-API-Key": "test-key"}
    reply = TestClient(app_module.app).post("/ocr", json=payload, headers=auth)
    assert reply.status_code == 400