refactor(ai): OCR sidecar canonical naming cleanup — typhoon→np-dms, remove hardcoded keys, asyncio.to_thread, ADR-040/041
This commit is contained in:
@@ -0,0 +1,114 @@
|
||||
# File: tests/unit/ocr-sidecar/test_path_traversal.py
|
||||
# Change Log:
|
||||
# - 2026-06-20: Added ADR-040 path traversal tests for OCR sidecar.
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
# Directory that holds the OCR sidecar sources, located relative to this test
# file: the fourth ancestor of the file, then down into the compose specs.
SIDECAR_DIR = Path(__file__).resolve().parents[3].joinpath(
    "specs",
    "04-Infrastructure-OPS",
    "04-00-docker-compose",
    "Desk-5439",
    "ocr-sidecar",
)
|
||||
|
||||
|
||||
def install_import_stubs() -> None:
    """Install stubs for heavy dependencies so unit tests can import the app quickly.

    Lightweight placeholder modules are registered in ``sys.modules`` under the
    names ``fitz``, ``typhoon_ocr``, ``FlagEmbedding``, ``PIL``/``PIL.Image``
    and ``pythainlp``/``pythainlp.tokenize``/``pythainlp.util``. Any module
    already registered under one of those names is replaced.
    """
    # PDF library stub: ``open`` returns None unless a test patches it.
    fitz_module = types.ModuleType("fitz")
    fitz_module.Document = object
    fitz_module.open = lambda *args, **kwargs: None
    sys.modules["fitz"] = fitz_module

    # OCR helper stub: always yields one message with an empty content list.
    typhoon_module = types.ModuleType("typhoon_ocr")
    typhoon_module.prepare_ocr_messages = lambda *args, **kwargs: [{"content": []}]
    sys.modules["typhoon_ocr"] = typhoon_module

    # Embedding / reranker constructors are replaced by factories returning None.
    flag_module = types.ModuleType("FlagEmbedding")
    flag_module.BGEM3FlagModel = lambda *args, **kwargs: None
    flag_module.FlagReranker = lambda *args, **kwargs: None
    sys.modules["FlagEmbedding"] = flag_module

    pil_module = types.ModuleType("PIL")
    pil_image_module = types.ModuleType("PIL.Image")
    pil_module.Image = pil_image_module
    sys.modules["PIL"] = pil_module
    sys.modules["PIL.Image"] = pil_image_module

    pythainlp_module = types.ModuleType("pythainlp")
    tokenize_module = types.ModuleType("pythainlp.tokenize")
    tokenize_module.word_tokenize = lambda text, **kwargs: text.split()
    util_module = types.ModuleType("pythainlp.util")
    util_module.normalize = lambda text: text
    # Attach the submodules to the parent stub (as is done for PIL.Image) so
    # attribute-style access such as ``pythainlp.util.normalize`` also works;
    # registering them in sys.modules alone does not bind them on the parent.
    pythainlp_module.tokenize = tokenize_module
    pythainlp_module.util = util_module
    sys.modules["pythainlp"] = pythainlp_module
    sys.modules["pythainlp.tokenize"] = tokenize_module
    sys.modules["pythainlp.util"] = util_module
|
||||
|
||||
|
||||
def load_app(upload_base: Path):
    """Import and return a fresh ``app`` module configured for ``upload_base``.

    Installs the dependency stubs, sets ``OCR_SIDECAR_API_KEY`` to
    ``"test-key"`` and ``OCR_SIDECAR_UPLOAD_BASE`` to ``upload_base`` in the
    process environment, makes sure the sidecar directory is importable, and
    re-imports ``app`` from scratch.

    NOTE(review): the environment variables are not restored afterwards.
    """
    install_import_stubs()

    os.environ.update(
        {
            "OCR_SIDECAR_API_KEY": "test-key",
            "OCR_SIDECAR_UPLOAD_BASE": str(upload_base),
        }
    )

    sidecar_dir = str(SIDECAR_DIR)
    if sidecar_dir not in sys.path:
        sys.path.insert(0, sidecar_dir)

    # Discard any cached copy so the import below executes the module again
    # (presumably so it picks up the environment set above — confirm in app).
    sys.modules.pop("app", None)
    return importlib.import_module("app")
|
||||
|
||||
|
||||
class FakePage:
    """Minimal page stand-in whose extracted text is a fixed string."""

    def get_text(self) -> str:
        """Return 120 ``A`` characters as the page text."""
        page_text = "A" * 120
        return page_text
|
||||
|
||||
|
||||
class FakeDocument:
    """Single-page document stand-in used as the return value of a patched ``fitz.open``.

    Behaves like a one-element sequence of ``FakePage`` objects.
    """

    # File name exposed through the ``name`` attribute.
    name = "fake.pdf"

    def __len__(self) -> int:
        """Return the page count, which is always 1."""
        return 1

    def __getitem__(self, index: int) -> "FakePage":
        """Return a new ``FakePage`` for a valid page index.

        Args:
            index: Page index; negative values count from the end.

        Raises:
            IndexError: If ``index`` is outside ``-len(self) <= index < len(self)``.
        """
        page_count = len(self)
        # The class defines no __iter__, so ``for page in doc`` falls back to
        # calling __getitem__ with 0, 1, 2, ... until IndexError is raised.
        # Without this bounds check such a loop would never terminate.
        if not -page_count <= index < page_count:
            raise IndexError(f"page index {index} out of range")
        return FakePage()
|
||||
|
||||
|
||||
def test_ocr_rejects_parent_traversal_outside_upload_base(tmp_path: Path) -> None:
    """A ``pdfPath`` that climbs out of the upload base via ``..`` must get HTTP 403."""
    base_dir = tmp_path / "uploads"
    base_dir.mkdir()
    sidecar = load_app(base_dir)

    # Starts inside the upload base but ends up in its parent directory.
    escaping_path = base_dir / ".." / "outside.pdf"
    payload = {"pdfPath": str(escaping_path)}
    auth_headers = {"X-API-Key": "test-key"}

    result = TestClient(sidecar.app).post("/ocr", json=payload, headers=auth_headers)

    assert result.status_code == 403
|
||||
|
||||
|
||||
def test_ocr_rejects_prefix_sibling_path(tmp_path: Path) -> None:
    """A path in a sibling directory whose name merely starts with the base name must get HTTP 403.

    ``uploads_evil`` shares the string prefix ``uploads`` with the upload base
    but is a different directory.
    """
    base_dir = tmp_path / "uploads"
    lookalike_dir = tmp_path / "uploads_evil"
    base_dir.mkdir()
    lookalike_dir.mkdir()

    sidecar = load_app(base_dir)
    payload = {"pdfPath": str(lookalike_dir / "document.pdf")}
    auth_headers = {"X-API-Key": "test-key"}

    result = TestClient(sidecar.app).post("/ocr", json=payload, headers=auth_headers)

    assert result.status_code == 403
|
||||
|
||||
|
||||
def test_ocr_accepts_canonical_path_inside_upload_base(tmp_path: Path) -> None:
    """A file located directly inside the upload base is accepted.

    With ``fitz.open`` patched to return a ``FakeDocument``, the endpoint must
    answer HTTP 200 and report ``engineUsed`` as ``"fast-path"``.
    """
    base_dir = tmp_path / "uploads"
    base_dir.mkdir()
    target_pdf = base_dir / "document.pdf"
    target_pdf.write_bytes(b"%PDF-1.4\n")

    sidecar = load_app(base_dir)
    http = TestClient(sidecar.app)
    payload = {"pdfPath": str(target_pdf)}
    auth_headers = {"X-API-Key": "test-key"}

    # Replace the PDF opener on the freshly imported app's fitz module.
    with patch.object(sidecar.fitz, "open", return_value=FakeDocument()):
        result = http.post("/ocr", json=payload, headers=auth_headers)

    assert result.status_code == 200
    assert result.json()["engineUsed"] == "fast-path"
|
||||
|
||||
Reference in New Issue
Block a user