Compare commits
2 Commits
bd96c4122c
...
4a808dd9c4
| Author | SHA1 | Date | |
|---|---|---|---|
| 4a808dd9c4 | |||
| e71602e90c |
@@ -71,7 +71,7 @@ TESSERACT_PSM = os.getenv("TESSERACT_PSM", "3")
|
|||||||
# PSM 6 = Assume single column of text (ลด hallucination จาก noise)
|
# PSM 6 = Assume single column of text (ลด hallucination จาก noise)
|
||||||
# OEM 1 = LSTM only (ดีกว่า legacy engine)
|
# OEM 1 = LSTM only (ดีกว่า legacy engine)
|
||||||
TESSERACT_CONFIG = f"--psm {TESSERACT_PSM} --oem 1"
|
TESSERACT_CONFIG = f"--psm {TESSERACT_PSM} --oem 1"
|
||||||
# Crop margin: ตัด header/footer (บน 5%, ล่าง 2%)
|
# Crop margin: ตัด header/footer (บน 5%, ล่าง 2%)
|
||||||
CROP_TOP_RATIO = 0.05
|
CROP_TOP_RATIO = 0.05
|
||||||
CROP_BOTTOM_RATIO = 0.02
|
CROP_BOTTOM_RATIO = 0.02
|
||||||
# Enable aggressive preprocessing (Option 2) สำหรับ Tesseract
|
# Enable aggressive preprocessing (Option 2) สำหรับ Tesseract
|
||||||
@@ -341,7 +341,27 @@ def process_with_typhoon_ocr(pil_image: Image.Image, options_override: dict = {}
|
|||||||
}
|
}
|
||||||
payload = {
|
payload = {
|
||||||
"model": model_name,
|
"model": model_name,
|
||||||
"prompt": "", # SYSTEM instruction ใน Modelfile จัดการทั้งหมด
|
"prompt": """You are an expert in structuring Thai documents
|
||||||
|
|
||||||
|
Task: Extract the information from the image in the most correct and organized format.
|
||||||
|
|
||||||
|
Output Rules:
|
||||||
|
- Return ONLY clean Markdown output
|
||||||
|
- Include ALL information visible on the page
|
||||||
|
- Preserve document structure and hierarchy
|
||||||
|
- Do NOT add explanations or interpretations
|
||||||
|
- Do NOT include these instructions in your response
|
||||||
|
|
||||||
|
Formatting:
|
||||||
|
- Tables: Use HTML <table> tags
|
||||||
|
- Math: $inline$ and $$block$$ LaTeX
|
||||||
|
- Figures: <figure>Thai description</figure>
|
||||||
|
- Pages: <page_number>N</page_number>
|
||||||
|
- Boxes: ☐ / ☑
|
||||||
|
- Unclear: [unclear: context]
|
||||||
|
- Signatures/Stamps: Describe location and context
|
||||||
|
|
||||||
|
Extract all text from this image.""",
|
||||||
"images": [image_base64],
|
"images": [image_base64],
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": options,
|
"options": options,
|
||||||
|
|||||||
+111
-19
@@ -19,6 +19,7 @@
|
|||||||
# - 2026-06-04: ส่ง color image (ไม่ผ่าน preprocess_image) ไปยัง Typhoon OCR — vision model ต้องการ color ไม่ใช่ binarized grayscale
|
# - 2026-06-04: ส่ง color image (ไม่ผ่าน preprocess_image) ไปยัง Typhoon OCR — vision model ต้องการ color ไม่ใช่ binarized grayscale
|
||||||
# - 2026-06-04: เพิ่ม num_gpu:99 ใน Ollama options เพื่อบังคับ GPU layers (แก้ device=CPU ทั้งที่ VRAM พอ)
|
# - 2026-06-04: เพิ่ม num_gpu:99 ใน Ollama options เพื่อบังคับ GPU layers (แก้ device=CPU ทั้งที่ VRAM พอ)
|
||||||
# - 2026-06-02: เพิ่มการตรวจสอบ API Key (X-API-Key Header) สำหรับ endpoints หลัก เพื่อความมั่นคงปลอดภัยตามข้อเสนอแนะ Code Review
|
# - 2026-06-02: เพิ่มการตรวจสอบ API Key (X-API-Key Header) สำหรับ endpoints หลัก เพื่อความมั่นคงปลอดภัยตามข้อเสนอแนะ Code Review
|
||||||
|
# - 2026-06-05: เพิ่ม Option 2 (aggressive preprocessing: deskew + Otsu threshold + morphology) และ Option 3 (smart post-processing: regex-based hallucination removal) เพื่อลด Tesseract noise/hallucination (T025)
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
@@ -64,15 +65,21 @@ TYPHOON_OCR_MODEL = os.getenv("TYPHOON_OCR_MODEL", "typhoon-np-dms-ocr:latest")
|
|||||||
TYPHOON_OCR_TIMEOUT = int(os.getenv("TYPHOON_OCR_TIMEOUT", "360")) # รองรับ cold-start ~65s + inference ~30s/page
|
TYPHOON_OCR_TIMEOUT = int(os.getenv("TYPHOON_OCR_TIMEOUT", "360")) # รองรับ cold-start ~65s + inference ~30s/page
|
||||||
# DPI สำหรับ Typhoon OCR — ต่ำกว่า Tesseract เพราะ vision model ใช้ image patches (150 DPI ลด token ~4x)
|
# DPI สำหรับ Typhoon OCR — ต่ำกว่า Tesseract เพราะ vision model ใช้ image patches (150 DPI ลด token ~4x)
|
||||||
TYPHOON_OCR_DPI = int(os.getenv("TYPHOON_OCR_DPI", "150"))
|
TYPHOON_OCR_DPI = int(os.getenv("TYPHOON_OCR_DPI", "150"))
|
||||||
|
# PSM mode: 3 (default, fully automatic) หรือ 6 (assume single column, ลด noise)
|
||||||
|
TESSERACT_PSM = os.getenv("TESSERACT_PSM", "3")
|
||||||
# PSM 3 = Fully automatic page segmentation (เหมาะกับเอกสารที่มี layout หลายส่วน เช่น วันที่/เลขที่)
|
# PSM 3 = Fully automatic page segmentation (เหมาะกับเอกสารที่มี layout หลายส่วน เช่น วันที่/เลขที่)
|
||||||
|
# PSM 6 = Assume single column of text (ลด hallucination จาก noise)
|
||||||
# OEM 1 = LSTM only (ดีกว่า legacy engine)
|
# OEM 1 = LSTM only (ดีกว่า legacy engine)
|
||||||
TESSERACT_CONFIG = f"--psm 3 --oem 1"
|
TESSERACT_CONFIG = f"--psm {TESSERACT_PSM} --oem 1"
|
||||||
# Crop margin: ตัด header/footer (บน 5%, ล่าง 2%)
|
# Crop margin: ตัด header/footer (บน 5%, ล่าง 2%)
|
||||||
CROP_TOP_RATIO = 0.05
|
CROP_TOP_RATIO = 0.05
|
||||||
CROP_BOTTOM_RATIO = 0.02
|
CROP_BOTTOM_RATIO = 0.02
|
||||||
|
# Enable aggressive preprocessing (Option 2) สำหรับ Tesseract
|
||||||
|
USE_AGGRESSIVE_PREPROCESSING = os.getenv("TESSERACT_AGGRESSIVE_PREPROCESS", "true").lower() == "true"
|
||||||
|
# Enable smart post-processing (Option 3) สำหรับลบ hallucination
|
||||||
|
USE_SMART_CLEANING = os.getenv("TESSERACT_SMART_CLEAN", "true").lower() == "true"
|
||||||
|
|
||||||
logger.info(f"Tesseract OCR Sidecar initialized (lang={OCR_LANG}, config={TESSERACT_CONFIG})")
|
logger.info(f"Tesseract OCR Sidecar initialized (lang={OCR_LANG}, config={TESSERACT_CONFIG}, aggressive={USE_AGGRESSIVE_PREPROCESSING}, smart_clean={USE_SMART_CLEANING})")
|
||||||
|
|
||||||
|
|
||||||
def filter_ocr_noise(text: str) -> str:
|
def filter_ocr_noise(text: str) -> str:
|
||||||
"""Filter ขยะ OCR เช่น บรรทัดสั้น/สัญลักษณ์ที่ไม่มีความหมาย"""
|
"""Filter ขยะ OCR เช่น บรรทัดสั้น/สัญลักษณ์ที่ไม่มีความหมาย"""
|
||||||
@@ -112,9 +119,8 @@ def crop_header_footer(pil_image: Image.Image, top_ratio: float = 0.10, bottom_r
|
|||||||
cropped = pil_image.crop((0, top_crop, width, height - bottom_crop))
|
cropped = pil_image.crop((0, top_crop, width, height - bottom_crop))
|
||||||
return cropped
|
return cropped
|
||||||
|
|
||||||
|
|
||||||
def preprocess_image(pil_image: Image.Image) -> Image.Image:
|
def preprocess_image(pil_image: Image.Image) -> Image.Image:
|
||||||
"""Preprocess image ด้วย OpenCV เพื่อเพิ่มความแม่นยำ OCR"""
|
"""Preprocess image ด้วย OpenCV เพื่อเพิ่มความแม่นยำ OCR (แบบธรรมชาติ)"""
|
||||||
# แปลง PIL Image เป็น numpy array (OpenCV format)
|
# แปลง PIL Image เป็น numpy array (OpenCV format)
|
||||||
img_array = np.array(pil_image)
|
img_array = np.array(pil_image)
|
||||||
|
|
||||||
@@ -128,13 +134,93 @@ def preprocess_image(pil_image: Image.Image) -> Image.Image:
|
|||||||
# แปลงกลับเป็น PIL Image
|
# แปลงกลับเป็น PIL Image
|
||||||
return Image.fromarray(denoised)
|
return Image.fromarray(denoised)
|
||||||
|
|
||||||
|
def _estimate_skew_angle(segments, max_abs_angle: float = 45.0):
    """Return the median tilt, in degrees, of the near-horizontal Hough segments.

    Args:
        segments: Output of ``cv2.HoughLinesP`` (array of shape (N, 1, 4) holding
            x1, y1, x2, y2 per segment) or ``None`` when nothing was detected.
        max_abs_angle: Segments steeper than this (in degrees) are ignored so that
            vertical rules such as table borders cannot dominate the estimate.

    Returns:
        The median angle in degrees, or ``None`` when no usable segment exists.
    """
    if segments is None or len(segments) == 0:
        return None

    # HoughLinesP wraps every segment in an extra axis; flatten to rows of x1, y1, x2, y2.
    pts = np.asarray(segments, dtype=np.float64).reshape(-1, 4)
    angles = np.degrees(np.arctan2(pts[:, 3] - pts[:, 1], pts[:, 2] - pts[:, 0]))

    # A segment has no direction, so fold every angle into [-90, 90).
    angles = (angles + 90.0) % 180.0 - 90.0

    angles = angles[np.abs(angles) <= max_abs_angle]
    if angles.size == 0:
        return None
    return float(np.median(angles))


def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
    """
    Aggressive preprocessing (Option 2) intended to reduce OCR hallucination.

    Steps:
        1. Deskew when the page is tilted by more than 0.5 degrees
           (angle estimated from Canny edges + probabilistic Hough lines).
        2. Denoise with a median blur followed by a bilateral filter.
        3. Binarise with an Otsu threshold (threshold chosen from the image).
        4. Morphological open + close with a 2x2 elliptical kernel.

    Args:
        pil_image: Page image; it is converted to RGB before processing.

    Returns:
        A single-channel binarised PIL image.
    """
    # cvtColor(RGB2GRAY) needs a colour array; grayscale/palette inputs would raise otherwise.
    img_array = np.array(pil_image.convert("RGB"))
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

    # 1. Deskew (best effort: any failure is logged and the unrotated image is used).
    try:
        edges = cv2.Canny(gray, 100, 200)
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)
        angle = _estimate_skew_angle(lines)
        if angle is not None and abs(angle) > 0.5:  # ignore tilt of 0.5 degrees or less
            h, w = gray.shape
            # Image y grows downwards, so a positive measured angle is a clockwise tilt;
            # getRotationMatrix2D rotates counter-clockwise for positive angles, undoing it.
            M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
            gray = cv2.warpAffine(gray, M, (w, h), borderMode=cv2.BORDER_REFLECT)
            logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
    except Exception as e:
        logger.warning(f"[PREPROCESS] Deskew failed: {e}")

    # 2. Denoise: median blur + bilateral filter
    denoised = cv2.medianBlur(gray, 3)
    denoised = cv2.bilateralFilter(denoised, 9, 75, 75)

    # 3. Otsu threshold (computed from the image rather than a fixed value)
    _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 4. Morphological operations to remove small speckle artifacts
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
    morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)   # remove small white noise
    morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel)   # fill small black holes

    logger.info("[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
    return Image.fromarray(morph)
|
||||||
|
|
||||||
|
def clean_ocr_output(text: str) -> str:
    """
    Smart post-processing (Option 3): drop lines that look like Tesseract hallucinations.

    Every line is stripped and empty lines are discarded. A remaining line is
    rejected when any of the following holds (checked in this order):

    1. Fewer than 2 characters are left after removing everything that is not a
       word character or a Thai-block character (pure symbol/punctuation lines).
    2. It is a repeated pattern: the number of distinct characters is below
       ``max(2, min(len(line) // 5, 5))`` (e.g. "-----", ">>>>>>>").
    3. Less than 20% of its characters are Thai-block characters or ASCII
       alphanumerics.

    Args:
        text: Raw OCR text with newline-separated lines.

    Returns:
        The surviving lines, stripped and joined with newlines.
    """
    # Upper bound for the repeated-pattern threshold. Without a cap the threshold
    # (len // 5) keeps growing with line length, while real text only ever uses a
    # bounded set of distinct characters, so long legitimate lines were discarded.
    max_unique_threshold = 5
    non_word = re.compile(r'[^\w\u0E00-\u0E7F]')

    lines = text.split("\n")
    cleaned = []

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        # Reject lines made only of symbols/punctuation (no real characters).
        alphanumeric_part = non_word.sub('', line)
        if len(alphanumeric_part) < 2:
            logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
            continue

        # Reject repeated patterns: very few distinct characters for the line length.
        unique_chars = len(set(line))
        if unique_chars < max(2, min(len(line) // 5, max_unique_threshold)):
            logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
            continue

        # Reject lines with < 20% Thai-block / ASCII alphanumeric characters.
        thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
        eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
        if (thai_chars + eng_chars) / len(line) < 0.2:
            logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
            continue

        # Line passed every check.
        cleaned.append(line)

    result = "\n".join(cleaned)
    logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
    return result
|
||||||
|
|
||||||
class OcrRequest(BaseModel):
|
class OcrRequest(BaseModel):
|
||||||
pdfPath: str
|
pdfPath: str
|
||||||
maxPages: Optional[int] = None
|
maxPages: Optional[int] = None
|
||||||
engine: Optional[str] = None
|
engine: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class OcrResponse(BaseModel):
|
class OcrResponse(BaseModel):
|
||||||
text: str
|
text: str
|
||||||
ocrUsed: bool
|
ocrUsed: bool
|
||||||
@@ -142,16 +228,17 @@ class OcrResponse(BaseModel):
|
|||||||
charCount: int
|
charCount: int
|
||||||
engineUsed: str
|
engineUsed: str
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health():
|
def health():
|
||||||
return {
|
return {
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"engines": ["tesseract", "typhoon-np-dms-ocr"],
|
"engines": ["tesseract", "typhoon-np-dms-ocr"],
|
||||||
"typhoonModel": TYPHOON_OCR_MODEL,
|
"typhoonModel": TYPHOON_OCR_MODEL,
|
||||||
|
"tesseractConfig": TESSERACT_CONFIG,
|
||||||
|
"aggressivePreprocess": USE_AGGRESSIVE_PREPROCESSING,
|
||||||
|
"smartCleaning": USE_SMART_CLEANING,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# alias map สำหรับ engine name เก่า → canonical name
|
# alias map สำหรับ engine name เก่า → canonical name
|
||||||
_ENGINE_ALIASES: dict[str, str] = {
|
_ENGINE_ALIASES: dict[str, str] = {
|
||||||
"typhoon-ocr1.5-3b": "typhoon-np-dms-ocr",
|
"typhoon-ocr1.5-3b": "typhoon-np-dms-ocr",
|
||||||
@@ -159,7 +246,6 @@ _ENGINE_ALIASES: dict[str, str] = {
|
|||||||
"typhoon_ocr": "typhoon-np-dms-ocr",
|
"typhoon_ocr": "typhoon-np-dms-ocr",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, typhoon_options: dict = {}) -> OcrResponse:
|
def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, typhoon_options: dict = {}) -> OcrResponse:
|
||||||
"""ประมวลผล fitz.Document ด้วย engine ที่เลือก — shared logic สำหรับ /ocr และ /ocr-upload"""
|
"""ประมวลผล fitz.Document ด้วย engine ที่เลือก — shared logic สำหรับ /ocr และ /ocr-upload"""
|
||||||
selected_engine = _ENGINE_ALIASES.get(selected_engine, selected_engine)
|
selected_engine = _ENGINE_ALIASES.get(selected_engine, selected_engine)
|
||||||
@@ -211,11 +297,24 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
|
|||||||
img_bytes = pix.tobytes("png")
|
img_bytes = pix.tobytes("png")
|
||||||
img = Image.open(io.BytesIO(img_bytes))
|
img = Image.open(io.BytesIO(img_bytes))
|
||||||
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
||||||
|
|
||||||
|
# Option 2: Choose preprocessing strategy
|
||||||
|
if USE_AGGRESSIVE_PREPROCESSING:
|
||||||
|
processed_img = preprocess_image_aggressive(cropped_img)
|
||||||
|
else:
|
||||||
processed_img = preprocess_image(cropped_img)
|
processed_img = preprocess_image(cropped_img)
|
||||||
|
|
||||||
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
|
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
|
||||||
ocr_text_parts.append(text.strip())
|
ocr_text_parts.append(text.strip())
|
||||||
|
|
||||||
ocr_text = filter_ocr_noise("\n".join(ocr_text_parts).strip())
|
ocr_text = "\n".join(ocr_text_parts).strip()
|
||||||
|
|
||||||
|
# Option 3: Apply smart post-processing
|
||||||
|
if USE_SMART_CLEANING:
|
||||||
|
ocr_text = clean_ocr_output(ocr_text)
|
||||||
|
else:
|
||||||
|
ocr_text = filter_ocr_noise(ocr_text)
|
||||||
|
|
||||||
logger.info(f"Tesseract extracted {len(ocr_text)} chars")
|
logger.info(f"Tesseract extracted {len(ocr_text)} chars")
|
||||||
return OcrResponse(
|
return OcrResponse(
|
||||||
text=ocr_text,
|
text=ocr_text,
|
||||||
@@ -225,7 +324,6 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
|
|||||||
engineUsed="tesseract",
|
engineUsed="tesseract",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def process_with_typhoon_ocr(pil_image: Image.Image, options_override: dict = {}) -> str:
|
def process_with_typhoon_ocr(pil_image: Image.Image, options_override: dict = {}) -> str:
|
||||||
"""เรียก Typhoon OCR ผ่าน Ollama — ใช้ SYSTEM ใน Modelfile เป็น instruction หลัก; options_override ยัง override ค่า Modelfile ได้"""
|
"""เรียก Typhoon OCR ผ่าน Ollama — ใช้ SYSTEM ใน Modelfile เป็น instruction หลัก; options_override ยัง override ค่า Modelfile ได้"""
|
||||||
model_name = TYPHOON_OCR_MODEL
|
model_name = TYPHOON_OCR_MODEL
|
||||||
@@ -243,7 +341,7 @@ def process_with_typhoon_ocr(pil_image: Image.Image, options_override: dict = {}
|
|||||||
}
|
}
|
||||||
payload = {
|
payload = {
|
||||||
"model": model_name,
|
"model": model_name,
|
||||||
"prompt": "Extract all text from this image.",
|
"prompt": "", # SYSTEM instruction ใน Modelfile จัดการทั้งหมด
|
||||||
"images": [image_base64],
|
"images": [image_base64],
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": options,
|
"options": options,
|
||||||
@@ -267,7 +365,6 @@ def process_with_typhoon_ocr(pil_image: Image.Image, options_override: dict = {}
|
|||||||
)
|
)
|
||||||
return result_text
|
return result_text
|
||||||
|
|
||||||
|
|
||||||
@app.post("/ocr", response_model=OcrResponse, dependencies=[Depends(get_api_key)])
|
@app.post("/ocr", response_model=OcrResponse, dependencies=[Depends(get_api_key)])
|
||||||
def ocr_extract(req: OcrRequest):
|
def ocr_extract(req: OcrRequest):
|
||||||
"""OCR จาก path (legacy — ใช้เมื่อ sidecar และ backend เข้าถึง storage เดียวกัน)"""
|
"""OCR จาก path (legacy — ใช้เมื่อ sidecar และ backend เข้าถึง storage เดียวกัน)"""
|
||||||
@@ -282,7 +379,6 @@ def ocr_extract(req: OcrRequest):
|
|||||||
raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
|
raise HTTPException(status_code=422, detail=f"เปิดไฟล์ PDF ล้มเหลว: {e}")
|
||||||
return _process_pdf_doc(doc, selected_engine, max_pages)
|
return _process_pdf_doc(doc, selected_engine, max_pages)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/ocr-upload", response_model=OcrResponse, dependencies=[Depends(get_api_key)])
|
@app.post("/ocr-upload", response_model=OcrResponse, dependencies=[Depends(get_api_key)])
|
||||||
def ocr_upload(
|
def ocr_upload(
|
||||||
file: UploadFile = File(...),
|
file: UploadFile = File(...),
|
||||||
@@ -311,15 +407,12 @@ def ocr_upload(
|
|||||||
logger.info(f"OCR upload: {file.filename} engine={selected_engine} options={typhoon_options or 'modelfile-defaults'}")
|
logger.info(f"OCR upload: {file.filename} engine={selected_engine} options={typhoon_options or 'modelfile-defaults'}")
|
||||||
return _process_pdf_doc(doc, selected_engine, max_pages, typhoon_options)
|
return _process_pdf_doc(doc, selected_engine, max_pages, typhoon_options)
|
||||||
|
|
||||||
|
|
||||||
class NormalizeRequest(BaseModel):
|
class NormalizeRequest(BaseModel):
|
||||||
text: str
|
text: str
|
||||||
|
|
||||||
|
|
||||||
class NormalizeResponse(BaseModel):
|
class NormalizeResponse(BaseModel):
|
||||||
normalized: str
|
normalized: str
|
||||||
|
|
||||||
|
|
||||||
@app.post("/normalize", response_model=NormalizeResponse, dependencies=[Depends(get_api_key)])
|
@app.post("/normalize", response_model=NormalizeResponse, dependencies=[Depends(get_api_key)])
|
||||||
def normalize_text(req: NormalizeRequest):
|
def normalize_text(req: NormalizeRequest):
|
||||||
"""Normalize Thai text ด้วย PyThaiNLP สำหรับ rag-thai-preprocess queue"""
|
"""Normalize Thai text ด้วย PyThaiNLP สำหรับ rag-thai-preprocess queue"""
|
||||||
@@ -333,7 +426,6 @@ def normalize_text(req: NormalizeRequest):
|
|||||||
logger.warning(f"Thai normalize failed, returning raw text: {e}")
|
logger.warning(f"Thai normalize failed, returning raw text: {e}")
|
||||||
return NormalizeResponse(normalized=req.text)
|
return NormalizeResponse(normalized=req.text)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
port = int(os.getenv("OCR_PORT", "8765"))
|
port = int(os.getenv("OCR_PORT", "8765"))
|
||||||
|
|||||||
-19
@@ -5,22 +5,3 @@ PARAMETER num_predict 4096
|
|||||||
PARAMETER temperature 0.1
|
PARAMETER temperature 0.1
|
||||||
PARAMETER top_p 0.1
|
PARAMETER top_p 0.1
|
||||||
PARAMETER repeat_penalty 1.1
|
PARAMETER repeat_penalty 1.1
|
||||||
|
|
||||||
SYSTEM """You are an expert in structuring Thai documents
|
|
||||||
|
|
||||||
Task: Extract the information from the image in the most correct and organized format
|
|
||||||
|
|
||||||
Output Rules:
|
|
||||||
- Return ONLY clean Markdown output
|
|
||||||
- Include ALL information visible on the page
|
|
||||||
- Preserve document structure and hierarchy
|
|
||||||
- Do NOT add explanations or interpretations
|
|
||||||
|
|
||||||
Formatting:
|
|
||||||
- Tables: Use HTML <table> tags
|
|
||||||
- Math: $inline$ and $$block$$ LaTeX
|
|
||||||
- Figures: <figure>Thai description</figure>
|
|
||||||
- Pages: <page_number>N</page_number>
|
|
||||||
- Boxes: ☐ / ☑
|
|
||||||
- Unclear: [unclear: context]
|
|
||||||
- Signatures/Stamps: Describe location and context"""
|
|
||||||
|
|||||||
Reference in New Issue
Block a user