690605:0922 ADR-034-134 #10.2 [skip CI]
CI / CD Pipeline / build (push) Has been skipped
CI / CD Pipeline / deploy (push) Has been skipped

This commit is contained in:
2026-06-05 09:22:41 +07:00
parent 8b6ef392f5
commit 2db4810dfc
2 changed files with 18 additions and 23 deletions
@@ -147,7 +147,7 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
"""
img_array = np.array(pil_image)
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
# 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines)
try:
edges = cv2.Canny(gray, 100, 200)
@@ -162,19 +162,19 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
except Exception as e:
logger.warning(f"[PREPROCESS] Deskew failed: {e}")
# 2. Denoise — median blur + bilateral filter
denoised = cv2.medianBlur(gray, 3)
denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
# 3. Otsu threshold (adaptive, ไม่ fixed value)
_, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) # ลบ small white noise
morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel) # ลบ small black hole
logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
return Image.fromarray(morph)
@@ -188,34 +188,34 @@ def clean_ocr_output(text: str) -> str:
"""
lines = text.split("\n")
cleaned = []
for line in lines:
line = line.strip()
if not line:
continue
# ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร
alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line)
if len(alphanumeric_part) < 2:
logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
continue
# ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>")
unique_chars = len(set(line))
if unique_chars < max(2, len(line) // 5):
logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
continue
# ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric)
thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2:
logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
continue
# ✓ ปล่อยผ่าน
cleaned.append(line)
result = "\n".join(cleaned)
logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
return result
@@ -306,24 +306,24 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
img_bytes = pix.tobytes("png")
img = Image.open(io.BytesIO(img_bytes))
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
# Option 2: Choose preprocessing strategy
if USE_AGGRESSIVE_PREPROCESSING:
processed_img = preprocess_image_aggressive(cropped_img)
else:
processed_img = preprocess_image(cropped_img)
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
ocr_text_parts.append(text.strip())
ocr_text = "\n".join(ocr_text_parts).strip()
# Option 3: Apply smart post-processing
if USE_SMART_CLEANING:
ocr_text = clean_ocr_output(ocr_text)
else:
ocr_text = filter_ocr_noise(ocr_text)
logger.info(f"Tesseract extracted {len(ocr_text)} chars")
return OcrResponse(
text=ocr_text,
@@ -6,21 +6,16 @@ PARAMETER temperature 0.1
PARAMETER top_p 0.1
PARAMETER repeat_penalty 1.1
SYSTEM """You are an expert in structuring Thai documents.
Extract the information from the image in the most correct and organized format.
SYSTEM """You are an expert in structuring Thai documents. Extract the information from the image in the most correct and organized format.
Instructions:
- Return ONLY clean Markdown output.
- Include ALL information visible on the page.
- Preserve document structure and hierarchy.
- Do NOT add explanations or interpretations.
Formatting Rules:
- Do NOT add explanations or interpretations. Formatting Rules:
- Tables: Render tables using <table>...</table> in clean HTML format.
- Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$).
- Images/Charts/Diagrams: Wrap any clearly defined visual areas in:
<figure>
Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
<figure> Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
</figure>
- Page Numbers: Wrap page numbers in <page_number>...</page_number>.
- Checkboxes: Use ☐ for unchecked and ☑ for checked boxes.