690605:0922 ADR-034-134 #10.2 [skip CI]
CI / CD Pipeline / build (push) Has been skipped
CI / CD Pipeline / deploy (push) Has been skipped

This commit is contained in:
2026-06-05 09:22:41 +07:00
parent 8b6ef392f5
commit 2db4810dfc
2 changed files with 18 additions and 23 deletions
@@ -147,7 +147,7 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
""" """
img_array = np.array(pil_image) img_array = np.array(pil_image)
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
# 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines) # 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines)
try: try:
edges = cv2.Canny(gray, 100, 200) edges = cv2.Canny(gray, 100, 200)
@@ -162,19 +162,19 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°") logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
except Exception as e: except Exception as e:
logger.warning(f"[PREPROCESS] Deskew failed: {e}") logger.warning(f"[PREPROCESS] Deskew failed: {e}")
# 2. Denoise — median blur + bilateral filter # 2. Denoise — median blur + bilateral filter
denoised = cv2.medianBlur(gray, 3) denoised = cv2.medianBlur(gray, 3)
denoised = cv2.bilateralFilter(denoised, 9, 75, 75) denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
# 3. Otsu threshold (adaptive, ไม่ fixed value) # 3. Otsu threshold (adaptive, ไม่ fixed value)
_, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts) # 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2)) kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) # ลบ small white noise morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) # ลบ small white noise
morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel) # ลบ small black hole morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel) # ลบ small black hole
logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied") logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
return Image.fromarray(morph) return Image.fromarray(morph)
@@ -188,34 +188,34 @@ def clean_ocr_output(text: str) -> str:
""" """
lines = text.split("\n") lines = text.split("\n")
cleaned = [] cleaned = []
for line in lines: for line in lines:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
# ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร # ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร
alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line) alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line)
if len(alphanumeric_part) < 2: if len(alphanumeric_part) < 2:
logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}") logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
continue continue
# ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>") # ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>")
unique_chars = len(set(line)) unique_chars = len(set(line))
if unique_chars < max(2, len(line) // 5): if unique_chars < max(2, len(line) // 5):
logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}") logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
continue continue
# ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric) # ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric)
thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F') thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
eng_chars = sum(1 for c in line if c.isascii() and c.isalnum()) eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2: if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2:
logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}") logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
continue continue
# ✓ ปล่อยผ่าน # ✓ ปล่อยผ่าน
cleaned.append(line) cleaned.append(line)
result = "\n".join(cleaned) result = "\n".join(cleaned)
logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines") logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
return result return result
@@ -306,24 +306,24 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
img_bytes = pix.tobytes("png") img_bytes = pix.tobytes("png")
img = Image.open(io.BytesIO(img_bytes)) img = Image.open(io.BytesIO(img_bytes))
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO) cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
# Option 2: Choose preprocessing strategy # Option 2: Choose preprocessing strategy
if USE_AGGRESSIVE_PREPROCESSING: if USE_AGGRESSIVE_PREPROCESSING:
processed_img = preprocess_image_aggressive(cropped_img) processed_img = preprocess_image_aggressive(cropped_img)
else: else:
processed_img = preprocess_image(cropped_img) processed_img = preprocess_image(cropped_img)
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG) text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
ocr_text_parts.append(text.strip()) ocr_text_parts.append(text.strip())
ocr_text = "\n".join(ocr_text_parts).strip() ocr_text = "\n".join(ocr_text_parts).strip()
# Option 3: Apply smart post-processing # Option 3: Apply smart post-processing
if USE_SMART_CLEANING: if USE_SMART_CLEANING:
ocr_text = clean_ocr_output(ocr_text) ocr_text = clean_ocr_output(ocr_text)
else: else:
ocr_text = filter_ocr_noise(ocr_text) ocr_text = filter_ocr_noise(ocr_text)
logger.info(f"Tesseract extracted {len(ocr_text)} chars") logger.info(f"Tesseract extracted {len(ocr_text)} chars")
return OcrResponse( return OcrResponse(
text=ocr_text, text=ocr_text,
@@ -6,21 +6,16 @@ PARAMETER temperature 0.1
PARAMETER top_p 0.1 PARAMETER top_p 0.1
PARAMETER repeat_penalty 1.1 PARAMETER repeat_penalty 1.1
SYSTEM """You are an expert in structuring Thai documents. SYSTEM """You are an expert in structuring Thai documents. Extract the information from the image in the most correct and organized format.
Extract the information from the image in the most correct and organized format.
Instructions: Instructions:
- Return ONLY clean Markdown output. - Return ONLY clean Markdown output.
- Include ALL information visible on the page. - Include ALL information visible on the page.
- Preserve document structure and hierarchy. - Preserve document structure and hierarchy.
- Do NOT add explanations or interpretations. - Do NOT add explanations or interpretations. Formatting Rules:
Formatting Rules:
- Tables: Render tables using <table>...</table> in clean HTML format. - Tables: Render tables using <table>...</table> in clean HTML format.
- Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$). - Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$).
- Images/Charts/Diagrams: Wrap any clearly defined visual areas in: - Images/Charts/Diagrams: Wrap any clearly defined visual areas in:
<figure> <figure> Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
</figure> </figure>
- Page Numbers: Wrap page numbers in <page_number>...</page_number>. - Page Numbers: Wrap page numbers in <page_number>...</page_number>.
- Checkboxes: Use ☐ for unchecked and ☑ for checked boxes. - Checkboxes: Use ☐ for unchecked and ☑ for checked boxes.