690605:0922 ADR-034-134 #10.2 [skip CI]

2026-06-05 09:22:41 +07:00
parent 8b6ef392f5
commit 2db4810dfc
2 changed files with 18 additions and 23 deletions
@@ -147,7 +147,7 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
    """
    img_array = np.array(pil_image)
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-    
+
    # 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines)
    try:
        edges = cv2.Canny(gray, 100, 200)
@@ -162,19 +162,19 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
                logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
    except Exception as e:
        logger.warning(f"[PREPROCESS] Deskew failed: {e}")
-    
+
    # 2. Denoise — median blur + bilateral filter
    denoised = cv2.medianBlur(gray, 3)
    denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
-    
+
    # 3. Otsu threshold (adaptive, ไม่ fixed value)
    _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-    
+
    # 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
    morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)  # ลบ small white noise
    morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel)  # ลบ small black hole
-    
+
    logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
    return Image.fromarray(morph)

@@ -188,34 +188,34 @@ def clean_ocr_output(text: str) -> str:
    """
    lines = text.split("\n")
    cleaned = []
-    
+
    for line in lines:
        line = line.strip()
        if not line:
            continue
-        
+
        # ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร
        alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line)
        if len(alphanumeric_part) < 2:
            logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
            continue
-        
+
        # ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>")
        unique_chars = len(set(line))
        if unique_chars < max(2, len(line) // 5):
            logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
            continue
-        
+
        # ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric)
        thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
        eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
        if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2:
            logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
            continue
-        
+
        # ✓ ปล่อยผ่าน
        cleaned.append(line)
-    
+
    result = "\n".join(cleaned)
    logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
    return result
@@ -306,24 +306,24 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
        img_bytes = pix.tobytes("png")
        img = Image.open(io.BytesIO(img_bytes))
        cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
-        
+
        # Option 2: Choose preprocessing strategy
        if USE_AGGRESSIVE_PREPROCESSING:
            processed_img = preprocess_image_aggressive(cropped_img)
        else:
            processed_img = preprocess_image(cropped_img)
-        
+
        text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
        ocr_text_parts.append(text.strip())

    ocr_text = "\n".join(ocr_text_parts).strip()
-    
+
    # Option 3: Apply smart post-processing
    if USE_SMART_CLEANING:
        ocr_text = clean_ocr_output(ocr_text)
    else:
        ocr_text = filter_ocr_noise(ocr_text)
-    
+
    logger.info(f"Tesseract extracted {len(ocr_text)} chars")
    return OcrResponse(
        text=ocr_text,
@@ -6,21 +6,16 @@ PARAMETER temperature 0.1
 PARAMETER top_p 0.1
 PARAMETER repeat_penalty 1.1

-SYSTEM """You are an expert in structuring Thai documents.
-Extract the information from the image in the most correct and organized format.
-
+SYSTEM """You are an expert in structuring Thai documents. Extract the information from the image in the most correct and organized format.
 Instructions:
 - Return ONLY clean Markdown output.
 - Include ALL information visible on the page.
 - Preserve document structure and hierarchy.
- Do NOT add explanations or interpretations.
-
-Formatting Rules:
+- Do NOT add explanations or interpretations. Formatting Rules:
 - Tables: Render tables using <table>...</table> in clean HTML format.
 - Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$).
 - Images/Charts/Diagrams: Wrap any clearly defined visual areas in:
-<figure>
-Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
+<figure> Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
 </figure>
 - Page Numbers: Wrap page numbers in <page_number>...</page_number>.
 - Checkboxes: Use ☐ for unchecked and ☑ for checked boxes.