690605:0922 ADR-034-134 #10.2 [skip CI]
This commit is contained in:
@@ -147,7 +147,7 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
|
|||||||
"""
|
"""
|
||||||
img_array = np.array(pil_image)
|
img_array = np.array(pil_image)
|
||||||
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
||||||
|
|
||||||
# 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines)
|
# 1. Deskew ถ้าหน้าเอียง (detect angle จาก Canny edges + Hough lines)
|
||||||
try:
|
try:
|
||||||
edges = cv2.Canny(gray, 100, 200)
|
edges = cv2.Canny(gray, 100, 200)
|
||||||
@@ -162,19 +162,19 @@ def preprocess_image_aggressive(pil_image: Image.Image) -> Image.Image:
|
|||||||
logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
|
logger.info(f"[PREPROCESS] Deskewed {angle:.1f}°")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"[PREPROCESS] Deskew failed: {e}")
|
logger.warning(f"[PREPROCESS] Deskew failed: {e}")
|
||||||
|
|
||||||
# 2. Denoise — median blur + bilateral filter
|
# 2. Denoise — median blur + bilateral filter
|
||||||
denoised = cv2.medianBlur(gray, 3)
|
denoised = cv2.medianBlur(gray, 3)
|
||||||
denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
|
denoised = cv2.bilateralFilter(denoised, 9, 75, 75)
|
||||||
|
|
||||||
# 3. Otsu threshold (adaptive, ไม่ fixed value)
|
# 3. Otsu threshold (adaptive, ไม่ fixed value)
|
||||||
_, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
_, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||||
|
|
||||||
# 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts)
|
# 4. Morphological operations — ลบ line noise ขนาดเล็ก (ต้าน speckle artifacts)
|
||||||
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
|
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
|
||||||
morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) # ลบ small white noise
|
morph = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) # ลบ small white noise
|
||||||
morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel) # ลบ small black hole
|
morph = cv2.morphologyEx(morph, cv2.MORPH_CLOSE, kernel) # ลบ small black hole
|
||||||
|
|
||||||
logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
|
logger.info(f"[PREPROCESS] Aggressive: Otsu threshold + morphology applied")
|
||||||
return Image.fromarray(morph)
|
return Image.fromarray(morph)
|
||||||
|
|
||||||
@@ -188,34 +188,34 @@ def clean_ocr_output(text: str) -> str:
|
|||||||
"""
|
"""
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
cleaned = []
|
cleaned = []
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร
|
# ✗ ลบ line ที่เป็นแค่สัญลักษณ์/punctuation เดี่ยวๆ ไม่มีตัวอักษร
|
||||||
alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line)
|
alphanumeric_part = re.sub(r'[^\w\u0E00-\u0E7F]', '', line)
|
||||||
if len(alphanumeric_part) < 2:
|
if len(alphanumeric_part) < 2:
|
||||||
logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
|
logger.debug(f"[CLEAN] Reject (no alphanum): {line[:50]}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>")
|
# ✗ ลบ line ที่เป็น repeated pattern — ถ้า unique char ≤ 20% (e.g., "-----", ">>>>>>>")
|
||||||
unique_chars = len(set(line))
|
unique_chars = len(set(line))
|
||||||
if unique_chars < max(2, len(line) // 5):
|
if unique_chars < max(2, len(line) // 5):
|
||||||
logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
|
logger.debug(f"[CLEAN] Reject (repeated pattern): {line[:50]}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric)
|
# ✗ ลบ line ที่เป็นสัญลักษณ์แปลก (< 20% Thai/English alphanumeric)
|
||||||
thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
|
thai_chars = sum(1 for c in line if '\u0E00' <= c <= '\u0E7F')
|
||||||
eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
|
eng_chars = sum(1 for c in line if c.isascii() and c.isalnum())
|
||||||
if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2:
|
if len(line) > 0 and (thai_chars + eng_chars) / len(line) < 0.2:
|
||||||
logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
|
logger.debug(f"[CLEAN] Reject (low language content): {line[:50]}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ✓ ปล่อยผ่าน
|
# ✓ ปล่อยผ่าน
|
||||||
cleaned.append(line)
|
cleaned.append(line)
|
||||||
|
|
||||||
result = "\n".join(cleaned)
|
result = "\n".join(cleaned)
|
||||||
logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
|
logger.info(f"[CLEAN] Input {len(lines)} lines → {len(cleaned)} lines")
|
||||||
return result
|
return result
|
||||||
@@ -306,24 +306,24 @@ def _process_pdf_doc(doc: fitz.Document, selected_engine: str, max_pages: int, t
|
|||||||
img_bytes = pix.tobytes("png")
|
img_bytes = pix.tobytes("png")
|
||||||
img = Image.open(io.BytesIO(img_bytes))
|
img = Image.open(io.BytesIO(img_bytes))
|
||||||
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
cropped_img = crop_header_footer(img, CROP_TOP_RATIO, CROP_BOTTOM_RATIO)
|
||||||
|
|
||||||
# Option 2: Choose preprocessing strategy
|
# Option 2: Choose preprocessing strategy
|
||||||
if USE_AGGRESSIVE_PREPROCESSING:
|
if USE_AGGRESSIVE_PREPROCESSING:
|
||||||
processed_img = preprocess_image_aggressive(cropped_img)
|
processed_img = preprocess_image_aggressive(cropped_img)
|
||||||
else:
|
else:
|
||||||
processed_img = preprocess_image(cropped_img)
|
processed_img = preprocess_image(cropped_img)
|
||||||
|
|
||||||
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
|
text = pytesseract.image_to_string(processed_img, lang=OCR_LANG, config=TESSERACT_CONFIG)
|
||||||
ocr_text_parts.append(text.strip())
|
ocr_text_parts.append(text.strip())
|
||||||
|
|
||||||
ocr_text = "\n".join(ocr_text_parts).strip()
|
ocr_text = "\n".join(ocr_text_parts).strip()
|
||||||
|
|
||||||
# Option 3: Apply smart post-processing
|
# Option 3: Apply smart post-processing
|
||||||
if USE_SMART_CLEANING:
|
if USE_SMART_CLEANING:
|
||||||
ocr_text = clean_ocr_output(ocr_text)
|
ocr_text = clean_ocr_output(ocr_text)
|
||||||
else:
|
else:
|
||||||
ocr_text = filter_ocr_noise(ocr_text)
|
ocr_text = filter_ocr_noise(ocr_text)
|
||||||
|
|
||||||
logger.info(f"Tesseract extracted {len(ocr_text)} chars")
|
logger.info(f"Tesseract extracted {len(ocr_text)} chars")
|
||||||
return OcrResponse(
|
return OcrResponse(
|
||||||
text=ocr_text,
|
text=ocr_text,
|
||||||
|
|||||||
+3
-8
@@ -6,21 +6,16 @@ PARAMETER temperature 0.1
|
|||||||
PARAMETER top_p 0.1
|
PARAMETER top_p 0.1
|
||||||
PARAMETER repeat_penalty 1.1
|
PARAMETER repeat_penalty 1.1
|
||||||
|
|
||||||
SYSTEM """You are an expert in structuring Thai documents.
|
SYSTEM """You are an expert in structuring Thai documents. Extract the information from the image in the most correct and organized format.
|
||||||
Extract the information from the image in the most correct and organized format.
|
|
||||||
|
|
||||||
Instructions:
|
Instructions:
|
||||||
- Return ONLY clean Markdown output.
|
- Return ONLY clean Markdown output.
|
||||||
- Include ALL information visible on the page.
|
- Include ALL information visible on the page.
|
||||||
- Preserve document structure and hierarchy.
|
- Preserve document structure and hierarchy.
|
||||||
- Do NOT add explanations or interpretations.
|
- Do NOT add explanations or interpretations. Formatting Rules:
|
||||||
|
|
||||||
Formatting Rules:
|
|
||||||
- Tables: Render tables using <table>...</table> in clean HTML format.
|
- Tables: Render tables using <table>...</table> in clean HTML format.
|
||||||
- Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$).
|
- Equations: Render equations using LaTeX syntax with inline ($...$) and block ($$...$$).
|
||||||
- Images/Charts/Diagrams: Wrap any clearly defined visual areas in:
|
- Images/Charts/Diagrams: Wrap any clearly defined visual areas in:
|
||||||
<figure>
|
<figure> Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
|
||||||
Describe the image's main elements, note contextual clues, mention visible text and meaning. Describe in Thai.
|
|
||||||
</figure>
|
</figure>
|
||||||
- Page Numbers: Wrap page numbers in <page_number>...</page_number>.
|
- Page Numbers: Wrap page numbers in <page_number>...</page_number>.
|
||||||
- Checkboxes: Use ☐ for unchecked and ☑ for checked boxes.
|
- Checkboxes: Use ☐ for unchecked and ☑ for checked boxes.
|
||||||
|
|||||||
Reference in New Issue
Block a user