Просмотр исходного кода

fix: add HTML parsing for wireless and wired table results to improve cell analysis

myhloli 2 месяца назад
Родитель
Коммит
254a0a483b
2 изменённых файла: 27 добавлений и 5 удалений
  1. 26 5
      mineru/model/table/rec/unet_table/main.py
  2. 1 0
      pyproject.toml

+ 26 - 5
mineru/model/table/rec/unet_table/main.py

@@ -10,6 +10,7 @@ import numpy as np
 import cv2
 from PIL import Image
 from loguru import logger
+from bs4 import BeautifulSoup
 
 from .table_structure_unet import TSRUnet
 
@@ -283,20 +284,40 @@ class UnetTableModel:
             # )
 
             wired_html_code = wired_table_results.pred_html
-
             wired_len = count_table_cells_physical(wired_html_code)
             wireless_len = count_table_cells_physical(wireless_html_code)
-
-            # logger.debug(f"wired table cell bboxes: {wired_len}, wireless table cell bboxes: {wireless_len}")
             # 计算两种模型检测的单元格数量差异
             gap_of_len = wireless_len - wired_len
+            # logger.debug(f"wired table cell bboxes: {wired_len}, wireless table cell bboxes: {wireless_len}")
+
+            # 使用OCR结果计算两种模型填入的文字数量
+            wireless_text_count = 0
+            wired_text_count = 0
+            for ocr_res in ocr_result:
+                if ocr_res[1] in wireless_html_code:
+                    wireless_text_count += 1
+                if ocr_res[1] in wired_html_code:
+                    wired_text_count += 1
+            # logger.debug(f"wireless table ocr text count: {wireless_text_count}, wired table ocr text count: {wired_text_count}")
+
+            # 使用HTML解析器计算空单元格数量
+            wireless_soup = BeautifulSoup(wireless_html_code, 'html.parser') if wireless_html_code else BeautifulSoup("", 'html.parser')
+            wired_soup = BeautifulSoup(wired_html_code, 'html.parser') if wired_html_code else BeautifulSoup("", 'html.parser')
+            # 计算空单元格数量(没有文本内容或只有空白字符)
+            wireless_blank_count = sum(1 for cell in wireless_soup.find_all(['td', 'th']) if not cell.text.strip())
+            wired_blank_count = sum(1 for cell in wired_soup.find_all(['td', 'th']) if not cell.text.strip())
+            # logger.debug(f"wireless table blank cell count: {wireless_blank_count}, wired table blank cell count: {wired_blank_count}")
+
             # 判断是否使用无线表格模型的结果
             if (
-                int(wireless_len * 0.1) <= wired_len <= int(wireless_len * 0.62)+1  # 有线模型检测到的单元格数太少(低于无线模型的55%)
+                int(wireless_len * 0.04) <= wired_len <= int(wireless_len * 0.62)+1  # 有线模型检测到的单元格数太少(低于无线模型的55%)
                 or (0 <= gap_of_len <= 5 and wired_len <= round(wireless_len * 0.75))  # 两者相差不大但有线模型结果较少
                 or (gap_of_len == 0 and wired_len <= 4)  # 单元格数量完全相等且总量小于等于4
+                or wired_text_count <= wireless_text_count * 0.6  # 有线模型填入的文字明显少于无线模型
+            )and(
+                wireless_blank_count <= (wired_blank_count+1) * 2  # 无线模型的空单元格数量不多于有线模型的2倍
             ):
-                # logger.debug("fall back to wireless table model")
+                logger.debug("fall back to wireless table model")
                 html_code = wireless_html_code
             else:
                 html_code = wired_html_code

+ 1 - 0
pyproject.toml

@@ -37,6 +37,7 @@ dependencies = [
     "fast-langdetect>=0.2.3,<0.3.0",
     "scikit-image>=0.25.0,<1.0.0",
     "openai>=1.70.0,<2",
+    "beautifulsoup4>=4.13.5,<5",
 ]
 
 [project.optional-dependencies]