há 1 mês atrás · 4b7c2bbcc0
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -122,8 +122,21 @@ body:
 
				       #multiple: false
			
 
				       options:
			
 
				         -
			
 
				+        - "<2.2.0"
			
 
				         - "2.2.x"
			
 
				-        - "2.5.x"
			
 
				+        - ">=2.5"
			
 
				+    validations:
			
 
				+      required: true
			
 
				+
			
 
				+  - type: dropdown
			
 
				+    id: backend_name
			
 
				+    attributes:
			
 
				+      label: Backend name | 解析后端
			
 
				+      #multiple: false
			
 
				+      options:
			
 
				+        -
			
 
				+        - "vlm"
			
 
				+        - "pipeline"
			
 
				     validations:
			
 
				       required: true
			
 
				 
			
--- a/mineru/utils/table_merge.py
+++ b/mineru/utils/table_merge.py
@@ -145,8 +145,9 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
 
				                 colspan2 = int(cell2.get("colspan", 1))
			
 
				                 rowspan2 = int(cell2.get("rowspan", 1))
			
 
				 
			
 
				-                text1 = full_to_half(cell1.get_text().strip())
			
 
				-                text2 = full_to_half(cell2.get_text().strip())
			
 
				+                # 去除所有空白字符（包括空格、换行、制表符等）
			
 
				+                text1 = ''.join(full_to_half(cell1.get_text()).split())
			
 
				+                text2 = ''.join(full_to_half(cell2.get_text()).split())
			
 
				 
			
 
				                 if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
			
 
				                     structure_match = False
			
@@ -170,11 +171,12 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
 
				 def can_merge_tables(current_table_block, previous_table_block):
			
 
				     """判断两个表格是否可以合并"""
			
 
				     # 检查表格是否有caption和footnote
			
 
				-    # if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
			
 
				-    #     return False, None, None, None, None
			
 
				-    # current_table_block["blocks"]中有任何TABLE_CAPTION类型的块，且任意caption块内不以"（续）"结尾，则不合并
			
 
				-    if any(block["type"] == BlockType.TABLE_CAPTION and not full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in current_table_block["blocks"]):
			
 
				-        return False, None, None, None, None
			
 
				+    # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
			
 
				+    caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
			
 
				+    if caption_blocks:
			
 
				+        # 如果所有caption都不以"(续)"结尾,则不合并
			
 
				+        if not any(full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in caption_blocks):
			
 
				+            return False, None, None, None, None
			
 
				 
			
 
				     if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
			
 
				         return False, None, None, None, None
			
@@ -257,6 +259,59 @@ def check_rows_match(soup1, soup2):
 
				     return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
			
 
				 
			
 
				 
			
 
				+def check_row_columns_match(row1, row2):
			
 
				+    # 逐个cell检测colspan属性是否一致
			
 
				+    cells1 = row1.find_all(["td", "th"])
			
 
				+    cells2 = row2.find_all(["td", "th"])
			
 
				+    if len(cells1) != len(cells2):
			
 
				+        return False
			
 
				+    for cell1, cell2 in zip(cells1, cells2):
			
 
				+        colspan1 = int(cell1.get("colspan", 1))
			
 
				+        colspan2 = int(cell2.get("colspan", 1))
			
 
				+        if colspan1 != colspan2:
			
 
				+            return False
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def adjust_table_rows_colspan(rows, start_idx, end_idx,
			
 
				+                              reference_structure, reference_visual_cols,
			
 
				+                              target_cols, current_cols, reference_row):
			
 
				+    """调整表格行的colspan属性以匹配目标列数
			
 
				+
			
 
				+    Args:
			
 
				+        rows: 表格行列表
			
 
				+        start_idx: 起始行索引
			
 
				+        end_idx: 结束行索引（不包含）
			
 
				+        reference_structure: 参考行的colspan结构列表
			
 
				+        reference_visual_cols: 参考行的视觉列数
			
 
				+        target_cols: 目标总列数
			
 
				+        current_cols: 当前总列数
			
 
				+        reference_row: 参考行对象
			
 
				+    """
			
 
				+    for i in range(start_idx, end_idx):
			
 
				+        row = rows[i]
			
 
				+        cells = row.find_all(["td", "th"])
			
 
				+        if not cells:
			
 
				+            continue
			
 
				+
			
 
				+        current_row_cols = calculate_row_columns(row)
			
 
				+        if current_row_cols >= target_cols:
			
 
				+            continue
			
 
				+
			
 
				+        # 检查是否与参考行结构匹配
			
 
				+        if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row):
			
 
				+            # 尝试应用参考结构
			
 
				+            if len(cells) <= len(reference_structure):
			
 
				+                for j, cell in enumerate(cells):
			
 
				+                    if j < len(reference_structure) and reference_structure[j] > 1:
			
 
				+                        cell["colspan"] = str(reference_structure[j])
			
 
				+        else:
			
 
				+            # 扩展最后一个单元格以填补列数差异
			
 
				+            last_cell = cells[-1]
			
 
				+            current_last_span = int(last_cell.get("colspan", 1))
			
 
				+            last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
			
 
				+
			
 
				+
			
 
				 def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
			
 
				     """执行表格合并操作"""
			
 
				     # 检测表头有几行，并确认表头内容是否一致
			
@@ -273,69 +328,32 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
 
				 
			
 
				 
			
 
				     if rows1 and rows2 and header_count < len(rows2):
			
 
				-        # 获取表1最后一行
			
 
				+        # 获取表1最后一行和表2第一个非表头行
			
 
				         last_row1 = rows1[-1]
			
 
				-        # 获取表2第一个非表头行
			
 
				         first_data_row2 = rows2[header_count]
			
 
				 
			
 
				-        # 分析两行的colspan结构
			
 
				-        last_row1_structure = []
			
 
				-        has_colspan_last_row1 = False
			
 
				-        for cell in last_row1.find_all(["td", "th"]):
			
 
				-            colspan = int(cell.get("colspan", 1))
			
 
				-            last_row1_structure.append(colspan)
			
 
				-            if colspan > 1:
			
 
				-                has_colspan_last_row1 = True
			
 
				-
			
 
				-        first_row2_structure = []
			
 
				-        has_colspan_first_row2 = False
			
 
				-        for cell in first_data_row2.find_all(["td", "th"]):
			
 
				-            colspan = int(cell.get("colspan", 1))
			
 
				-            first_row2_structure.append(colspan)
			
 
				-            if colspan > 1:
			
 
				-                has_colspan_first_row2 = True
			
 
				-
			
 
				-        # 确定基准结构（优先使用有colspan的行）
			
 
				-        if has_colspan_last_row1:
			
 
				-            reference_structure = last_row1_structure
			
 
				+        # 计算表格总列数
			
 
				+        table_cols1 = calculate_table_total_columns(soup1)
			
 
				+        table_cols2 = calculate_table_total_columns(soup2)
			
 
				+        if table_cols1 >= table_cols2:
			
 
				+            reference_structure = [int(cell.get("colspan", 1)) for cell in last_row1.find_all(["td", "th"])]
			
 
				             reference_visual_cols = calculate_visual_columns(last_row1)
			
 
				-        elif has_colspan_first_row2:
			
 
				-            reference_structure = first_row2_structure
			
 
				+            # 以表1的最后一行为参考，调整表2的行
			
 
				+            adjust_table_rows_colspan(
			
 
				+                rows2, header_count, len(rows2),
			
 
				+                reference_structure, reference_visual_cols,
			
 
				+                table_cols1, table_cols2, first_data_row2
			
 
				+            )
			
 
				+
			
 
				+        else:  # table_cols2 > table_cols1
			
 
				+            reference_structure = [int(cell.get("colspan", 1)) for cell in first_data_row2.find_all(["td", "th"])]
			
 
				             reference_visual_cols = calculate_visual_columns(first_data_row2)
			
 
				-        else:
			
 
				-            # 都没有colspan时使用表1最后一行作为默认基准
			
 
				-            reference_structure = last_row1_structure
			
 
				-            reference_visual_cols = calculate_visual_columns(last_row1)
			
 
				-
			
 
				-        # 如果表1最后一行没有colspan但表2首行有，则调整表1相关行
			
 
				-        if not has_colspan_last_row1 and has_colspan_first_row2:
			
 
				-            # 找到表1中所有具有相同视觉列数的行
			
 
				-            rows_to_adjust = []
			
 
				-            for i in range(len(rows1) - 1, -1, -1):
			
 
				-                if calculate_visual_columns(rows1[i]) == reference_visual_cols:
			
 
				-                    rows_to_adjust.append(rows1[i])
			
 
				-                else:
			
 
				-                    break
			
 
				-
			
 
				-            # 应用参考结构到这些行
			
 
				-            for row in rows_to_adjust:
			
 
				-                cells = row.find_all(["td", "th"])
			
 
				-                if cells and len(cells) <= len(reference_structure):
			
 
				-                    for j, cell in enumerate(cells):
			
 
				-                        if j < len(reference_structure) and reference_structure[j] > 1:
			
 
				-                            cell["colspan"] = str(reference_structure[j])
			
 
				-
			
 
				-        # 如果表2首行没有colspan但表1最后一行有，则调整表2相关行
			
 
				-        elif has_colspan_last_row1 and not has_colspan_first_row2:
			
 
				-            # 调整表2中所有具有相同视觉列数的行
			
 
				-            for i in range(header_count, len(rows2)):
			
 
				-                row = rows2[i]
			
 
				-                if calculate_visual_columns(row) == reference_visual_cols:
			
 
				-                    cells = row.find_all(["td", "th"])
			
 
				-                    if cells and len(cells) <= len(reference_structure):
			
 
				-                        for j, cell in enumerate(cells):
			
 
				-                            if j < len(reference_structure) and reference_structure[j] > 1:
			
 
				-                                cell["colspan"] = str(reference_structure[j])
			
 
				+            # 以表2的第一个数据行为参考，调整表1的行
			
 
				+            adjust_table_rows_colspan(
			
 
				+                rows1, 0, len(rows1),
			
 
				+                reference_structure, reference_visual_cols,
			
 
				+                table_cols2, table_cols1, last_row1
			
 
				+            )
			
 
				 
			
 
				     # 将第二个表格的行添加到第一个表格中
			
 
				     if tbody1: