瀏覽代碼

Merge pull request #3748 from myhloli/dev

Enhance table merging logic to adjust colspan attributes based on row structures
Xiaomeng Zhao 1 個月之前
父節點
當前提交
4c8bb038ce
共有 2 個文件被更改,包括 83 次插入和 18 次删除
  1. 2 8
      mineru/cli/fast_api.py
  2. 81 10
      mineru/utils/table_merge.py

+ 2 - 8
mineru/cli/fast_api.py

@@ -177,10 +177,7 @@ async def parse_pdf(
                             zf.write(path, arcname=os.path.join(safe_pdf_name, f"{safe_pdf_name}_middle.json"))
 
                     if return_model_output:
-                        if backend.startswith("pipeline"):
-                            path = os.path.join(parse_dir, f"{pdf_name}_model.json")
-                        else:
-                            path = os.path.join(parse_dir, f"{pdf_name}_model_output.txt")
+                        path = os.path.join(parse_dir, f"{pdf_name}_model.json")
                         if os.path.exists(path): 
                             zf.write(path, arcname=os.path.join(safe_pdf_name, os.path.basename(path)))
 
@@ -220,10 +217,7 @@ async def parse_pdf(
                     if return_middle_json:
                         data["middle_json"] = get_infer_result("_middle.json", pdf_name, parse_dir)
                     if return_model_output:
-                        if backend.startswith("pipeline"):
-                            data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
-                        else:
-                            data["model_output"] = get_infer_result("_model_output.txt", pdf_name, parse_dir)
+                        data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
                     if return_content_list:
                         data["content_list"] = get_infer_result("_content_list.json", pdf_name, parse_dir)
                     if return_images:

+ 81 - 10
mineru/utils/table_merge.py

@@ -3,6 +3,7 @@
 from loguru import logger
 from bs4 import BeautifulSoup
 
+from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 from mineru.utils.enum_class import BlockType, SplitFlag
 
 
@@ -169,7 +170,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
 def can_merge_tables(current_table_block, previous_table_block):
     """判断两个表格是否可以合并"""
     # 检查表格是否有caption和footnote
-    if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
+    # if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
+    #     return False, None, None, None, None
+    # current_table_block["blocks"]中有任何TABLE_CAPTION类型的块,且任意caption块内不以"(续)"结尾,则不合并
+    if any(block["type"] == BlockType.TABLE_CAPTION and not full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in current_table_block["blocks"]):
         return False, None, None, None, None
 
     if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
@@ -263,17 +267,84 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
     # 找到第一个表格的tbody,如果没有则查找table元素
     tbody1 = soup1.find("tbody") or soup1.find("table")
 
-    # 找到第二个表格的tbody,如果没有则查找table元素
-    tbody2 = soup2.find("tbody") or soup2.find("table")
+    # 获取表1和表2的所有行
+    rows1 = soup1.find_all("tr")
+    rows2 = soup2.find_all("tr")
+
+
+    if rows1 and rows2 and header_count < len(rows2):
+        # 获取表1最后一行
+        last_row1 = rows1[-1]
+        # 获取表2第一个非表头行
+        first_data_row2 = rows2[header_count]
+
+        # 分析两行的colspan结构
+        last_row1_structure = []
+        has_colspan_last_row1 = False
+        for cell in last_row1.find_all(["td", "th"]):
+            colspan = int(cell.get("colspan", 1))
+            last_row1_structure.append(colspan)
+            if colspan > 1:
+                has_colspan_last_row1 = True
+
+        first_row2_structure = []
+        has_colspan_first_row2 = False
+        for cell in first_data_row2.find_all(["td", "th"]):
+            colspan = int(cell.get("colspan", 1))
+            first_row2_structure.append(colspan)
+            if colspan > 1:
+                has_colspan_first_row2 = True
+
+        # 确定基准结构(优先使用有colspan的行)
+        if has_colspan_last_row1:
+            reference_structure = last_row1_structure
+            reference_visual_cols = calculate_visual_columns(last_row1)
+        elif has_colspan_first_row2:
+            reference_structure = first_row2_structure
+            reference_visual_cols = calculate_visual_columns(first_data_row2)
+        else:
+            # 都没有colspan时使用表1最后一行作为默认基准
+            reference_structure = last_row1_structure
+            reference_visual_cols = calculate_visual_columns(last_row1)
+
+        # 如果表1最后一行没有colspan但表2首行有,则调整表1相关行
+        if not has_colspan_last_row1 and has_colspan_first_row2:
+            # 找到表1中所有具有相同视觉列数的行
+            rows_to_adjust = []
+            for i in range(len(rows1) - 1, -1, -1):
+                if calculate_visual_columns(rows1[i]) == reference_visual_cols:
+                    rows_to_adjust.append(rows1[i])
+                else:
+                    break
+
+            # 应用参考结构到这些行
+            for row in rows_to_adjust:
+                cells = row.find_all(["td", "th"])
+                if cells and len(cells) <= len(reference_structure):
+                    for j, cell in enumerate(cells):
+                        if j < len(reference_structure) and reference_structure[j] > 1:
+                            cell["colspan"] = str(reference_structure[j])
+
+        # 如果表2首行没有colspan但表1最后一行有,则调整表2相关行
+        elif has_colspan_last_row1 and not has_colspan_first_row2:
+            # 调整表2中所有具有相同视觉列数的行
+            for i in range(header_count, len(rows2)):
+                row = rows2[i]
+                if calculate_visual_columns(row) == reference_visual_cols:
+                    cells = row.find_all(["td", "th"])
+                    if cells and len(cells) <= len(reference_structure):
+                        for j, cell in enumerate(cells):
+                            if j < len(reference_structure) and reference_structure[j] > 1:
+                                cell["colspan"] = str(reference_structure[j])
 
     # 将第二个表格的行添加到第一个表格中
-    if tbody1 and tbody2:
-        rows2 = soup2.find_all("tr")
-        # 将第二个表格的行添加到第一个表格中(跳过表头行)
-        for row in rows2[header_count:]:
-            # 从原来的位置移除行,并添加到第一个表格中
-            row.extract()
-            tbody1.append(row)
+    if tbody1:
+        tbody2 = soup2.find("tbody") or soup2.find("table")
+        if tbody2:
+            # 将第二个表格的行添加到第一个表格中(跳过表头行)
+            for row in rows2[header_count:]:
+                row.extract()
+                tbody1.append(row)
 
     # 添加待合并表格的footnote到前一个表格中
     for table_footnote in wait_merge_table_footnotes: