|
|
@@ -3,6 +3,7 @@
|
|
|
from loguru import logger
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
+from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
|
|
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
|
|
|
|
|
|
|
@@ -169,7 +170,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
|
|
|
def can_merge_tables(current_table_block, previous_table_block):
|
|
|
"""判断两个表格是否可以合并"""
|
|
|
# 检查表格是否有caption和footnote
|
|
|
- if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
|
|
|
+ # if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
|
|
|
+ # return False, None, None, None, None
|
|
|
+ # current_table_block["blocks"]中有任何TABLE_CAPTION类型的块,且任意caption块内不以"(续)"结尾,则不合并
|
|
|
+ if any(block["type"] == BlockType.TABLE_CAPTION and not full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in current_table_block["blocks"]):
|
|
|
return False, None, None, None, None
|
|
|
|
|
|
if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
|
|
|
@@ -263,17 +267,84 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
|
|
|
# 找到第一个表格的tbody,如果没有则查找table元素
|
|
|
tbody1 = soup1.find("tbody") or soup1.find("table")
|
|
|
|
|
|
- # 找到第二个表格的tbody,如果没有则查找table元素
|
|
|
- tbody2 = soup2.find("tbody") or soup2.find("table")
|
|
|
+ # 获取表1和表2的所有行
|
|
|
+ rows1 = soup1.find_all("tr")
|
|
|
+ rows2 = soup2.find_all("tr")
|
|
|
+
|
|
|
+
|
|
|
+ if rows1 and rows2 and header_count < len(rows2):
|
|
|
+ # 获取表1最后一行
|
|
|
+ last_row1 = rows1[-1]
|
|
|
+ # 获取表2第一个非表头行
|
|
|
+ first_data_row2 = rows2[header_count]
|
|
|
+
|
|
|
+ # 分析两行的colspan结构
|
|
|
+ last_row1_structure = []
|
|
|
+ has_colspan_last_row1 = False
|
|
|
+ for cell in last_row1.find_all(["td", "th"]):
|
|
|
+ colspan = int(cell.get("colspan", 1))
|
|
|
+ last_row1_structure.append(colspan)
|
|
|
+ if colspan > 1:
|
|
|
+ has_colspan_last_row1 = True
|
|
|
+
|
|
|
+ first_row2_structure = []
|
|
|
+ has_colspan_first_row2 = False
|
|
|
+ for cell in first_data_row2.find_all(["td", "th"]):
|
|
|
+ colspan = int(cell.get("colspan", 1))
|
|
|
+ first_row2_structure.append(colspan)
|
|
|
+ if colspan > 1:
|
|
|
+ has_colspan_first_row2 = True
|
|
|
+
|
|
|
+ # 确定基准结构(优先使用有colspan的行)
|
|
|
+ if has_colspan_last_row1:
|
|
|
+ reference_structure = last_row1_structure
|
|
|
+ reference_visual_cols = calculate_visual_columns(last_row1)
|
|
|
+ elif has_colspan_first_row2:
|
|
|
+ reference_structure = first_row2_structure
|
|
|
+ reference_visual_cols = calculate_visual_columns(first_data_row2)
|
|
|
+ else:
|
|
|
+ # 都没有colspan时使用表1最后一行作为默认基准
|
|
|
+ reference_structure = last_row1_structure
|
|
|
+ reference_visual_cols = calculate_visual_columns(last_row1)
|
|
|
+
|
|
|
+ # 如果表1最后一行没有colspan但表2首行有,则调整表1相关行
|
|
|
+ if not has_colspan_last_row1 and has_colspan_first_row2:
|
|
|
+ # 找到表1中所有具有相同视觉列数的行
|
|
|
+ rows_to_adjust = []
|
|
|
+ for i in range(len(rows1) - 1, -1, -1):
|
|
|
+ if calculate_visual_columns(rows1[i]) == reference_visual_cols:
|
|
|
+ rows_to_adjust.append(rows1[i])
|
|
|
+ else:
|
|
|
+ break
|
|
|
+
|
|
|
+ # 应用参考结构到这些行
|
|
|
+ for row in rows_to_adjust:
|
|
|
+ cells = row.find_all(["td", "th"])
|
|
|
+ if cells and len(cells) <= len(reference_structure):
|
|
|
+ for j, cell in enumerate(cells):
|
|
|
+ if j < len(reference_structure) and reference_structure[j] > 1:
|
|
|
+ cell["colspan"] = str(reference_structure[j])
|
|
|
+
|
|
|
+ # 如果表2首行没有colspan但表1最后一行有,则调整表2相关行
|
|
|
+ elif has_colspan_last_row1 and not has_colspan_first_row2:
|
|
|
+ # 调整表2中所有具有相同视觉列数的行
|
|
|
+ for i in range(header_count, len(rows2)):
|
|
|
+ row = rows2[i]
|
|
|
+ if calculate_visual_columns(row) == reference_visual_cols:
|
|
|
+ cells = row.find_all(["td", "th"])
|
|
|
+ if cells and len(cells) <= len(reference_structure):
|
|
|
+ for j, cell in enumerate(cells):
|
|
|
+ if j < len(reference_structure) and reference_structure[j] > 1:
|
|
|
+ cell["colspan"] = str(reference_structure[j])
|
|
|
|
|
|
# 将第二个表格的行添加到第一个表格中
|
|
|
- if tbody1 and tbody2:
|
|
|
- rows2 = soup2.find_all("tr")
|
|
|
- # 将第二个表格的行添加到第一个表格中(跳过表头行)
|
|
|
- for row in rows2[header_count:]:
|
|
|
- # 从原来的位置移除行,并添加到第一个表格中
|
|
|
- row.extract()
|
|
|
- tbody1.append(row)
|
|
|
+ if tbody1:
|
|
|
+ tbody2 = soup2.find("tbody") or soup2.find("table")
|
|
|
+ if tbody2:
|
|
|
+ # 将第二个表格的行添加到第一个表格中(跳过表头行)
|
|
|
+ for row in rows2[header_count:]:
|
|
|
+ row.extract()
|
|
|
+ tbody1.append(row)
|
|
|
|
|
|
# 添加待合并表格的footnote到前一个表格中
|
|
|
for table_footnote in wait_merge_table_footnotes:
|