table_merge.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from loguru import logger
  3. from bs4 import BeautifulSoup
  4. from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
  5. from mineru.utils.enum_class import BlockType, SplitFlag
  6. def full_to_half(text: str) -> str:
  7. """Convert full-width characters to half-width characters using code point manipulation.
  8. Args:
  9. text: String containing full-width characters
  10. Returns:
  11. String with full-width characters converted to half-width
  12. """
  13. result = []
  14. for char in text:
  15. code = ord(char)
  16. # Full-width letters, numbers and punctuation (FF01-FF5E)
  17. if 0xFF01 <= code <= 0xFF5E:
  18. result.append(chr(code - 0xFEE0)) # Shift to ASCII range
  19. else:
  20. result.append(char)
  21. return ''.join(result)
  22. def calculate_table_total_columns(soup):
  23. """计算表格的总列数,通过分析整个表格结构来处理rowspan和colspan
  24. Args:
  25. soup: BeautifulSoup解析的表格
  26. Returns:
  27. int: 表格的总列数
  28. """
  29. rows = soup.find_all("tr")
  30. if not rows:
  31. return 0
  32. # 创建一个矩阵来跟踪每个位置的占用情况
  33. max_cols = 0
  34. occupied = {} # {row_idx: {col_idx: True}}
  35. for row_idx, row in enumerate(rows):
  36. col_idx = 0
  37. cells = row.find_all(["td", "th"])
  38. if row_idx not in occupied:
  39. occupied[row_idx] = {}
  40. for cell in cells:
  41. # 找到下一个未被占用的列位置
  42. while col_idx in occupied[row_idx]:
  43. col_idx += 1
  44. colspan = int(cell.get("colspan", 1))
  45. rowspan = int(cell.get("rowspan", 1))
  46. # 标记被这个单元格占用的所有位置
  47. for r in range(row_idx, row_idx + rowspan):
  48. if r not in occupied:
  49. occupied[r] = {}
  50. for c in range(col_idx, col_idx + colspan):
  51. occupied[r][c] = True
  52. col_idx += colspan
  53. max_cols = max(max_cols, col_idx)
  54. return max_cols
  55. def calculate_row_columns(row):
  56. """
  57. 计算表格行的实际列数,考虑colspan属性
  58. Args:
  59. row: BeautifulSoup的tr元素对象
  60. Returns:
  61. int: 行的实际列数
  62. """
  63. cells = row.find_all(["td", "th"])
  64. column_count = 0
  65. for cell in cells:
  66. colspan = int(cell.get("colspan", 1))
  67. column_count += colspan
  68. return column_count
  69. def calculate_visual_columns(row):
  70. """
  71. 计算表格行的视觉列数(实际td/th单元格数量,不考虑colspan)
  72. Args:
  73. row: BeautifulSoup的tr元素对象
  74. Returns:
  75. int: 行的视觉列数(实际单元格数)
  76. """
  77. cells = row.find_all(["td", "th"])
  78. return len(cells)
  79. def detect_table_headers(soup1, soup2, max_header_rows=5):
  80. """
  81. 检测并比较两个表格的表头
  82. Args:
  83. soup1: 第一个表格的BeautifulSoup对象
  84. soup2: 第二个表格的BeautifulSoup对象
  85. max_header_rows: 最大可能的表头行数
  86. Returns:
  87. tuple: (表头行数, 表头是否一致, 表头文本列表)
  88. """
  89. rows1 = soup1.find_all("tr")
  90. rows2 = soup2.find_all("tr")
  91. min_rows = min(len(rows1), len(rows2), max_header_rows)
  92. header_rows = 0
  93. headers_match = True
  94. header_texts = []
  95. for i in range(min_rows):
  96. # 提取当前行的所有单元格
  97. cells1 = rows1[i].find_all(["td", "th"])
  98. cells2 = rows2[i].find_all(["td", "th"])
  99. # 检查两行的结构和内容是否一致
  100. structure_match = True
  101. # 首先检查单元格数量
  102. if len(cells1) != len(cells2):
  103. structure_match = False
  104. else:
  105. # 然后检查单元格的属性和内容
  106. for cell1, cell2 in zip(cells1, cells2):
  107. colspan1 = int(cell1.get("colspan", 1))
  108. rowspan1 = int(cell1.get("rowspan", 1))
  109. colspan2 = int(cell2.get("colspan", 1))
  110. rowspan2 = int(cell2.get("rowspan", 1))
  111. # 去除所有空白字符(包括空格、换行、制表符等)
  112. text1 = ''.join(full_to_half(cell1.get_text()).split())
  113. text2 = ''.join(full_to_half(cell2.get_text()).split())
  114. if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
  115. structure_match = False
  116. break
  117. if structure_match:
  118. header_rows += 1
  119. row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
  120. header_texts.append(row_texts) # 添加表头文本
  121. else:
  122. headers_match = header_rows > 0 # 只有当至少匹配了一行时,才认为表头匹配
  123. break
  124. # 如果没有找到匹配的表头行,则返回失败
  125. if header_rows == 0:
  126. headers_match = False
  127. return header_rows, headers_match, header_texts
  128. def can_merge_tables(current_table_block, previous_table_block):
  129. """判断两个表格是否可以合并"""
  130. # 检查表格是否有caption和footnote
  131. # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
  132. caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
  133. if caption_blocks:
  134. # 如果所有caption都不以"(续)"结尾,则不合并
  135. if not any(full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in caption_blocks):
  136. return False, None, None, None, None
  137. if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
  138. return False, None, None, None, None
  139. # 获取两个表格的HTML内容
  140. current_html = ""
  141. previous_html = ""
  142. for block in current_table_block["blocks"]:
  143. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  144. current_html = block["lines"][0]["spans"][0].get("html", "")
  145. for block in previous_table_block["blocks"]:
  146. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  147. previous_html = block["lines"][0]["spans"][0].get("html", "")
  148. if not current_html or not previous_html:
  149. return False, None, None, None, None
  150. # 检查表格宽度差异
  151. x0_t1, y0_t1, x1_t1, y1_t1 = current_table_block["bbox"]
  152. x0_t2, y0_t2, x1_t2, y1_t2 = previous_table_block["bbox"]
  153. table1_width = x1_t1 - x0_t1
  154. table2_width = x1_t2 - x0_t2
  155. if abs(table1_width - table2_width) / min(table1_width, table2_width) >= 0.1:
  156. return False, None, None, None, None
  157. # 解析HTML并检查表格结构
  158. soup1 = BeautifulSoup(previous_html, "html.parser")
  159. soup2 = BeautifulSoup(current_html, "html.parser")
  160. # 检查整体列数匹配
  161. table_cols1 = calculate_table_total_columns(soup1)
  162. table_cols2 = calculate_table_total_columns(soup2)
  163. # logger.debug(f"Table columns - Previous: {table_cols1}, Current: {table_cols2}")
  164. tables_match = table_cols1 == table_cols2
  165. # 检查首末行列数匹配
  166. rows_match = check_rows_match(soup1, soup2)
  167. return (tables_match or rows_match), soup1, soup2, current_html, previous_html
  168. def check_rows_match(soup1, soup2):
  169. """检查表格行是否匹配"""
  170. rows1 = soup1.find_all("tr")
  171. rows2 = soup2.find_all("tr")
  172. if not (rows1 and rows2):
  173. return False
  174. # 获取第一个表的最后一行数据行
  175. last_row = None
  176. for row in reversed(rows1):
  177. if row.find_all(["td", "th"]):
  178. last_row = row
  179. break
  180. # 检测表头行数,以便获取第二个表的首个数据行
  181. header_count, _, _ = detect_table_headers(soup1, soup2)
  182. # 获取第二个表的首个数据行
  183. first_data_row = None
  184. if len(rows2) > header_count:
  185. first_data_row = rows2[header_count] # 第一个非表头行
  186. if not (last_row and first_data_row):
  187. return False
  188. # 计算实际列数(考虑colspan)和视觉列数
  189. last_row_cols = calculate_row_columns(last_row)
  190. first_row_cols = calculate_row_columns(first_data_row)
  191. last_row_visual_cols = calculate_visual_columns(last_row)
  192. first_row_visual_cols = calculate_visual_columns(first_data_row)
  193. # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(视觉列数:{first_row_visual_cols})")
  194. # 同时考虑实际列数匹配和视觉列数匹配
  195. return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
  196. def check_row_columns_match(row1, row2):
  197. # 逐个cell检测colspan属性是否一致
  198. cells1 = row1.find_all(["td", "th"])
  199. cells2 = row2.find_all(["td", "th"])
  200. if len(cells1) != len(cells2):
  201. return False
  202. for cell1, cell2 in zip(cells1, cells2):
  203. colspan1 = int(cell1.get("colspan", 1))
  204. colspan2 = int(cell2.get("colspan", 1))
  205. if colspan1 != colspan2:
  206. return False
  207. return True
  208. def adjust_table_rows_colspan(rows, start_idx, end_idx,
  209. reference_structure, reference_visual_cols,
  210. target_cols, current_cols, reference_row):
  211. """调整表格行的colspan属性以匹配目标列数
  212. Args:
  213. rows: 表格行列表
  214. start_idx: 起始行索引
  215. end_idx: 结束行索引(不包含)
  216. reference_structure: 参考行的colspan结构列表
  217. reference_visual_cols: 参考行的视觉列数
  218. target_cols: 目标总列数
  219. current_cols: 当前总列数
  220. reference_row: 参考行对象
  221. """
  222. for i in range(start_idx, end_idx):
  223. row = rows[i]
  224. cells = row.find_all(["td", "th"])
  225. if not cells:
  226. continue
  227. current_row_cols = calculate_row_columns(row)
  228. if current_row_cols >= target_cols:
  229. continue
  230. # 检查是否与参考行结构匹配
  231. if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row):
  232. # 尝试应用参考结构
  233. if len(cells) <= len(reference_structure):
  234. for j, cell in enumerate(cells):
  235. if j < len(reference_structure) and reference_structure[j] > 1:
  236. cell["colspan"] = str(reference_structure[j])
  237. else:
  238. # 扩展最后一个单元格以填补列数差异
  239. last_cell = cells[-1]
  240. current_last_span = int(last_cell.get("colspan", 1))
  241. last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
  242. def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
  243. """执行表格合并操作"""
  244. # 检测表头有几行,并确认表头内容是否一致
  245. header_count, headers_match, header_texts = detect_table_headers(soup1, soup2)
  246. # logger.debug(f"检测到表头行数: {header_count}, 表头匹配: {headers_match}")
  247. # logger.debug(f"表头内容: {header_texts}")
  248. # 找到第一个表格的tbody,如果没有则查找table元素
  249. tbody1 = soup1.find("tbody") or soup1.find("table")
  250. # 获取表1和表2的所有行
  251. rows1 = soup1.find_all("tr")
  252. rows2 = soup2.find_all("tr")
  253. if rows1 and rows2 and header_count < len(rows2):
  254. # 获取表1最后一行和表2第一个非表头行
  255. last_row1 = rows1[-1]
  256. first_data_row2 = rows2[header_count]
  257. # 计算表格总列数
  258. table_cols1 = calculate_table_total_columns(soup1)
  259. table_cols2 = calculate_table_total_columns(soup2)
  260. if table_cols1 >= table_cols2:
  261. reference_structure = [int(cell.get("colspan", 1)) for cell in last_row1.find_all(["td", "th"])]
  262. reference_visual_cols = calculate_visual_columns(last_row1)
  263. # 以表1的最后一行为参考,调整表2的行
  264. adjust_table_rows_colspan(
  265. rows2, header_count, len(rows2),
  266. reference_structure, reference_visual_cols,
  267. table_cols1, table_cols2, first_data_row2
  268. )
  269. else: # table_cols2 > table_cols1
  270. reference_structure = [int(cell.get("colspan", 1)) for cell in first_data_row2.find_all(["td", "th"])]
  271. reference_visual_cols = calculate_visual_columns(first_data_row2)
  272. # 以表2的第一个数据行为参考,调整表1的行
  273. adjust_table_rows_colspan(
  274. rows1, 0, len(rows1),
  275. reference_structure, reference_visual_cols,
  276. table_cols2, table_cols1, last_row1
  277. )
  278. # 将第二个表格的行添加到第一个表格中
  279. if tbody1:
  280. tbody2 = soup2.find("tbody") or soup2.find("table")
  281. if tbody2:
  282. # 将第二个表格的行添加到第一个表格中(跳过表头行)
  283. for row in rows2[header_count:]:
  284. row.extract()
  285. tbody1.append(row)
  286. # 添加待合并表格的footnote到前一个表格中
  287. for table_footnote in wait_merge_table_footnotes:
  288. temp_table_footnote = table_footnote.copy()
  289. temp_table_footnote[SplitFlag.CROSS_PAGE] = True
  290. previous_table_block["blocks"].append(temp_table_footnote)
  291. return str(soup1)
  292. def merge_table(page_info_list):
  293. """合并跨页表格"""
  294. # 倒序遍历每一页
  295. for page_idx in range(len(page_info_list) - 1, -1, -1):
  296. # 跳过第一页,因为它没有前一页
  297. if page_idx == 0:
  298. continue
  299. page_info = page_info_list[page_idx]
  300. previous_page_info = page_info_list[page_idx - 1]
  301. # 检查当前页是否有表格块
  302. if not (page_info["para_blocks"] and page_info["para_blocks"][0]["type"] == BlockType.TABLE):
  303. continue
  304. current_table_block = page_info["para_blocks"][0]
  305. # 检查上一页是否有表格块
  306. if not (previous_page_info["para_blocks"] and previous_page_info["para_blocks"][-1]["type"] == BlockType.TABLE):
  307. continue
  308. previous_table_block = previous_page_info["para_blocks"][-1]
  309. # 收集待合并表格的footnote
  310. wait_merge_table_footnotes = [
  311. block for block in current_table_block["blocks"]
  312. if block["type"] == BlockType.TABLE_FOOTNOTE
  313. ]
  314. # 检查两个表格是否可以合并
  315. can_merge, soup1, soup2, current_html, previous_html = can_merge_tables(
  316. current_table_block, previous_table_block
  317. )
  318. if not can_merge:
  319. continue
  320. # 执行表格合并
  321. merged_html = perform_table_merge(
  322. soup1, soup2, previous_table_block, wait_merge_table_footnotes
  323. )
  324. # 更新previous_table_block的html
  325. for block in previous_table_block["blocks"]:
  326. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  327. block["lines"][0]["spans"][0]["html"] = merged_html
  328. break
  329. # 删除当前页的table
  330. for block in current_table_block["blocks"]:
  331. block['lines'] = []
  332. block[SplitFlag.LINES_DELETED] = True