Răsfoiți Sursa

Enhance table merging logic to adjust colspan attributes based on row structures

myhloli 1 lună în urmă
părinte
comite
2108019698
1 a modificat fișierele cu 76 adăugiri și 9 ștergeri
  1. 76 9
      mineru/utils/table_merge.py

+ 76 - 9
mineru/utils/table_merge.py

@@ -263,17 +263,84 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
     # 找到第一个表格的tbody,如果没有则查找table元素
     tbody1 = soup1.find("tbody") or soup1.find("table")
 
-    # 找到第二个表格的tbody,如果没有则查找table元素
-    tbody2 = soup2.find("tbody") or soup2.find("table")
+    # 获取表1和表2的所有行
+    rows1 = soup1.find_all("tr")
+    rows2 = soup2.find_all("tr")
+
+
+    if rows1 and rows2 and header_count < len(rows2):
+        # 获取表1最后一行
+        last_row1 = rows1[-1]
+        # 获取表2第一个非表头行
+        first_data_row2 = rows2[header_count]
+
+        # 分析两行的colspan结构
+        last_row1_structure = []
+        has_colspan_last_row1 = False
+        for cell in last_row1.find_all(["td", "th"]):
+            colspan = int(cell.get("colspan", 1))
+            last_row1_structure.append(colspan)
+            if colspan > 1:
+                has_colspan_last_row1 = True
+
+        first_row2_structure = []
+        has_colspan_first_row2 = False
+        for cell in first_data_row2.find_all(["td", "th"]):
+            colspan = int(cell.get("colspan", 1))
+            first_row2_structure.append(colspan)
+            if colspan > 1:
+                has_colspan_first_row2 = True
+
+        # 确定基准结构(优先使用有colspan的行)
+        if has_colspan_last_row1:
+            reference_structure = last_row1_structure
+            reference_visual_cols = calculate_visual_columns(last_row1)
+        elif has_colspan_first_row2:
+            reference_structure = first_row2_structure
+            reference_visual_cols = calculate_visual_columns(first_data_row2)
+        else:
+            # 都没有colspan时使用表1最后一行作为默认基准
+            reference_structure = last_row1_structure
+            reference_visual_cols = calculate_visual_columns(last_row1)
+
+        # 如果表1最后一行没有colspan但表2首行有,则调整表1相关行
+        if not has_colspan_last_row1 and has_colspan_first_row2:
+            # 找到表1中所有具有相同视觉列数的行
+            rows_to_adjust = []
+            for i in range(len(rows1) - 1, -1, -1):
+                if calculate_visual_columns(rows1[i]) == reference_visual_cols:
+                    rows_to_adjust.append(rows1[i])
+                else:
+                    break
+
+            # 应用参考结构到这些行
+            for row in rows_to_adjust:
+                cells = row.find_all(["td", "th"])
+                if cells and len(cells) <= len(reference_structure):
+                    for j, cell in enumerate(cells):
+                        if j < len(reference_structure) and reference_structure[j] > 1:
+                            cell["colspan"] = str(reference_structure[j])
+
+        # 如果表2首行没有colspan但表1最后一行有,则调整表2相关行
+        elif has_colspan_last_row1 and not has_colspan_first_row2:
+            # 调整表2中所有具有相同视觉列数的行
+            for i in range(header_count, len(rows2)):
+                row = rows2[i]
+                if calculate_visual_columns(row) == reference_visual_cols:
+                    cells = row.find_all(["td", "th"])
+                    if cells and len(cells) <= len(reference_structure):
+                        for j, cell in enumerate(cells):
+                            if j < len(reference_structure) and reference_structure[j] > 1:
+                                cell["colspan"] = str(reference_structure[j])
 
     # 将第二个表格的行添加到第一个表格中
-    if tbody1 and tbody2:
-        rows2 = soup2.find_all("tr")
-        # 将第二个表格的行添加到第一个表格中(跳过表头行)
-        for row in rows2[header_count:]:
-            # 从原来的位置移除行,并添加到第一个表格中
-            row.extract()
-            tbody1.append(row)
+    if tbody1:
+        tbody2 = soup2.find("tbody") or soup2.find("table")
+        if tbody2:
+            # 将第二个表格的行添加到第一个表格中(跳过表头行)
+            for row in rows2[header_count:]:
+                row.extract()
+                tbody1.append(row)
 
     # 添加待合并表格的footnote到前一个表格中
     for table_footnote in wait_merge_table_footnotes: