table_cell_matcher.py 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910
  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from rapidfuzz import fuzz
  10. except ImportError:
  11. from fuzzywuzzy import fuzz
  12. import sys
  13. from pathlib import Path
  14. # 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
  15. ocr_platform_root = Path(__file__).parents[3] # ocr_merger -> ocr_tools -> ocr_platform -> repository.git
  16. if str(ocr_platform_root) not in sys.path:
  17. sys.path.insert(0, str(ocr_platform_root))
  18. try:
  19. from .text_matcher import TextMatcher
  20. from ocr_utils import BBoxExtractor # 从 ocr_utils 导入
  21. except ImportError:
  22. from text_matcher import TextMatcher
  23. from ocr_utils import BBoxExtractor # 从 ocr_utils 导入
  24. class TableCellMatcher:
  25. """表格单元格匹配器"""
  26. def __init__(self, text_matcher: TextMatcher,
  27. x_tolerance: int = 3,
  28. y_tolerance: int = 10,
  29. skew_threshold: float = 0.3):
  30. """
  31. Args:
  32. text_matcher: 文本匹配器
  33. x_tolerance: X轴容差(用于列边界判断)
  34. y_tolerance: Y轴容差(用于行分组)
  35. skew_threshold: 倾斜校正阈值(度数)
  36. """
  37. self.text_matcher = text_matcher
  38. self.x_tolerance = x_tolerance
  39. self.y_tolerance = y_tolerance
  40. self.skew_threshold = skew_threshold # 倾斜校正阈值(度数)
  41. def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  42. start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int, float]:
  43. """
  44. 为 HTML 表格添加 bbox 信息(优化版:使用行级动态规划)
  45. Returns:
  46. (enhanced_html, cells, new_pointer, skew_angle):
  47. 增强后的HTML、单元格列表、新指针位置、倾斜角度
  48. """
  49. soup = BeautifulSoup(html, 'html.parser')
  50. cells = []
  51. skew_angle = 0.0
  52. # 🔑 第一步:筛选表格区域内的 paddle boxes
  53. table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
  54. paddle_text_boxes[start_pointer:],
  55. table_bbox,
  56. html
  57. )
  58. if not table_region_boxes:
  59. print(f"⚠️ 未在表格区域找到 paddle boxes")
  60. return str(soup), cells, start_pointer, skew_angle
  61. print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
  62. # 🔑 第二步:将表格区域的 boxes 按行分组
  63. grouped_boxes, skew_angle = self._group_paddle_boxes_by_rows(
  64. table_region_boxes,
  65. y_tolerance=self.y_tolerance,
  66. auto_correct_skew=True,
  67. skew_threshold=self.skew_threshold
  68. )
  69. # 🔑 第三步:在每组内按 x 坐标排序
  70. for group in grouped_boxes:
  71. group['boxes'].sort(key=lambda x: x['bbox'][0])
  72. grouped_boxes.sort(key=lambda g: g['y_center'])
  73. # 🔑 第四步:智能匹配 HTML 行与 paddle 行组
  74. html_rows = soup.find_all('tr')
  75. row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
  76. print(f" HTML行: {len(html_rows)} 行, 映射: {len([v for v in row_mapping.values() if v])} 个有效映射")
  77. # 🔑 第五步:遍历 HTML 表格,使用 DP 进行行内匹配
  78. for row_idx, row in enumerate(html_rows):
  79. group_indices = row_mapping.get(row_idx, [])
  80. if not group_indices:
  81. continue
  82. # 合并多个组的 boxes
  83. current_boxes = []
  84. for group_idx in group_indices:
  85. if group_idx < len(grouped_boxes):
  86. current_boxes.extend(grouped_boxes[group_idx]['boxes'])
  87. # 再次按 x 排序确保顺序
  88. current_boxes.sort(key=lambda x: x['bbox'][0])
  89. html_cells = row.find_all(['td', 'th'])
  90. if not html_cells:
  91. continue
  92. # 🎯 核心变更:使用行级 DP 替代原来的顺序匹配
  93. # 输入:HTML 单元格列表, OCR Box 列表
  94. # 输出:匹配结果列表
  95. dp_results = self._match_cells_in_row_dp(html_cells, current_boxes)
  96. print(f" 行 {row_idx + 1}: {len(html_cells)} 列, 匹配到 {len(dp_results)} 个单元格")
  97. # 解析 DP 结果并填充 cells 列表
  98. for res in dp_results:
  99. cell_idx = res['cell_idx']
  100. match_info = res['match_info']
  101. cell_element = html_cells[cell_idx]
  102. cell_text = cell_element.get_text(strip=True)
  103. matched_boxes = match_info['boxes']
  104. matched_text = match_info['text']
  105. score = match_info['score']
  106. # 标记 box 为已使用
  107. paddle_indices = []
  108. for box in matched_boxes:
  109. box['used'] = True
  110. paddle_indices.append(box.get('paddle_bbox_index', -1))
  111. # 计算合并后的 bbox (使用原始坐标 original_bbox 优先)
  112. merged_bbox = self._merge_boxes_bbox(matched_boxes)
  113. # 注入 HTML 属性
  114. cell_element['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
  115. cell_element['data-score'] = f"{score:.4f}"
  116. cell_element['data-paddle-indices'] = str(paddle_indices)
  117. # 构建返回结构 (保持与原函数一致)
  118. cells.append({
  119. 'type': 'table_cell',
  120. 'text': cell_text,
  121. 'matched_text': matched_text,
  122. 'bbox': merged_bbox,
  123. 'row': row_idx + 1,
  124. 'col': cell_idx + 1,
  125. 'score': score,
  126. 'paddle_bbox_indices': paddle_indices
  127. })
  128. print(f" 列 {cell_idx + 1}: '{cell_text[:15]}...' 匹配 {len(matched_boxes)} 个box (分值: {score:.1f})")
  129. # 计算新的指针位置 (逻辑保持不变:基于 used 标记)
  130. used_count = sum(1 for box in table_region_boxes if box.get('used'))
  131. new_pointer = start_pointer + used_count
  132. print(f" 总计匹配: {len(cells)} 个单元格")
  133. return str(soup), cells, new_pointer, skew_angle
  134. def _merge_boxes_bbox(self, boxes: List[Dict]) -> List[int]:
  135. """辅助函数:合并多个 box 的坐标"""
  136. if not boxes:
  137. return [0, 0, 0, 0]
  138. # 优先使用 original_bbox,如果没有则使用 bbox
  139. def get_coords(b):
  140. return b.get('original_bbox', b['bbox'])
  141. x1 = min(get_coords(b)[0] for b in boxes)
  142. y1 = min(get_coords(b)[1] for b in boxes)
  143. x2 = max(get_coords(b)[2] for b in boxes)
  144. y2 = max(get_coords(b)[3] for b in boxes)
  145. return [x1, y1, x2, y2]
  146. def _match_cells_in_row_dp(self, html_cells: List, row_boxes: List[Dict]) -> List[Dict]:
  147. """
  148. 使用动态规划进行行内单元格匹配
  149. 目标:找到一种分配方案,使得整行的匹配总分最高
  150. """
  151. n_cells = len(html_cells)
  152. n_boxes = len(row_boxes)
  153. # dp[i][j] 表示:前 i 个单元格 消耗了 前 j 个 boxes 的最大得分
  154. dp = np.full((n_cells + 1, n_boxes + 1), -np.inf)
  155. dp[0][0] = 0
  156. # path[i][j] = (prev_j, matched_info) 用于回溯
  157. path = {}
  158. # 允许合并的最大 box 数量
  159. MAX_MERGE = 5
  160. for i in range(1, n_cells + 1):
  161. cell = html_cells[i-1]
  162. cell_text = cell.get_text(strip=True)
  163. # 如果单元格为空,允许继承状态(相当于跳过该单元格)
  164. if not cell_text:
  165. for j in range(n_boxes + 1):
  166. if dp[i-1][j] > -np.inf:
  167. dp[i][j] = dp[i-1][j]
  168. path[(i, j)] = (j, None)
  169. continue
  170. # 遍历当前 box 指针 j
  171. for j in range(n_boxes + 1):
  172. # 策略 A: 当前单元格不匹配任何 box (Cell Missing / OCR漏检)
  173. if dp[i-1][j] > dp[i][j]:
  174. dp[i][j] = dp[i-1][j]
  175. path[(i, j)] = (j, None)
  176. # 策略 B: 当前单元格匹配了 k 个 boxes (从 prev_j 到 j)
  177. # 限制搜索范围:最多往前看 MAX_MERGE 个 box
  178. search_limit = max(0, j - MAX_MERGE)
  179. # 允许中间跳过少量噪音 box (例如 prev_j 到 j 之间跨度大,但只取了部分)
  180. # 但为了简化,这里假设是连续取用 row_boxes[prev_j:j]
  181. for prev_j in range(j - 1, search_limit - 1, -1):
  182. if dp[i-1][prev_j] == -np.inf:
  183. continue
  184. candidate_boxes = row_boxes[prev_j:j]
  185. # 组合文本 (使用空格连接)
  186. merged_text = " ".join([b['text'] for b in candidate_boxes])
  187. # 计算得分
  188. score = self._compute_match_score(cell_text, merged_text)
  189. # 只有及格的匹配才考虑
  190. if score > 50:
  191. new_score = dp[i-1][prev_j] + score
  192. if new_score > dp[i][j]:
  193. dp[i][j] = new_score
  194. path[(i, j)] = (prev_j, {
  195. 'text': merged_text,
  196. 'boxes': candidate_boxes,
  197. 'score': score
  198. })
  199. # --- 回溯找最优解 ---
  200. best_j = np.argmax(dp[n_cells])
  201. if dp[n_cells][best_j] == -np.inf:
  202. return []
  203. results = []
  204. curr_i, curr_j = n_cells, best_j
  205. while curr_i > 0:
  206. step_info = path.get((curr_i, curr_j))
  207. if step_info:
  208. prev_j, match_info = step_info
  209. if match_info:
  210. results.append({
  211. 'cell_idx': curr_i - 1,
  212. 'match_info': match_info
  213. })
  214. curr_j = prev_j
  215. curr_i -= 1
  216. return results[::-1]
  217. def _compute_match_score(self, cell_text: str, box_text: str) -> float:
  218. """
  219. 纯粹的评分函数:计算单元格文本与候选 Box 文本的匹配得分
  220. 包含所有防御逻辑
  221. """
  222. # 1. 预处理
  223. cell_norm = self.text_matcher.normalize_text(cell_text)
  224. box_norm = self.text_matcher.normalize_text(box_text)
  225. if not cell_norm or not box_norm:
  226. return 0.0
  227. # --- ⚡️ 快速防御 ---
  228. len_cell = len(cell_norm)
  229. len_box = len(box_norm)
  230. # 长度差异过大直接 0 分 (除非是包含关系且特征明显)
  231. if len_box > len_cell * 3 + 5:
  232. if len_cell < 5: return 0.0
  233. # --- 🔍 核心相似度计算 ---
  234. cell_proc = self._preprocess_text_for_matching(cell_text)
  235. box_proc = self._preprocess_text_for_matching(box_text)
  236. # A. Token Sort (解决乱序)
  237. score_sort = fuzz.token_sort_ratio(cell_proc, box_proc)
  238. # B. Partial (解决截断/包含)
  239. score_partial = fuzz.partial_ratio(cell_norm, box_norm)
  240. # C. Subsequence (解决噪音插入)
  241. score_subseq = 0.0
  242. if len_cell > 5:
  243. score_subseq = self._calculate_subsequence_score(cell_norm, box_norm)
  244. # --- 🛡️ 深度防御逻辑 ---
  245. # 1. 短文本防御
  246. if score_partial > 80:
  247. import re
  248. has_content = lambda t: bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', t))
  249. # 纯符号防御
  250. if not has_content(cell_norm) and has_content(box_norm):
  251. if len_box > len_cell + 2: score_partial = 0.0
  252. # 微小碎片防御
  253. elif len_cell <= 2 and len_box > 8:
  254. score_partial = 0.0
  255. # 覆盖率防御
  256. else:
  257. coverage = len_cell / len_box if len_box > 0 else 0
  258. if coverage < 0.3 and score_sort < 45:
  259. score_partial = 0.0
  260. # 2. 子序列防御
  261. if score_subseq > 80:
  262. if len_box > len_cell * 1.5:
  263. import re
  264. if re.match(r'^[\d\-\:\.\s]+$', cell_norm) and len_cell < 12:
  265. score_subseq = 0.0
  266. # --- 📊 综合评分 ---
  267. final_score = max(score_sort, score_partial, score_subseq)
  268. # 精确匹配奖励
  269. if cell_norm == box_norm:
  270. final_score = 100.0
  271. elif cell_norm in box_norm:
  272. final_score = min(100, final_score + 5)
  273. return final_score
  274. def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
  275. table_bbox: Optional[List[int]],
  276. html: str) -> Tuple[List[Dict], List[int]]:
  277. """
  278. 筛选表格区域内的 paddle boxes
  279. 策略:
  280. 1. 如果有 table_bbox,使用边界框筛选(扩展边界)
  281. Args:
  282. paddle_boxes: paddle OCR 结果
  283. table_bbox: 表格边界框 [x1, y1, x2, y2]
  284. html: HTML 内容(用于内容验证)
  285. Returns:
  286. (筛选后的 boxes, 实际表格边界框)
  287. """
  288. if not paddle_boxes:
  289. return [], [0, 0, 0, 0]
  290. # 🎯 策略 1: 使用提供的 table_bbox(扩展边界)
  291. if table_bbox and len(table_bbox) == 4:
  292. x1, y1, x2, y2 = table_bbox
  293. # 扩展边界(考虑边框外的文本)
  294. margin = 20
  295. expanded_bbox = [
  296. max(0, x1 - margin),
  297. max(0, y1 - margin),
  298. x2 + margin,
  299. y2 + margin
  300. ]
  301. filtered = []
  302. for box in paddle_boxes:
  303. bbox = box['bbox']
  304. box_center_x = (bbox[0] + bbox[2]) / 2
  305. box_center_y = (bbox[1] + bbox[3]) / 2
  306. # 中心点在扩展区域内
  307. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  308. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  309. filtered.append(box)
  310. if filtered:
  311. # 计算实际边界框
  312. actual_bbox = [
  313. min(b['bbox'][0] for b in filtered),
  314. min(b['bbox'][1] for b in filtered),
  315. max(b['bbox'][2] for b in filtered),
  316. max(b['bbox'][3] for b in filtered)
  317. ]
  318. return filtered, actual_bbox
  319. else:
  320. return [], [0, 0, 0, 0]
  321. else:
  322. raise ValueError(f"table_bbox is not valid: table_bbox={table_bbox}")
  323. def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
  324. y_tolerance: int = 10,
  325. auto_correct_skew: bool = True,
  326. skew_threshold: float = 0.3) -> Tuple[List[Dict], float]:
  327. """
  328. 将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
  329. Args:
  330. paddle_boxes: Paddle OCR 文字框列表
  331. y_tolerance: Y 坐标容忍度(像素)
  332. auto_correct_skew: 是否自动校正倾斜
  333. Returns:
  334. 分组列表,每组包含 {'y_center': float, 'boxes': List[Dict]}
  335. """
  336. skew_angle = 0.0
  337. if not paddle_boxes:
  338. return [], skew_angle
  339. # 🎯 步骤 1: 检测并校正倾斜(使用 BBoxExtractor)
  340. if auto_correct_skew:
  341. skew_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
  342. if abs(skew_angle) > skew_threshold:
  343. max_x = max(box['bbox'][2] for box in paddle_boxes)
  344. max_y = max(box['bbox'][3] for box in paddle_boxes)
  345. image_size = (max_x, max_y)
  346. print(f" 🔧 校正倾斜角度: {skew_angle:.2f}°")
  347. # 计算校正角度 (顺时针旋转)
  348. correction_angle = -skew_angle
  349. paddle_boxes = BBoxExtractor.correct_boxes_skew(
  350. paddle_boxes, correction_angle, image_size
  351. )
  352. # 🎯 步骤 2: 按校正后的 y 坐标分组
  353. boxes_with_y = []
  354. for box in paddle_boxes:
  355. bbox = box['bbox']
  356. y_center = (bbox[1] + bbox[3]) / 2
  357. boxes_with_y.append({
  358. 'y_center': y_center,
  359. 'box': box
  360. })
  361. # 按 y 坐标排序
  362. boxes_with_y.sort(key=lambda x: x['y_center'])
  363. groups = []
  364. current_group = None
  365. for item in boxes_with_y:
  366. if current_group is None:
  367. # 开始新组
  368. current_group = {
  369. 'y_center': item['y_center'],
  370. 'boxes': [item['box']]
  371. }
  372. else:
  373. if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
  374. current_group['boxes'].append(item['box'])
  375. # 更新组的中心
  376. current_group['y_center'] = sum(
  377. (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
  378. ) / len(current_group['boxes'])
  379. else:
  380. groups.append(current_group)
  381. current_group = {
  382. 'y_center': item['y_center'],
  383. 'boxes': [item['box']]
  384. }
  385. if current_group:
  386. groups.append(current_group)
  387. print(f" ✓ 分组完成: {len(groups)} 行")
  388. return groups, skew_angle
  389. def _match_html_rows_to_paddle_groups(self, html_rows: List,
  390. grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
  391. """
  392. 智能匹配 HTML 行与 paddle 分组(增强版 DP:支持跳过 HTML 行,防止链条断裂)
  393. """
  394. if not html_rows or not grouped_boxes:
  395. return {}
  396. mapping = {}
  397. # 提取 HTML 文本(用于验证)
  398. html_row_texts = []
  399. for row in html_rows:
  400. cells = row.find_all(['td', 'th'])
  401. texts = [self.text_matcher.normalize_text(c.get_text(strip=True)) for c in cells]
  402. html_row_texts.append("".join(texts))
  403. # 预计算所有组的文本(用于验证)
  404. group_texts = []
  405. for group in grouped_boxes:
  406. boxes = group['boxes']
  407. texts = [self.text_matcher.normalize_text(b['text']) for b in boxes]
  408. group_texts.append("".join(texts))
  409. # 🎯 策略 1: 数量相等时,验证内容匹配度,不能简单1:1映射
  410. if len(html_rows) == len(grouped_boxes):
  411. # 计算每对行的相似度
  412. similarity_matrix = []
  413. for i, html_text in enumerate(html_row_texts):
  414. row_similarities = []
  415. for j, group_text in enumerate(group_texts):
  416. similarity = self._calculate_similarity(html_text, group_text)
  417. row_similarities.append(similarity)
  418. similarity_matrix.append(row_similarities)
  419. # 检查是否所有对角线元素都是最佳匹配(允许一定偏移)
  420. # 如果对角线匹配度都很高,才使用1:1映射
  421. diagonal_ok = True
  422. min_similarity_threshold = 0.3 # 最低相似度阈值
  423. for i in range(len(html_rows)):
  424. diag_sim = similarity_matrix[i][i]
  425. # 检查是否是对角线元素是最佳匹配
  426. max_sim_in_row = max(similarity_matrix[i])
  427. # 如果对角线相似度太低,或者不是该行的最佳匹配,则不使用1:1映射
  428. if diag_sim < min_similarity_threshold or (max_sim_in_row > diag_sim + 0.1):
  429. diagonal_ok = False
  430. break
  431. # 只有当对角线匹配度都足够高时,才使用简单1:1映射
  432. if diagonal_ok:
  433. print(f" ✓ 行数相同且对角线匹配良好,使用1:1映射")
  434. for i in range(len(html_rows)):
  435. mapping[i] = [i]
  436. return mapping
  437. else:
  438. print(f" ⚠️ 行数相同但内容不匹配,使用DP算法进行智能匹配")
  439. # 继续执行下面的DP算法(html_row_texts 和 group_texts 已在上面计算)
  440. n_html = len(html_row_texts)
  441. n_paddle = len(grouped_boxes)
  442. # 剪枝参数
  443. beam_width = self._get_adaptive_beam_width(n_html)
  444. # ⚡️ 优化 3: 预计算合并文本
  445. MAX_MERGE = 4
  446. merged_cache = {}
  447. for j in range(n_paddle):
  448. current_t = ""
  449. for k in range(MAX_MERGE):
  450. if j + k < n_paddle:
  451. current_t += group_texts[j + k]
  452. merged_cache[(j, k + 1)] = current_t
  453. else:
  454. break
  455. # --- 动态规划 (DP) ---
  456. # dp[i][j] 表示:HTML 前 i 行 (0..i) 匹配到了 Paddle 的前 j 组 (0..j) 的最大得分
  457. # 初始化为负无穷
  458. dp = np.full((n_html, n_paddle), -np.inf)
  459. # 记录路径:path[i][j] = (prev_j, start_j)
  460. # prev_j: 上一行结束的 paddle index
  461. # start_j: 当前行开始的 paddle index (因为一行可能对应多个组)
  462. path = {}
  463. # 参数配置
  464. SEARCH_WINDOW = 15 # 向前搜索窗口
  465. SKIP_PADDLE_PENALTY = 0.1 # 跳过 Paddle 组的惩罚
  466. SKIP_HTML_PENALTY = 0.3 # 关键:跳过 HTML 行的惩罚
  467. # --- 1. 初始化第一行 ---
  468. # 选项 A: 匹配 Paddle 组
  469. first_row_matched = False
  470. for end_j in range(min(n_paddle, SEARCH_WINDOW + MAX_MERGE)):
  471. for count in range(1, MAX_MERGE + 1):
  472. start_j = end_j - count + 1
  473. if start_j < 0: continue
  474. current_text = merged_cache.get((start_j, count), "")
  475. similarity = self._calculate_similarity(html_row_texts[0], current_text)
  476. penalty = start_j * SKIP_PADDLE_PENALTY
  477. score = similarity - penalty
  478. # 只有得分尚可才作为有效状态
  479. if score > 0.1:
  480. if score > dp[0][end_j]:
  481. dp[0][end_j] = score
  482. path[(0, end_j)] = (-1, start_j)
  483. first_row_matched = True
  484. # 选项 B: 如果第一行完全没有匹配,设置默认初始状态(允许跳过第一行)
  485. # 这样后续行可以从第一个OCR组开始匹配
  486. if not first_row_matched:
  487. print(f" ⚠️ 第一行(表头)无法匹配任何OCR分组,允许跳过第一行")
  488. # 设置一个默认初始状态:从第一个OCR组开始,但得分较低(表示跳过了第一行)
  489. # 这样第二行可以从第一个OCR组开始匹配
  490. if n_paddle > 0:
  491. # 从索引0开始,但得分很低(表示跳过了第一行HTML)
  492. dp[0][0] = -SKIP_HTML_PENALTY # 负分表示跳过了第一行
  493. path[(0, 0)] = (-1, 0) # 没有消耗任何OCR组
  494. first_row_matched = True # 标记为已处理,避免后续重复
  495. # --- 2. 状态转移 ---
  496. for i in range(1, n_html):
  497. html_text = html_row_texts[i]
  498. # 获取上一行所有有效位置
  499. valid_prev_indices = [j for j in range(n_paddle) if dp[i-1][j] > -np.inf]
  500. # 🛡️ 关键修复:如果上一行没有有效状态(第一行完全无法匹配),
  501. # 允许从第一个OCR组开始(跳过第一行HTML)
  502. if not valid_prev_indices:
  503. print(f" ⚠️ 第{i}行:上一行无有效状态,从第一个OCR组开始匹配(跳过前面的HTML行)")
  504. # 从第一个OCR组开始,但得分较低(表示跳过了前面的HTML行)
  505. if n_paddle > 0:
  506. dp[i][0] = -SKIP_HTML_PENALTY * i # 惩罚与跳过的行数成正比
  507. path[(i, 0)] = (-1, 0) # 没有消耗任何OCR组
  508. valid_prev_indices = [0] # 设置一个初始状态
  509. # 剪枝
  510. # 剪枝操作:为了提升DP效率,若上一行的可行状态(prev_j)过多(>30),
  511. # 只保留得分最高的前beam_width个prev_j作为起点,减少组合爆炸
  512. if len(valid_prev_indices) > beam_width:
  513. valid_prev_indices.sort(key=lambda j: dp[i-1][j], reverse=True)
  514. valid_prev_indices = valid_prev_indices[:beam_width]
  515. # 🛡️ 关键修复:允许跳过当前 HTML 行 (继承上一行的状态)
  516. # 如果跳过当前行,Paddle 指针 j 不变
  517. for prev_j in valid_prev_indices:
  518. score_skip = dp[i-1][prev_j] - SKIP_HTML_PENALTY
  519. if score_skip > dp[i][prev_j]:
  520. dp[i][prev_j] = score_skip
  521. # 记录路径:start_j = prev_j + 1 表示没有消耗新组 (空范围)
  522. path[(i, prev_j)] = (prev_j, prev_j + 1)
  523. # 如果是空行,直接跳过计算,仅保留继承的状态
  524. if not html_text:
  525. continue
  526. # 正常匹配逻辑
  527. for prev_j in valid_prev_indices:
  528. prev_score = dp[i-1][prev_j]
  529. max_gap = min(SEARCH_WINDOW, n_paddle - prev_j - 1)
  530. for gap in range(max_gap):
  531. start_j = prev_j + 1 + gap
  532. for count in range(1, MAX_MERGE + 1):
  533. end_j = start_j + count - 1
  534. if end_j >= n_paddle: break
  535. current_text = merged_cache.get((start_j, count), "")
  536. # 长度预筛选
  537. h_len = len(html_text)
  538. p_len = len(current_text)
  539. if h_len > 10 and p_len < h_len * 0.2:
  540. continue
  541. similarity = self._calculate_similarity(html_text, current_text)
  542. # 计算惩罚
  543. # 1. 跳过惩罚 (gap)
  544. # 2. 长度惩罚 (防止过度合并)
  545. len_penalty = 0.0
  546. if h_len > 0:
  547. ratio = p_len / h_len
  548. if ratio > 2.0: len_penalty = (ratio - 2.0) * 0.2
  549. current_score = similarity - (gap * SKIP_PADDLE_PENALTY) - len_penalty
  550. # 只有正收益才转移
  551. if current_score > 0.1:
  552. total_score = prev_score + current_score
  553. if total_score > dp[i][end_j]:
  554. dp[i][end_j] = total_score
  555. path[(i, end_j)] = (prev_j, start_j)
  556. # --- 3. 回溯找最优路径 ---
  557. # 找到最后一行得分最高的结束位置
  558. best_end_j = -1
  559. max_score = -np.inf
  560. # 优先找最后一行,如果最后一行没匹配上,往前找
  561. found_end = False
  562. for i in range(n_html - 1, -1, -1):
  563. for j in range(n_paddle):
  564. if dp[i][j] > max_score:
  565. max_score = dp[i][j]
  566. best_end_j = j
  567. best_last_row = i
  568. if max_score > -np.inf:
  569. found_end = True
  570. break
  571. mapping = {}
  572. used_groups = set()
  573. if found_end:
  574. curr_i = best_last_row
  575. curr_j = best_end_j
  576. while curr_i >= 0:
  577. if (curr_i, curr_j) in path:
  578. prev_j, start_j = path[(curr_i, curr_j)]
  579. # 如果 start_j <= curr_j,说明消耗了 Paddle 组
  580. # 如果 start_j > curr_j,说明是跳过 HTML 行 (空范围)
  581. if start_j <= curr_j:
  582. indices = list(range(start_j, curr_j + 1))
  583. mapping[curr_i] = indices
  584. used_groups.update(indices)
  585. else:
  586. mapping[curr_i] = []
  587. curr_j = prev_j
  588. curr_i -= 1
  589. else:
  590. break
  591. # 填补未匹配的行
  592. for i in range(n_html):
  593. if i not in mapping:
  594. mapping[i] = []
  595. # --- 4. 后处理:未匹配组的归属 (Orphans) ---
  596. unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
  597. if unused_groups:
  598. print(f" ℹ️ 发现 {len(unused_groups)} 个未匹配的 paddle 组: {unused_groups}")
  599. for unused_idx in unused_groups:
  600. unused_group = grouped_boxes[unused_idx]
  601. unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
  602. unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
  603. above_idx = None
  604. below_idx = None
  605. above_distance = float('inf')
  606. below_distance = float('inf')
  607. for i in range(unused_idx - 1, -1, -1):
  608. if i in used_groups:
  609. above_idx = i
  610. above_group = grouped_boxes[i]
  611. max_y_box = max(above_group['boxes'], key=lambda b: b['bbox'][3])
  612. above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
  613. above_distance = abs(unused_y_min - above_y_center)
  614. break
  615. for i in range(unused_idx + 1, len(grouped_boxes)):
  616. if i in used_groups:
  617. below_idx = i
  618. below_group = grouped_boxes[i]
  619. min_y_box = min(below_group['boxes'], key=lambda b: b['bbox'][1])
  620. below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
  621. below_distance = abs(below_y_center - unused_y_max)
  622. break
  623. closest_used_idx = None
  624. merge_direction = ""
  625. if above_idx is not None and below_idx is not None:
  626. if above_distance < below_distance:
  627. closest_used_idx = above_idx
  628. merge_direction = "上方"
  629. else:
  630. closest_used_idx = below_idx
  631. merge_direction = "下方"
  632. elif above_idx is not None:
  633. closest_used_idx = above_idx
  634. merge_direction = "上方"
  635. elif below_idx is not None:
  636. closest_used_idx = below_idx
  637. merge_direction = "下方"
  638. if closest_used_idx is not None:
  639. target_html_row = None
  640. for html_row_idx, group_indices in mapping.items():
  641. if closest_used_idx in group_indices:
  642. target_html_row = html_row_idx
  643. break
  644. if target_html_row is not None:
  645. if unused_idx not in mapping[target_html_row]:
  646. mapping[target_html_row].append(unused_idx)
  647. mapping[target_html_row].sort()
  648. print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}({merge_direction}行)")
  649. used_groups.add(unused_idx)
  650. # 🔑 策略 4: 第三遍 - 按 y 坐标排序每行的组索引
  651. for row_idx in mapping:
  652. if mapping[row_idx]:
  653. mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
  654. return mapping
  655. def _get_adaptive_beam_width(self, html_row_count: int) -> int:
  656. """根据HTML行数动态调整剪枝参数"""
  657. if html_row_count <= 20:
  658. return 10
  659. elif html_row_count <= 40:
  660. return 15
  661. else:
  662. return 20 # 最大20,而不是30
  663. def _calculate_similarity(self, text1: str, text2: str) -> float:
  664. """
  665. 计算两个文本的相似度,结合字符覆盖率和序列相似度 (性能优化版)
  666. """
  667. if not text1 or not text2:
  668. return 0.0
  669. len1, len2 = len(text1), len(text2)
  670. # ⚡️ 优化 1: 长度快速检查
  671. # 如果长度差异过大(例如一个 50 字符,一个 2 字符),直接认为不匹配
  672. if len1 > 0 and len2 > 0:
  673. min_l, max_l = min(len1, len2), max(len1, len2)
  674. if max_l > 10 and min_l / max_l < 0.2:
  675. return 0.0
  676. # 1. 字符覆盖率 (Character Overlap)
  677. from collections import Counter
  678. c1 = Counter(text1)
  679. c2 = Counter(text2)
  680. intersection = c1 & c2
  681. overlap_count = sum(intersection.values())
  682. coverage = overlap_count / len1 if len1 > 0 else 0
  683. # ⚡️ 优化 2: 覆盖率低时跳过昂贵的 fuzz 计算
  684. # 如果字符重叠率低于 30%,说明内容基本不相关,没必要算序列相似度
  685. if coverage < 0.3:
  686. return coverage * 0.7
  687. # 2. 序列相似度 (Sequence Similarity)
  688. # 使用 token_sort_ratio 来容忍一定的乱序
  689. seq_score = fuzz.token_sort_ratio(text1, text2) / 100.0
  690. return (coverage * 0.7) + (seq_score * 0.3)
  691. def _preprocess_text_for_matching(self, text: str) -> str:
  692. """
  693. 预处理文本:在不同类型的字符(如中文和数字/英文)之间插入空格,
  694. 以便于 token_sort_ratio 更准确地进行分词和匹配。
  695. """
  696. if not text:
  697. return ""
  698. import re
  699. # 1. 在中文和非中文(数字/字母)之间插入空格
  700. # 例如: "2024年" -> "2024 年", "ID号码123" -> "ID号码 123"
  701. text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z0-9])', r'\1 \2', text)
  702. text = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fa5])', r'\1 \2', text)
  703. return text
  704. def _calculate_subsequence_score(self, target: str, source: str) -> float:
  705. """
  706. 计算子序列匹配得分 (解决 OCR 噪音插入问题)
  707. 例如: Target="12345", Source="12(date)34(time)5" -> Score close to 100
  708. """
  709. # 1. 仅保留字母和数字,忽略符号干扰
  710. t_clean = "".join(c for c in target if c.isalnum())
  711. s_clean = "".join(c for c in source if c.isalnum())
  712. if not t_clean or not s_clean:
  713. return 0.0
  714. # 2. 贪婪匹配子序列
  715. t_idx, s_idx = 0, 0
  716. matches = 0
  717. while t_idx < len(t_clean) and s_idx < len(s_clean):
  718. if t_clean[t_idx] == s_clean[s_idx]:
  719. matches += 1
  720. t_idx += 1
  721. s_idx += 1
  722. else:
  723. # 跳过 source 中的噪音字符
  724. s_idx += 1
  725. # 3. 计算得分
  726. match_rate = matches / len(t_clean)
  727. # 如果匹配率太低,直接返回
  728. if match_rate < 0.8:
  729. return match_rate * 100
  730. # 4. 噪音惩罚 (防止 Target="1", Source="123456789" 这种误判)
  731. # 计算噪音长度
  732. noise_len = len(s_clean) - matches
  733. # 允许一定比例的噪音 (例如日期时间插入,通常占总长度的 30%-50%)
  734. # 如果噪音长度超过目标长度的 60%,开始扣分
  735. penalty = 0
  736. if noise_len > len(t_clean) * 0.6:
  737. excess_noise = noise_len - (len(t_clean) * 0.6)
  738. penalty = excess_noise * 0.5 # 每多一个噪音字符扣 0.5 分
  739. penalty = min(penalty, 20) # 最多扣 20 分
  740. final_score = (match_rate * 100) - penalty
  741. return max(0, final_score)