table_cell_matcher.py 36 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909
  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from rapidfuzz import fuzz
  10. except ImportError:
  11. from fuzzywuzzy import fuzz
  12. try:
  13. from .text_matcher import TextMatcher
  14. from .bbox_extractor import BBoxExtractor
  15. except ImportError:
  16. from text_matcher import TextMatcher
  17. from bbox_extractor import BBoxExtractor
  18. class TableCellMatcher:
  19. """表格单元格匹配器"""
  20. def __init__(self, text_matcher: TextMatcher,
  21. x_tolerance: int = 3,
  22. y_tolerance: int = 10,
  23. inclination_threshold: float = 0.3):
  24. """
  25. Args:
  26. text_matcher: 文本匹配器
  27. x_tolerance: X轴容差(用于列边界判断)
  28. y_tolerance: Y轴容差(用于行分组)
  29. """
  30. self.text_matcher = text_matcher
  31. self.x_tolerance = x_tolerance
  32. self.y_tolerance = y_tolerance
  33. self.inclination_threshold = inclination_threshold # 倾斜校正阈值(度数)
  34. def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  35. start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
  36. """
  37. 为 HTML 表格添加 bbox 信息(优化版:使用行级动态规划)
  38. """
  39. soup = BeautifulSoup(html, 'html.parser')
  40. cells = []
  41. # 🔑 第一步:筛选表格区域内的 paddle boxes
  42. table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
  43. paddle_text_boxes[start_pointer:],
  44. table_bbox,
  45. html
  46. )
  47. if not table_region_boxes:
  48. print(f"⚠️ 未在表格区域找到 paddle boxes")
  49. return str(soup), cells, start_pointer
  50. print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
  51. # 🔑 第二步:将表格区域的 boxes 按行分组
  52. grouped_boxes = self._group_paddle_boxes_by_rows(
  53. table_region_boxes,
  54. y_tolerance=self.y_tolerance,
  55. auto_correct_skew=True,
  56. inclination_threshold=self.inclination_threshold
  57. )
  58. # 🔑 第三步:在每组内按 x 坐标排序
  59. for group in grouped_boxes:
  60. group['boxes'].sort(key=lambda x: x['bbox'][0])
  61. grouped_boxes.sort(key=lambda g: g['y_center'])
  62. # 🔑 第四步:智能匹配 HTML 行与 paddle 行组
  63. html_rows = soup.find_all('tr')
  64. row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
  65. print(f" HTML行: {len(html_rows)} 行, 映射: {len([v for v in row_mapping.values() if v])} 个有效映射")
  66. # 🔑 第五步:遍历 HTML 表格,使用 DP 进行行内匹配
  67. for row_idx, row in enumerate(html_rows):
  68. group_indices = row_mapping.get(row_idx, [])
  69. if not group_indices:
  70. continue
  71. # 合并多个组的 boxes
  72. current_boxes = []
  73. for group_idx in group_indices:
  74. if group_idx < len(grouped_boxes):
  75. current_boxes.extend(grouped_boxes[group_idx]['boxes'])
  76. # 再次按 x 排序确保顺序
  77. current_boxes.sort(key=lambda x: x['bbox'][0])
  78. html_cells = row.find_all(['td', 'th'])
  79. if not html_cells:
  80. continue
  81. # 🎯 核心变更:使用行级 DP 替代原来的顺序匹配
  82. # 输入:HTML 单元格列表, OCR Box 列表
  83. # 输出:匹配结果列表
  84. dp_results = self._match_cells_in_row_dp(html_cells, current_boxes)
  85. print(f" 行 {row_idx + 1}: {len(html_cells)} 列, 匹配到 {len(dp_results)} 个单元格")
  86. # 解析 DP 结果并填充 cells 列表
  87. for res in dp_results:
  88. cell_idx = res['cell_idx']
  89. match_info = res['match_info']
  90. cell_element = html_cells[cell_idx]
  91. cell_text = cell_element.get_text(strip=True)
  92. matched_boxes = match_info['boxes']
  93. matched_text = match_info['text']
  94. score = match_info['score']
  95. # 标记 box 为已使用
  96. paddle_indices = []
  97. for box in matched_boxes:
  98. box['used'] = True
  99. paddle_indices.append(box.get('paddle_bbox_index', -1))
  100. # 计算合并后的 bbox (使用原始坐标 original_bbox 优先)
  101. merged_bbox = self._merge_boxes_bbox(matched_boxes)
  102. # 注入 HTML 属性
  103. cell_element['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
  104. cell_element['data-score'] = f"{score:.4f}"
  105. cell_element['data-paddle-indices'] = str(paddle_indices)
  106. # 构建返回结构 (保持与原函数一致)
  107. cells.append({
  108. 'type': 'table_cell',
  109. 'text': cell_text,
  110. 'matched_text': matched_text,
  111. 'bbox': merged_bbox,
  112. 'row': row_idx + 1,
  113. 'col': cell_idx + 1,
  114. 'score': score,
  115. 'paddle_bbox_indices': paddle_indices
  116. })
  117. print(f" 列 {cell_idx + 1}: '{cell_text[:15]}...' 匹配 {len(matched_boxes)} 个box (分值: {score:.1f})")
  118. # 计算新的指针位置 (逻辑保持不变:基于 used 标记)
  119. used_count = sum(1 for box in table_region_boxes if box.get('used'))
  120. new_pointer = start_pointer + used_count
  121. print(f" 总计匹配: {len(cells)} 个单元格")
  122. return str(soup), cells, new_pointer
  123. def _merge_boxes_bbox(self, boxes: List[Dict]) -> List[int]:
  124. """辅助函数:合并多个 box 的坐标"""
  125. if not boxes:
  126. return [0, 0, 0, 0]
  127. # 优先使用 original_bbox,如果没有则使用 bbox
  128. def get_coords(b):
  129. return b.get('original_bbox', b['bbox'])
  130. x1 = min(get_coords(b)[0] for b in boxes)
  131. y1 = min(get_coords(b)[1] for b in boxes)
  132. x2 = max(get_coords(b)[2] for b in boxes)
  133. y2 = max(get_coords(b)[3] for b in boxes)
  134. return [x1, y1, x2, y2]
  135. def _match_cells_in_row_dp(self, html_cells: List, row_boxes: List[Dict]) -> List[Dict]:
  136. """
  137. 使用动态规划进行行内单元格匹配
  138. 目标:找到一种分配方案,使得整行的匹配总分最高
  139. """
  140. n_cells = len(html_cells)
  141. n_boxes = len(row_boxes)
  142. # dp[i][j] 表示:前 i 个单元格 消耗了 前 j 个 boxes 的最大得分
  143. dp = np.full((n_cells + 1, n_boxes + 1), -np.inf)
  144. dp[0][0] = 0
  145. # path[i][j] = (prev_j, matched_info) 用于回溯
  146. path = {}
  147. # 允许合并的最大 box 数量
  148. MAX_MERGE = 5
  149. for i in range(1, n_cells + 1):
  150. cell = html_cells[i-1]
  151. cell_text = cell.get_text(strip=True)
  152. # 如果单元格为空,允许继承状态(相当于跳过该单元格)
  153. if not cell_text:
  154. for j in range(n_boxes + 1):
  155. if dp[i-1][j] > -np.inf:
  156. dp[i][j] = dp[i-1][j]
  157. path[(i, j)] = (j, None)
  158. continue
  159. # 遍历当前 box 指针 j
  160. for j in range(n_boxes + 1):
  161. # 策略 A: 当前单元格不匹配任何 box (Cell Missing / OCR漏检)
  162. if dp[i-1][j] > dp[i][j]:
  163. dp[i][j] = dp[i-1][j]
  164. path[(i, j)] = (j, None)
  165. # 策略 B: 当前单元格匹配了 k 个 boxes (从 prev_j 到 j)
  166. # 限制搜索范围:最多往前看 MAX_MERGE 个 box
  167. search_limit = max(0, j - MAX_MERGE)
  168. # 允许中间跳过少量噪音 box (例如 prev_j 到 j 之间跨度大,但只取了部分)
  169. # 但为了简化,这里假设是连续取用 row_boxes[prev_j:j]
  170. for prev_j in range(j - 1, search_limit - 1, -1):
  171. if dp[i-1][prev_j] == -np.inf:
  172. continue
  173. candidate_boxes = row_boxes[prev_j:j]
  174. # 组合文本 (使用空格连接)
  175. merged_text = " ".join([b['text'] for b in candidate_boxes])
  176. # 计算得分
  177. score = self._compute_match_score(cell_text, merged_text)
  178. # 只有及格的匹配才考虑
  179. if score > 50:
  180. new_score = dp[i-1][prev_j] + score
  181. if new_score > dp[i][j]:
  182. dp[i][j] = new_score
  183. path[(i, j)] = (prev_j, {
  184. 'text': merged_text,
  185. 'boxes': candidate_boxes,
  186. 'score': score
  187. })
  188. # --- 回溯找最优解 ---
  189. best_j = np.argmax(dp[n_cells])
  190. if dp[n_cells][best_j] == -np.inf:
  191. return []
  192. results = []
  193. curr_i, curr_j = n_cells, best_j
  194. while curr_i > 0:
  195. step_info = path.get((curr_i, curr_j))
  196. if step_info:
  197. prev_j, match_info = step_info
  198. if match_info:
  199. results.append({
  200. 'cell_idx': curr_i - 1,
  201. 'match_info': match_info
  202. })
  203. curr_j = prev_j
  204. curr_i -= 1
  205. return results[::-1]
  206. def _compute_match_score(self, cell_text: str, box_text: str) -> float:
  207. """
  208. 纯粹的评分函数:计算单元格文本与候选 Box 文本的匹配得分
  209. 包含所有防御逻辑
  210. """
  211. # 1. 预处理
  212. cell_norm = self.text_matcher.normalize_text(cell_text)
  213. box_norm = self.text_matcher.normalize_text(box_text)
  214. if not cell_norm or not box_norm:
  215. return 0.0
  216. # --- ⚡️ 快速防御 ---
  217. len_cell = len(cell_norm)
  218. len_box = len(box_norm)
  219. # 长度差异过大直接 0 分 (除非是包含关系且特征明显)
  220. if len_box > len_cell * 3 + 5:
  221. if len_cell < 5: return 0.0
  222. # --- 🔍 核心相似度计算 ---
  223. cell_proc = self._preprocess_text_for_matching(cell_text)
  224. box_proc = self._preprocess_text_for_matching(box_text)
  225. # A. Token Sort (解决乱序)
  226. score_sort = fuzz.token_sort_ratio(cell_proc, box_proc)
  227. # B. Partial (解决截断/包含)
  228. score_partial = fuzz.partial_ratio(cell_norm, box_norm)
  229. # C. Subsequence (解决噪音插入)
  230. score_subseq = 0.0
  231. if len_cell > 5:
  232. score_subseq = self._calculate_subsequence_score(cell_norm, box_norm)
  233. # --- 🛡️ 深度防御逻辑 ---
  234. # 1. 短文本防御
  235. if score_partial > 80:
  236. import re
  237. has_content = lambda t: bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', t))
  238. # 纯符号防御
  239. if not has_content(cell_norm) and has_content(box_norm):
  240. if len_box > len_cell + 2: score_partial = 0.0
  241. # 微小碎片防御
  242. elif len_cell <= 2 and len_box > 8:
  243. score_partial = 0.0
  244. # 覆盖率防御
  245. else:
  246. coverage = len_cell / len_box if len_box > 0 else 0
  247. if coverage < 0.3 and score_sort < 45:
  248. score_partial = 0.0
  249. # 2. 子序列防御
  250. if score_subseq > 80:
  251. if len_box > len_cell * 1.5:
  252. import re
  253. if re.match(r'^[\d\-\:\.\s]+$', cell_norm) and len_cell < 12:
  254. score_subseq = 0.0
  255. # --- 📊 综合评分 ---
  256. final_score = max(score_sort, score_partial, score_subseq)
  257. # 精确匹配奖励
  258. if cell_norm == box_norm:
  259. final_score = 100.0
  260. elif cell_norm in box_norm:
  261. final_score = min(100, final_score + 5)
  262. return final_score
  263. def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
  264. table_bbox: Optional[List[int]],
  265. html: str) -> Tuple[List[Dict], List[int]]:
  266. """
  267. 筛选表格区域内的 paddle boxes
  268. 策略:
  269. 1. 如果有 table_bbox,使用边界框筛选(扩展边界)
  270. 2. 如果没有 table_bbox,通过内容匹配推断区域
  271. Args:
  272. paddle_boxes: paddle OCR 结果
  273. table_bbox: 表格边界框 [x1, y1, x2, y2]
  274. html: HTML 内容(用于内容验证)
  275. Returns:
  276. (筛选后的 boxes, 实际表格边界框)
  277. """
  278. if not paddle_boxes:
  279. return [], [0, 0, 0, 0]
  280. # 🎯 策略 1: 使用提供的 table_bbox(扩展边界)
  281. if table_bbox and len(table_bbox) == 4:
  282. x1, y1, x2, y2 = table_bbox
  283. # 扩展边界(考虑边框外的文本)
  284. margin = 20
  285. expanded_bbox = [
  286. max(0, x1 - margin),
  287. max(0, y1 - margin),
  288. x2 + margin,
  289. y2 + margin
  290. ]
  291. filtered = []
  292. for box in paddle_boxes:
  293. bbox = box['bbox']
  294. box_center_x = (bbox[0] + bbox[2]) / 2
  295. box_center_y = (bbox[1] + bbox[3]) / 2
  296. # 中心点在扩展区域内
  297. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  298. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  299. filtered.append(box)
  300. if filtered:
  301. # 计算实际边界框
  302. actual_bbox = [
  303. min(b['bbox'][0] for b in filtered),
  304. min(b['bbox'][1] for b in filtered),
  305. max(b['bbox'][2] for b in filtered),
  306. max(b['bbox'][3] for b in filtered)
  307. ]
  308. return filtered, actual_bbox
  309. # 🎯 策略 2: 通过内容匹配推断区域
  310. print(" ℹ️ 无 table_bbox,使用内容匹配推断表格区域...")
  311. # 提取 HTML 中的所有文本
  312. from bs4 import BeautifulSoup
  313. soup = BeautifulSoup(html, 'html.parser')
  314. html_texts = set()
  315. for cell in soup.find_all(['td', 'th']):
  316. text = cell.get_text(strip=True)
  317. if text:
  318. html_texts.add(self.text_matcher.normalize_text(text))
  319. if not html_texts:
  320. return [], [0, 0, 0, 0]
  321. # 找出与 HTML 内容匹配的 boxes
  322. matched_boxes = []
  323. for box in paddle_boxes:
  324. normalized_text = self.text_matcher.normalize_text(box['text'])
  325. # 检查是否匹配
  326. if any(normalized_text in ht or ht in normalized_text
  327. for ht in html_texts):
  328. matched_boxes.append(box)
  329. if not matched_boxes:
  330. # 🔑 降级:如果精确匹配失败,使用模糊匹配
  331. print(" ℹ️ 精确匹配失败,尝试模糊匹配...")
  332. for box in paddle_boxes:
  333. normalized_text = self.text_matcher.normalize_text(box['text'])
  334. for ht in html_texts:
  335. similarity = fuzz.partial_ratio(normalized_text, ht)
  336. if similarity >= 70: # 降低阈值
  337. matched_boxes.append(box)
  338. break
  339. if matched_boxes:
  340. # 计算边界框
  341. actual_bbox = [
  342. min(b['bbox'][0] for b in matched_boxes),
  343. min(b['bbox'][1] for b in matched_boxes),
  344. max(b['bbox'][2] for b in matched_boxes),
  345. max(b['bbox'][3] for b in matched_boxes)
  346. ]
  347. # 🔑 扩展边界,包含可能遗漏的文本
  348. margin = 30
  349. expanded_bbox = [
  350. max(0, actual_bbox[0] - margin),
  351. max(0, actual_bbox[1] - margin),
  352. actual_bbox[2] + margin,
  353. actual_bbox[3] + margin
  354. ]
  355. # 重新筛选(包含边界上的文本)
  356. final_filtered = []
  357. for box in paddle_boxes:
  358. bbox = box['bbox']
  359. box_center_x = (bbox[0] + bbox[2]) / 2
  360. box_center_y = (bbox[1] + bbox[3]) / 2
  361. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  362. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  363. final_filtered.append(box)
  364. return final_filtered, actual_bbox
  365. # 🔑 最后的降级:返回所有 boxes
  366. print(" ⚠️ 无法确定表格区域,使用所有 paddle boxes")
  367. if paddle_boxes:
  368. actual_bbox = [
  369. min(b['bbox'][0] for b in paddle_boxes),
  370. min(b['bbox'][1] for b in paddle_boxes),
  371. max(b['bbox'][2] for b in paddle_boxes),
  372. max(b['bbox'][3] for b in paddle_boxes)
  373. ]
  374. return paddle_boxes, actual_bbox
  375. return [], [0, 0, 0, 0]
  376. def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
  377. y_tolerance: int = 10,
  378. auto_correct_skew: bool = True,
  379. inclination_threshold: float = 0.3) -> List[Dict]:
  380. """
  381. 将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
  382. Args:
  383. paddle_boxes: Paddle OCR 文字框列表
  384. y_tolerance: Y 坐标容忍度(像素)
  385. auto_correct_skew: 是否自动校正倾斜
  386. Returns:
  387. 分组列表,每组包含 {'y_center': float, 'boxes': List[Dict]}
  388. """
  389. if not paddle_boxes:
  390. return []
  391. # 🎯 步骤 1: 检测并校正倾斜(使用 BBoxExtractor)
  392. if auto_correct_skew:
  393. rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
  394. if abs(rotation_angle) > inclination_threshold:
  395. max_x = max(box['bbox'][2] for box in paddle_boxes)
  396. max_y = max(box['bbox'][3] for box in paddle_boxes)
  397. image_size = (max_x, max_y)
  398. print(f" 🔧 校正倾斜角度: {rotation_angle:.2f}°")
  399. paddle_boxes = BBoxExtractor.correct_boxes_skew(
  400. paddle_boxes, -rotation_angle, image_size
  401. )
  402. # 🎯 步骤 2: 按校正后的 y 坐标分组
  403. boxes_with_y = []
  404. for box in paddle_boxes:
  405. bbox = box['bbox']
  406. y_center = (bbox[1] + bbox[3]) / 2
  407. boxes_with_y.append({
  408. 'y_center': y_center,
  409. 'box': box
  410. })
  411. # 按 y 坐标排序
  412. boxes_with_y.sort(key=lambda x: x['y_center'])
  413. groups = []
  414. current_group = None
  415. for item in boxes_with_y:
  416. if current_group is None:
  417. # 开始新组
  418. current_group = {
  419. 'y_center': item['y_center'],
  420. 'boxes': [item['box']]
  421. }
  422. else:
  423. if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
  424. current_group['boxes'].append(item['box'])
  425. # 更新组的中心
  426. current_group['y_center'] = sum(
  427. (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
  428. ) / len(current_group['boxes'])
  429. else:
  430. groups.append(current_group)
  431. current_group = {
  432. 'y_center': item['y_center'],
  433. 'boxes': [item['box']]
  434. }
  435. if current_group:
  436. groups.append(current_group)
  437. print(f" ✓ 分组完成: {len(groups)} 行")
  438. return groups
  439. def _match_html_rows_to_paddle_groups(self, html_rows: List,
  440. grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
  441. """
  442. 智能匹配 HTML 行与 paddle 分组(增强版 DP:支持跳过 HTML 行,防止链条断裂)
  443. """
  444. if not html_rows or not grouped_boxes:
  445. return {}
  446. mapping = {}
  447. # 🎯 策略 1: 数量相等,简单 1:1 映射
  448. if len(html_rows) == len(grouped_boxes):
  449. for i in range(len(html_rows)):
  450. mapping[i] = [i]
  451. return mapping
  452. # --- 准备数据 ---
  453. # 提取 HTML 文本
  454. html_row_texts = []
  455. for row in html_rows:
  456. cells = row.find_all(['td', 'th'])
  457. texts = [self.text_matcher.normalize_text(c.get_text(strip=True)) for c in cells]
  458. html_row_texts.append("".join(texts))
  459. # 预计算所有组的文本
  460. group_texts = []
  461. for group in grouped_boxes:
  462. boxes = group['boxes']
  463. texts = [self.text_matcher.normalize_text(b['text']) for b in boxes]
  464. group_texts.append("".join(texts))
  465. n_html = len(html_row_texts)
  466. n_paddle = len(grouped_boxes)
  467. # ⚡️ 优化 3: 预计算合并文本
  468. MAX_MERGE = 4
  469. merged_cache = {}
  470. for j in range(n_paddle):
  471. current_t = ""
  472. for k in range(MAX_MERGE):
  473. if j + k < n_paddle:
  474. current_t += group_texts[j + k]
  475. merged_cache[(j, k + 1)] = current_t
  476. else:
  477. break
  478. # --- 动态规划 (DP) ---
  479. # dp[i][j] 表示:HTML 前 i 行 (0..i) 匹配到了 Paddle 的前 j 组 (0..j) 的最大得分
  480. # 初始化为负无穷
  481. dp = np.full((n_html, n_paddle), -np.inf)
  482. # 记录路径:path[i][j] = (prev_j, start_j)
  483. # prev_j: 上一行结束的 paddle index
  484. # start_j: 当前行开始的 paddle index (因为一行可能对应多个组)
  485. path = {}
  486. # 参数配置
  487. SEARCH_WINDOW = 15 # 向前搜索窗口
  488. SKIP_PADDLE_PENALTY = 0.1 # 跳过 Paddle 组的惩罚
  489. SKIP_HTML_PENALTY = 0.3 # 关键:跳过 HTML 行的惩罚
  490. # --- 1. 初始化第一行 ---
  491. # 选项 A: 匹配 Paddle 组
  492. for end_j in range(min(n_paddle, SEARCH_WINDOW + MAX_MERGE)):
  493. for count in range(1, MAX_MERGE + 1):
  494. start_j = end_j - count + 1
  495. if start_j < 0: continue
  496. current_text = merged_cache.get((start_j, count), "")
  497. similarity = self._calculate_similarity(html_row_texts[0], current_text)
  498. penalty = start_j * SKIP_PADDLE_PENALTY
  499. score = similarity - penalty
  500. # 只有得分尚可才作为有效状态
  501. if score > 0.1:
  502. if score > dp[0][end_j]:
  503. dp[0][end_j] = score
  504. path[(0, end_j)] = (-1, start_j)
  505. # 选项 B: 第一行就跳过 (虽然少见,但为了完整性)
  506. # 如果第一行跳过,相当于没有消耗任何 paddle 组,状态难以用 dp[0][j] 表达
  507. # 这里简化处理,假设第一行必须匹配点什么,或者由后续行修正
  508. # --- 2. 状态转移 ---
  509. for i in range(1, n_html):
  510. html_text = html_row_texts[i]
  511. # 获取上一行所有有效位置
  512. valid_prev_indices = [j for j in range(n_paddle) if dp[i-1][j] > -np.inf]
  513. # 剪枝
  514. if len(valid_prev_indices) > 30:
  515. valid_prev_indices.sort(key=lambda j: dp[i-1][j], reverse=True)
  516. valid_prev_indices = valid_prev_indices[:30]
  517. # 🛡️ 关键修复:允许跳过当前 HTML 行 (继承上一行的状态)
  518. # 如果跳过当前行,Paddle 指针 j 不变
  519. for prev_j in valid_prev_indices:
  520. score_skip = dp[i-1][prev_j] - SKIP_HTML_PENALTY
  521. if score_skip > dp[i][prev_j]:
  522. dp[i][prev_j] = score_skip
  523. # 记录路径:start_j = prev_j + 1 表示没有消耗新组 (空范围)
  524. path[(i, prev_j)] = (prev_j, prev_j + 1)
  525. # 如果是空行,直接跳过计算,仅保留继承的状态
  526. if not html_text:
  527. continue
  528. # 正常匹配逻辑
  529. for prev_j in valid_prev_indices:
  530. prev_score = dp[i-1][prev_j]
  531. max_gap = min(SEARCH_WINDOW, n_paddle - prev_j - 1)
  532. for gap in range(max_gap):
  533. start_j = prev_j + 1 + gap
  534. for count in range(1, MAX_MERGE + 1):
  535. end_j = start_j + count - 1
  536. if end_j >= n_paddle: break
  537. current_text = merged_cache.get((start_j, count), "")
  538. # 长度预筛选
  539. h_len = len(html_text)
  540. p_len = len(current_text)
  541. if h_len > 10 and p_len < h_len * 0.2:
  542. continue
  543. similarity = self._calculate_similarity(html_text, current_text)
  544. # 计算惩罚
  545. # 1. 跳过惩罚 (gap)
  546. # 2. 长度惩罚 (防止过度合并)
  547. len_penalty = 0.0
  548. if h_len > 0:
  549. ratio = p_len / h_len
  550. if ratio > 2.0: len_penalty = (ratio - 2.0) * 0.2
  551. current_score = similarity - (gap * SKIP_PADDLE_PENALTY) - len_penalty
  552. # 只有正收益才转移
  553. if current_score > 0.1:
  554. total_score = prev_score + current_score
  555. if total_score > dp[i][end_j]:
  556. dp[i][end_j] = total_score
  557. path[(i, end_j)] = (prev_j, start_j)
  558. # --- 3. 回溯找最优路径 ---
  559. # 找到最后一行得分最高的结束位置
  560. best_end_j = -1
  561. max_score = -np.inf
  562. # 优先找最后一行,如果最后一行没匹配上,往前找
  563. found_end = False
  564. for i in range(n_html - 1, -1, -1):
  565. for j in range(n_paddle):
  566. if dp[i][j] > max_score:
  567. max_score = dp[i][j]
  568. best_end_j = j
  569. best_last_row = i
  570. if max_score > -np.inf:
  571. found_end = True
  572. break
  573. mapping = {}
  574. used_groups = set()
  575. if found_end:
  576. curr_i = best_last_row
  577. curr_j = best_end_j
  578. while curr_i >= 0:
  579. if (curr_i, curr_j) in path:
  580. prev_j, start_j = path[(curr_i, curr_j)]
  581. # 如果 start_j <= curr_j,说明消耗了 Paddle 组
  582. # 如果 start_j > curr_j,说明是跳过 HTML 行 (空范围)
  583. if start_j <= curr_j:
  584. indices = list(range(start_j, curr_j + 1))
  585. mapping[curr_i] = indices
  586. used_groups.update(indices)
  587. else:
  588. mapping[curr_i] = []
  589. curr_j = prev_j
  590. curr_i -= 1
  591. else:
  592. break
  593. # 填补未匹配的行
  594. for i in range(n_html):
  595. if i not in mapping:
  596. mapping[i] = []
  597. # --- 4. 后处理:未匹配组的归属 (Orphans) ---
  598. unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
  599. if unused_groups:
  600. print(f" ℹ️ 发现 {len(unused_groups)} 个未匹配的 paddle 组: {unused_groups}")
  601. for unused_idx in unused_groups:
  602. unused_group = grouped_boxes[unused_idx]
  603. unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
  604. unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
  605. above_idx = None
  606. below_idx = None
  607. above_distance = float('inf')
  608. below_distance = float('inf')
  609. for i in range(unused_idx - 1, -1, -1):
  610. if i in used_groups:
  611. above_idx = i
  612. above_group = grouped_boxes[i]
  613. max_y_box = max(above_group['boxes'], key=lambda b: b['bbox'][3])
  614. above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
  615. above_distance = abs(unused_y_min - above_y_center)
  616. break
  617. for i in range(unused_idx + 1, len(grouped_boxes)):
  618. if i in used_groups:
  619. below_idx = i
  620. below_group = grouped_boxes[i]
  621. min_y_box = min(below_group['boxes'], key=lambda b: b['bbox'][1])
  622. below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
  623. below_distance = abs(below_y_center - unused_y_max)
  624. break
  625. closest_used_idx = None
  626. merge_direction = ""
  627. if above_idx is not None and below_idx is not None:
  628. if above_distance < below_distance:
  629. closest_used_idx = above_idx
  630. merge_direction = "上方"
  631. else:
  632. closest_used_idx = below_idx
  633. merge_direction = "下方"
  634. elif above_idx is not None:
  635. closest_used_idx = above_idx
  636. merge_direction = "上方"
  637. elif below_idx is not None:
  638. closest_used_idx = below_idx
  639. merge_direction = "下方"
  640. if closest_used_idx is not None:
  641. target_html_row = None
  642. for html_row_idx, group_indices in mapping.items():
  643. if closest_used_idx in group_indices:
  644. target_html_row = html_row_idx
  645. break
  646. if target_html_row is not None:
  647. if unused_idx not in mapping[target_html_row]:
  648. mapping[target_html_row].append(unused_idx)
  649. mapping[target_html_row].sort()
  650. print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}({merge_direction}行)")
  651. used_groups.add(unused_idx)
  652. # 🔑 策略 4: 第三遍 - 按 y 坐标排序每行的组索引
  653. for row_idx in mapping:
  654. if mapping[row_idx]:
  655. mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
  656. return mapping
  657. def _calculate_similarity(self, text1: str, text2: str) -> float:
  658. """
  659. 计算两个文本的相似度,结合字符覆盖率和序列相似度 (性能优化版)
  660. """
  661. if not text1 or not text2:
  662. return 0.0
  663. len1, len2 = len(text1), len(text2)
  664. # ⚡️ 优化 1: 长度快速检查
  665. # 如果长度差异过大(例如一个 50 字符,一个 2 字符),直接认为不匹配
  666. if len1 > 0 and len2 > 0:
  667. min_l, max_l = min(len1, len2), max(len1, len2)
  668. if max_l > 10 and min_l / max_l < 0.2:
  669. return 0.0
  670. # 1. 字符覆盖率 (Character Overlap)
  671. from collections import Counter
  672. c1 = Counter(text1)
  673. c2 = Counter(text2)
  674. intersection = c1 & c2
  675. overlap_count = sum(intersection.values())
  676. coverage = overlap_count / len1 if len1 > 0 else 0
  677. # ⚡️ 优化 2: 覆盖率低时跳过昂贵的 fuzz 计算
  678. # 如果字符重叠率低于 30%,说明内容基本不相关,没必要算序列相似度
  679. if coverage < 0.3:
  680. return coverage * 0.7
  681. # 2. 序列相似度 (Sequence Similarity)
  682. # 使用 token_sort_ratio 来容忍一定的乱序
  683. seq_score = fuzz.token_sort_ratio(text1, text2) / 100.0
  684. return (coverage * 0.7) + (seq_score * 0.3)
  685. def _preprocess_text_for_matching(self, text: str) -> str:
  686. """
  687. 预处理文本:在不同类型的字符(如中文和数字/英文)之间插入空格,
  688. 以便于 token_sort_ratio 更准确地进行分词和匹配。
  689. """
  690. if not text:
  691. return ""
  692. import re
  693. # 1. 在中文和非中文(数字/字母)之间插入空格
  694. # 例如: "2024年" -> "2024 年", "ID号码123" -> "ID号码 123"
  695. text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z0-9])', r'\1 \2', text)
  696. text = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fa5])', r'\1 \2', text)
  697. return text
  698. def _calculate_subsequence_score(self, target: str, source: str) -> float:
  699. """
  700. 计算子序列匹配得分 (解决 OCR 噪音插入问题)
  701. 例如: Target="12345", Source="12(date)34(time)5" -> Score close to 100
  702. """
  703. # 1. 仅保留字母和数字,忽略符号干扰
  704. t_clean = "".join(c for c in target if c.isalnum())
  705. s_clean = "".join(c for c in source if c.isalnum())
  706. if not t_clean or not s_clean:
  707. return 0.0
  708. # 2. 贪婪匹配子序列
  709. t_idx, s_idx = 0, 0
  710. matches = 0
  711. while t_idx < len(t_clean) and s_idx < len(s_clean):
  712. if t_clean[t_idx] == s_clean[s_idx]:
  713. matches += 1
  714. t_idx += 1
  715. s_idx += 1
  716. else:
  717. # 跳过 source 中的噪音字符
  718. s_idx += 1
  719. # 3. 计算得分
  720. match_rate = matches / len(t_clean)
  721. # 如果匹配率太低,直接返回
  722. if match_rate < 0.8:
  723. return match_rate * 100
  724. # 4. 噪音惩罚 (防止 Target="1", Source="123456789" 这种误判)
  725. # 计算噪音长度
  726. noise_len = len(s_clean) - matches
  727. # 允许一定比例的噪音 (例如日期时间插入,通常占总长度的 30%-50%)
  728. # 如果噪音长度超过目标长度的 60%,开始扣分
  729. penalty = 0
  730. if noise_len > len(t_clean) * 0.6:
  731. excess_noise = noise_len - (len(t_clean) * 0.6)
  732. penalty = excess_noise * 0.5 # 每多一个噪音字符扣 0.5 分
  733. penalty = min(penalty, 20) # 最多扣 20 分
  734. final_score = (match_rate * 100) - penalty
  735. return max(0, final_score)