table_cell_matcher.py 45 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111
  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from rapidfuzz import fuzz
  10. except ImportError:
  11. from fuzzywuzzy import fuzz
  12. try:
  13. from .text_matcher import TextMatcher
  14. from .bbox_extractor import BBoxExtractor
  15. except ImportError:
  16. from text_matcher import TextMatcher
  17. from bbox_extractor import BBoxExtractor
  18. class TableCellMatcher:
  19. """表格单元格匹配器"""
  20. def __init__(self, text_matcher: TextMatcher,
  21. x_tolerance: int = 3,
  22. y_tolerance: int = 10):
  23. """
  24. Args:
  25. text_matcher: 文本匹配器
  26. x_tolerance: X轴容差(用于列边界判断)
  27. y_tolerance: Y轴容差(用于行分组)
  28. """
  29. self.text_matcher = text_matcher
  30. self.x_tolerance = x_tolerance
  31. self.y_tolerance = y_tolerance
  32. def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  33. start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
  34. """
  35. 为 HTML 表格添加 bbox 信息(优化版:先筛选表格区域)
  36. 策略:
  37. 1. 根据 table_bbox 筛选出表格区域内的 paddle_text_boxes
  38. 2. 将筛选后的 boxes 按行分组
  39. 3. 智能匹配 HTML 行与 paddle 行组
  40. 4. 在匹配的组内查找单元格
  41. Args:
  42. html: HTML 表格
  43. paddle_text_boxes: 全部 paddle OCR 结果
  44. start_pointer: 开始位置
  45. table_bbox: 表格边界框 [x1, y1, x2, y2]
  46. """
  47. soup = BeautifulSoup(html, 'html.parser')
  48. cells = []
  49. # 🔑 第一步:筛选表格区域内的 paddle boxes
  50. table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
  51. paddle_text_boxes[start_pointer:],
  52. table_bbox,
  53. html
  54. )
  55. if not table_region_boxes:
  56. print(f"⚠️ 未在表格区域找到 paddle boxes")
  57. return str(soup), cells, start_pointer
  58. print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
  59. print(f" 边界: {actual_table_bbox}")
  60. # 🔑 第二步:将表格区域的 boxes 按行分组
  61. grouped_boxes = self._group_paddle_boxes_by_rows(
  62. table_region_boxes,
  63. y_tolerance=self.y_tolerance,
  64. auto_correct_skew=True
  65. )
  66. # 🔑 第三步:在每组内按 x 坐标排序
  67. for group in grouped_boxes:
  68. group['boxes'].sort(key=lambda x: x['bbox'][0])
  69. grouped_boxes.sort(key=lambda g: g['y_center'])
  70. print(f" 分组: {len(grouped_boxes)} 行")
  71. # 🔑 第四步:智能匹配 HTML 行与 paddle 行组
  72. html_rows = soup.find_all('tr')
  73. row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
  74. print(f" HTML行: {len(html_rows)} 行")
  75. print(f" 映射: {len([v for v in row_mapping.values() if v])} 个有效映射")
  76. # 🔑 第五步:遍历 HTML 表格,使用映射关系查找
  77. for row_idx, row in enumerate(html_rows):
  78. group_indices = row_mapping.get(row_idx, [])
  79. if not group_indices:
  80. continue
  81. # 合并多个组的 boxes
  82. current_boxes = []
  83. for group_idx in group_indices:
  84. if group_idx < len(grouped_boxes):
  85. current_boxes.extend(grouped_boxes[group_idx]['boxes'])
  86. current_boxes.sort(key=lambda x: x['bbox'][0])
  87. # 🎯 关键改进:提取 HTML 单元格并预先确定列边界
  88. html_cells = row.find_all(['td', 'th'])
  89. if not html_cells:
  90. continue
  91. # 🔑 预估列边界(基于 x 坐标分布)
  92. col_boundaries = self._estimate_column_boundaries(
  93. current_boxes,
  94. len(html_cells)
  95. )
  96. print(f" 行 {row_idx + 1}: {len(html_cells)} 列,边界: {col_boundaries}")
  97. # 🎯 关键改进:顺序指针匹配
  98. box_pointer = 0 # 当前行的 boxes 指针
  99. for col_idx, cell in enumerate(html_cells):
  100. cell_text = cell.get_text(strip=True)
  101. if not cell_text:
  102. continue
  103. # 🔑 从当前指针开始匹配
  104. matched_result = self._match_cell_sequential(
  105. cell_text,
  106. current_boxes,
  107. col_boundaries,
  108. box_pointer
  109. )
  110. if matched_result:
  111. merged_bbox = matched_result['bbox']
  112. merged_text = matched_result['text']
  113. cell['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
  114. cell['data-score'] = f"{matched_result['score']:.4f}"
  115. cell['data-paddle-indices'] = str(matched_result['paddle_indices'])
  116. cells.append({
  117. 'type': 'table_cell',
  118. 'text': cell_text,
  119. 'matched_text': merged_text,
  120. 'bbox': merged_bbox,
  121. 'row': row_idx + 1,
  122. 'col': col_idx + 1,
  123. 'score': matched_result['score'],
  124. 'paddle_bbox_indices': matched_result['paddle_indices']
  125. })
  126. # 标记已使用
  127. for box in matched_result['used_boxes']:
  128. box['used'] = True
  129. # 🎯 移动指针到最后使用的 box 之后
  130. box_pointer = matched_result['last_used_index'] + 1
  131. print(f" 列 {col_idx + 1}: '{cell_text[:20]}...' 匹配 {len(matched_result['used_boxes'])} 个box (指针: {box_pointer})")
  132. # 计算新的指针位置
  133. used_count = sum(1 for box in table_region_boxes if box.get('used'))
  134. new_pointer = start_pointer + used_count
  135. print(f" 匹配: {len(cells)} 个单元格")
  136. return str(soup), cells, new_pointer
  137. def _estimate_column_boundaries(self, boxes: List[Dict],
  138. num_cols: int) -> List[Tuple[int, int]]:
  139. """
  140. 估算列边界(改进版:处理同列多文本框)
  141. Args:
  142. boxes: 当前行的所有 boxes(已按 x 排序)
  143. num_cols: HTML 表格的列数
  144. Returns:
  145. 列边界列表 [(x_start, x_end), ...]
  146. """
  147. if not boxes:
  148. return []
  149. # 🔑 关键改进:先按 x 坐标聚类(合并同列的多个文本框)
  150. x_clusters = self._cluster_boxes_by_x(boxes, x_tolerance=self.x_tolerance)
  151. print(f" X聚类: {len(boxes)} 个boxes -> {len(x_clusters)} 个列簇")
  152. # 获取所有 x 坐标范围
  153. x_min = min(cluster['x_min'] for cluster in x_clusters)
  154. x_max = max(cluster['x_max'] for cluster in x_clusters)
  155. # 🎯 策略 1: 如果聚类数量<=列数接近
  156. if len(x_clusters) <= num_cols:
  157. # 直接使用聚类边界
  158. boundaries = [(cluster['x_min'], cluster['x_max'])
  159. for cluster in x_clusters]
  160. return boundaries
  161. # 🎯 策略 2: 聚类数多于列数(某些列有多个文本簇)
  162. if len(x_clusters) > num_cols:
  163. print(f" ℹ️ 聚类数 {len(x_clusters)} > 列数 {num_cols},合并相近簇")
  164. # 合并相近的簇
  165. merged_clusters = self._merge_close_clusters(x_clusters, num_cols)
  166. boundaries = [(cluster['x_min'], cluster['x_max'])
  167. for cluster in merged_clusters]
  168. return boundaries
  169. return []
  170. def _cluster_boxes_by_x(self, boxes: List[Dict],
  171. x_tolerance: int = 3) -> List[Dict]:
  172. """
  173. 按 x 坐标聚类(合并同列的多个文本框)
  174. Args:
  175. boxes: 文本框列表
  176. x_tolerance: X坐标容忍度
  177. Returns:
  178. 聚类列表 [{'x_min': int, 'x_max': int, 'boxes': List[Dict]}, ...]
  179. """
  180. if not boxes:
  181. return []
  182. # 按左边界 x 坐标排序
  183. sorted_boxes = sorted(boxes, key=lambda b: b['bbox'][0])
  184. clusters = []
  185. current_cluster = None
  186. for box in sorted_boxes:
  187. bbox = box['bbox']
  188. x_start = bbox[0]
  189. x_end = bbox[2]
  190. if current_cluster is None:
  191. # 开始新簇
  192. current_cluster = {
  193. 'x_min': x_start,
  194. 'x_max': x_end,
  195. 'boxes': [box]
  196. }
  197. else:
  198. # 🔑 检查是否属于当前簇(修正后的逻辑)
  199. # 1. x 坐标有重叠:x_start <= current_x_max 且 x_end >= current_x_min
  200. # 2. 或者距离在容忍度内
  201. has_overlap = (x_start <= current_cluster['x_max'] and
  202. x_end >= current_cluster['x_min'])
  203. is_close = abs(x_start - current_cluster['x_max']) <= x_tolerance
  204. if has_overlap or is_close:
  205. # 合并到当前簇
  206. current_cluster['boxes'].append(box)
  207. current_cluster['x_min'] = min(current_cluster['x_min'], x_start)
  208. current_cluster['x_max'] = max(current_cluster['x_max'], x_end)
  209. else:
  210. # 保存当前簇,开始新簇
  211. clusters.append(current_cluster)
  212. current_cluster = {
  213. 'x_min': x_start,
  214. 'x_max': x_end,
  215. 'boxes': [box]
  216. }
  217. # 添加最后一簇
  218. if current_cluster:
  219. clusters.append(current_cluster)
  220. return clusters
  221. def _merge_close_clusters(self, clusters: List[Dict],
  222. target_count: int) -> List[Dict]:
  223. """
  224. 合并相近的簇,直到数量等于目标列数
  225. Args:
  226. clusters: 聚类列表
  227. target_count: 目标列数
  228. Returns:
  229. 合并后的聚类列表
  230. """
  231. if len(clusters) <= target_count:
  232. return clusters
  233. # 复制一份,避免修改原数据
  234. working_clusters = [c.copy() for c in clusters]
  235. while len(working_clusters) > target_count:
  236. # 找到距离最近的两个簇
  237. min_distance = float('inf')
  238. merge_idx = 0
  239. for i in range(len(working_clusters) - 1):
  240. distance = working_clusters[i + 1]['x_min'] - working_clusters[i]['x_max']
  241. if distance < min_distance:
  242. min_distance = distance
  243. merge_idx = i
  244. # 合并
  245. cluster1 = working_clusters[merge_idx]
  246. cluster2 = working_clusters[merge_idx + 1]
  247. merged_cluster = {
  248. 'x_min': cluster1['x_min'],
  249. 'x_max': cluster2['x_max'],
  250. 'boxes': cluster1['boxes'] + cluster2['boxes']
  251. }
  252. # 替换
  253. working_clusters[merge_idx] = merged_cluster
  254. working_clusters.pop(merge_idx + 1)
  255. return working_clusters
  256. def _get_boxes_in_column(self, boxes: List[Dict],
  257. boundaries: List[Tuple[int, int]],
  258. col_idx: int) -> List[Dict]:
  259. """
  260. 获取指定列范围内的 boxes(改进版:包含重叠)
  261. Args:
  262. boxes: 当前行的所有 boxes
  263. boundaries: 列边界
  264. col_idx: 列索引
  265. Returns:
  266. 该列的 boxes
  267. """
  268. if col_idx >= len(boundaries):
  269. return []
  270. x_start, x_end = boundaries[col_idx]
  271. col_boxes = []
  272. for box in boxes:
  273. bbox = box['bbox']
  274. box_x_start = bbox[0]
  275. box_x_end = bbox[2]
  276. # 🔑 改进:检查是否有重叠(不只是中心点)
  277. overlap = not (box_x_start > x_end or box_x_end < x_start)
  278. if overlap:
  279. col_boxes.append(box)
  280. return col_boxes
  281. def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
  282. table_bbox: Optional[List[int]],
  283. html: str) -> Tuple[List[Dict], List[int]]:
  284. """
  285. 筛选表格区域内的 paddle boxes
  286. 策略:
  287. 1. 如果有 table_bbox,使用边界框筛选(扩展边界)
  288. 2. 如果没有 table_bbox,通过内容匹配推断区域
  289. Args:
  290. paddle_boxes: paddle OCR 结果
  291. table_bbox: 表格边界框 [x1, y1, x2, y2]
  292. html: HTML 内容(用于内容验证)
  293. Returns:
  294. (筛选后的 boxes, 实际表格边界框)
  295. """
  296. if not paddle_boxes:
  297. return [], [0, 0, 0, 0]
  298. # 🎯 策略 1: 使用提供的 table_bbox(扩展边界)
  299. if table_bbox and len(table_bbox) == 4:
  300. x1, y1, x2, y2 = table_bbox
  301. # 扩展边界(考虑边框外的文本)
  302. margin = 20
  303. expanded_bbox = [
  304. max(0, x1 - margin),
  305. max(0, y1 - margin),
  306. x2 + margin,
  307. y2 + margin
  308. ]
  309. filtered = []
  310. for box in paddle_boxes:
  311. bbox = box['bbox']
  312. box_center_x = (bbox[0] + bbox[2]) / 2
  313. box_center_y = (bbox[1] + bbox[3]) / 2
  314. # 中心点在扩展区域内
  315. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  316. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  317. filtered.append(box)
  318. if filtered:
  319. # 计算实际边界框
  320. actual_bbox = [
  321. min(b['bbox'][0] for b in filtered),
  322. min(b['bbox'][1] for b in filtered),
  323. max(b['bbox'][2] for b in filtered),
  324. max(b['bbox'][3] for b in filtered)
  325. ]
  326. return filtered, actual_bbox
  327. # 🎯 策略 2: 通过内容匹配推断区域
  328. print(" ℹ️ 无 table_bbox,使用内容匹配推断表格区域...")
  329. # 提取 HTML 中的所有文本
  330. from bs4 import BeautifulSoup
  331. soup = BeautifulSoup(html, 'html.parser')
  332. html_texts = set()
  333. for cell in soup.find_all(['td', 'th']):
  334. text = cell.get_text(strip=True)
  335. if text:
  336. html_texts.add(self.text_matcher.normalize_text(text))
  337. if not html_texts:
  338. return [], [0, 0, 0, 0]
  339. # 找出与 HTML 内容匹配的 boxes
  340. matched_boxes = []
  341. for box in paddle_boxes:
  342. normalized_text = self.text_matcher.normalize_text(box['text'])
  343. # 检查是否匹配
  344. if any(normalized_text in ht or ht in normalized_text
  345. for ht in html_texts):
  346. matched_boxes.append(box)
  347. if not matched_boxes:
  348. # 🔑 降级:如果精确匹配失败,使用模糊匹配
  349. print(" ℹ️ 精确匹配失败,尝试模糊匹配...")
  350. for box in paddle_boxes:
  351. normalized_text = self.text_matcher.normalize_text(box['text'])
  352. for ht in html_texts:
  353. similarity = fuzz.partial_ratio(normalized_text, ht)
  354. if similarity >= 70: # 降低阈值
  355. matched_boxes.append(box)
  356. break
  357. if matched_boxes:
  358. # 计算边界框
  359. actual_bbox = [
  360. min(b['bbox'][0] for b in matched_boxes),
  361. min(b['bbox'][1] for b in matched_boxes),
  362. max(b['bbox'][2] for b in matched_boxes),
  363. max(b['bbox'][3] for b in matched_boxes)
  364. ]
  365. # 🔑 扩展边界,包含可能遗漏的文本
  366. margin = 30
  367. expanded_bbox = [
  368. max(0, actual_bbox[0] - margin),
  369. max(0, actual_bbox[1] - margin),
  370. actual_bbox[2] + margin,
  371. actual_bbox[3] + margin
  372. ]
  373. # 重新筛选(包含边界上的文本)
  374. final_filtered = []
  375. for box in paddle_boxes:
  376. bbox = box['bbox']
  377. box_center_x = (bbox[0] + bbox[2]) / 2
  378. box_center_y = (bbox[1] + bbox[3]) / 2
  379. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  380. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  381. final_filtered.append(box)
  382. return final_filtered, actual_bbox
  383. # 🔑 最后的降级:返回所有 boxes
  384. print(" ⚠️ 无法确定表格区域,使用所有 paddle boxes")
  385. if paddle_boxes:
  386. actual_bbox = [
  387. min(b['bbox'][0] for b in paddle_boxes),
  388. min(b['bbox'][1] for b in paddle_boxes),
  389. max(b['bbox'][2] for b in paddle_boxes),
  390. max(b['bbox'][3] for b in paddle_boxes)
  391. ]
  392. return paddle_boxes, actual_bbox
  393. return [], [0, 0, 0, 0]
  394. def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
  395. y_tolerance: int = 10,
  396. auto_correct_skew: bool = True) -> List[Dict]:
  397. """
  398. 将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
  399. Args:
  400. paddle_boxes: Paddle OCR 文字框列表
  401. y_tolerance: Y 坐标容忍度(像素)
  402. auto_correct_skew: 是否自动校正倾斜
  403. Returns:
  404. 分组列表,每组包含 {'y_center': float, 'boxes': List[Dict]}
  405. """
  406. if not paddle_boxes:
  407. return []
  408. # 🎯 步骤 1: 检测并校正倾斜(使用 BBoxExtractor)
  409. if auto_correct_skew:
  410. rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
  411. if abs(rotation_angle) > 0.5:
  412. max_x = max(box['bbox'][2] for box in paddle_boxes)
  413. max_y = max(box['bbox'][3] for box in paddle_boxes)
  414. image_size = (max_x, max_y)
  415. print(f" 🔧 校正倾斜角度: {rotation_angle:.2f}°")
  416. paddle_boxes = BBoxExtractor.correct_boxes_skew(
  417. paddle_boxes, -rotation_angle, image_size
  418. )
  419. # 🎯 步骤 2: 按校正后的 y 坐标分组
  420. boxes_with_y = []
  421. for box in paddle_boxes:
  422. bbox = box['bbox']
  423. y_center = (bbox[1] + bbox[3]) / 2
  424. boxes_with_y.append({
  425. 'y_center': y_center,
  426. 'box': box
  427. })
  428. # 按 y 坐标排序
  429. boxes_with_y.sort(key=lambda x: x['y_center'])
  430. groups = []
  431. current_group = None
  432. for item in boxes_with_y:
  433. if current_group is None:
  434. # 开始新组
  435. current_group = {
  436. 'y_center': item['y_center'],
  437. 'boxes': [item['box']]
  438. }
  439. else:
  440. if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
  441. current_group['boxes'].append(item['box'])
  442. # 更新组的中心
  443. current_group['y_center'] = sum(
  444. (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
  445. ) / len(current_group['boxes'])
  446. else:
  447. groups.append(current_group)
  448. current_group = {
  449. 'y_center': item['y_center'],
  450. 'boxes': [item['box']]
  451. }
  452. if current_group:
  453. groups.append(current_group)
  454. print(f" ✓ 分组完成: {len(groups)} 行")
  455. return groups
  456. def _match_html_rows_to_paddle_groups(self, html_rows: List,
  457. grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
  458. """
  459. 智能匹配 HTML 行与 paddle 分组(增强版 DP:支持跳过 HTML 行,防止链条断裂)
  460. """
  461. if not html_rows or not grouped_boxes:
  462. return {}
  463. mapping = {}
  464. # 🎯 策略 1: 数量相等,简单 1:1 映射
  465. if len(html_rows) == len(grouped_boxes):
  466. for i in range(len(html_rows)):
  467. mapping[i] = [i]
  468. return mapping
  469. # --- 准备数据 ---
  470. # 提取 HTML 文本
  471. html_row_texts = []
  472. for row in html_rows:
  473. cells = row.find_all(['td', 'th'])
  474. texts = [self.text_matcher.normalize_text(c.get_text(strip=True)) for c in cells]
  475. html_row_texts.append("".join(texts))
  476. # 预计算所有组的文本
  477. group_texts = []
  478. for group in grouped_boxes:
  479. boxes = group['boxes']
  480. texts = [self.text_matcher.normalize_text(b['text']) for b in boxes]
  481. group_texts.append("".join(texts))
  482. n_html = len(html_row_texts)
  483. n_paddle = len(grouped_boxes)
  484. # ⚡️ 优化 3: 预计算合并文本
  485. MAX_MERGE = 4
  486. merged_cache = {}
  487. for j in range(n_paddle):
  488. current_t = ""
  489. for k in range(MAX_MERGE):
  490. if j + k < n_paddle:
  491. current_t += group_texts[j + k]
  492. merged_cache[(j, k + 1)] = current_t
  493. else:
  494. break
  495. # --- 动态规划 (DP) ---
  496. # dp[i][j] 表示:HTML 前 i 行 (0..i) 匹配到了 Paddle 的前 j 组 (0..j) 的最大得分
  497. # 初始化为负无穷
  498. dp = np.full((n_html, n_paddle), -np.inf)
  499. # 记录路径:path[i][j] = (prev_j, start_j)
  500. # prev_j: 上一行结束的 paddle index
  501. # start_j: 当前行开始的 paddle index (因为一行可能对应多个组)
  502. path = {}
  503. # 参数配置
  504. SEARCH_WINDOW = 15 # 向前搜索窗口
  505. SKIP_PADDLE_PENALTY = 0.1 # 跳过 Paddle 组的惩罚
  506. SKIP_HTML_PENALTY = 0.3 # 关键:跳过 HTML 行的惩罚
  507. # --- 1. 初始化第一行 ---
  508. # 选项 A: 匹配 Paddle 组
  509. for end_j in range(min(n_paddle, SEARCH_WINDOW + MAX_MERGE)):
  510. for count in range(1, MAX_MERGE + 1):
  511. start_j = end_j - count + 1
  512. if start_j < 0: continue
  513. current_text = merged_cache.get((start_j, count), "")
  514. similarity = self._calculate_similarity(html_row_texts[0], current_text)
  515. penalty = start_j * SKIP_PADDLE_PENALTY
  516. score = similarity - penalty
  517. # 只有得分尚可才作为有效状态
  518. if score > 0.1:
  519. if score > dp[0][end_j]:
  520. dp[0][end_j] = score
  521. path[(0, end_j)] = (-1, start_j)
  522. # 选项 B: 第一行就跳过 (虽然少见,但为了完整性)
  523. # 如果第一行跳过,相当于没有消耗任何 paddle 组,状态难以用 dp[0][j] 表达
  524. # 这里简化处理,假设第一行必须匹配点什么,或者由后续行修正
  525. # --- 2. 状态转移 ---
  526. for i in range(1, n_html):
  527. html_text = html_row_texts[i]
  528. # 获取上一行所有有效位置
  529. valid_prev_indices = [j for j in range(n_paddle) if dp[i-1][j] > -np.inf]
  530. # 剪枝
  531. if len(valid_prev_indices) > 30:
  532. valid_prev_indices.sort(key=lambda j: dp[i-1][j], reverse=True)
  533. valid_prev_indices = valid_prev_indices[:30]
  534. # 🛡️ 关键修复:允许跳过当前 HTML 行 (继承上一行的状态)
  535. # 如果跳过当前行,Paddle 指针 j 不变
  536. for prev_j in valid_prev_indices:
  537. score_skip = dp[i-1][prev_j] - SKIP_HTML_PENALTY
  538. if score_skip > dp[i][prev_j]:
  539. dp[i][prev_j] = score_skip
  540. # 记录路径:start_j = prev_j + 1 表示没有消耗新组 (空范围)
  541. path[(i, prev_j)] = (prev_j, prev_j + 1)
  542. # 如果是空行,直接跳过计算,仅保留继承的状态
  543. if not html_text:
  544. continue
  545. # 正常匹配逻辑
  546. for prev_j in valid_prev_indices:
  547. prev_score = dp[i-1][prev_j]
  548. max_gap = min(SEARCH_WINDOW, n_paddle - prev_j - 1)
  549. for gap in range(max_gap):
  550. start_j = prev_j + 1 + gap
  551. for count in range(1, MAX_MERGE + 1):
  552. end_j = start_j + count - 1
  553. if end_j >= n_paddle: break
  554. current_text = merged_cache.get((start_j, count), "")
  555. # 长度预筛选
  556. h_len = len(html_text)
  557. p_len = len(current_text)
  558. if h_len > 10 and p_len < h_len * 0.2:
  559. continue
  560. similarity = self._calculate_similarity(html_text, current_text)
  561. # 计算惩罚
  562. # 1. 跳过惩罚 (gap)
  563. # 2. 长度惩罚 (防止过度合并)
  564. len_penalty = 0.0
  565. if h_len > 0:
  566. ratio = p_len / h_len
  567. if ratio > 2.0: len_penalty = (ratio - 2.0) * 0.2
  568. current_score = similarity - (gap * SKIP_PADDLE_PENALTY) - len_penalty
  569. # 只有正收益才转移
  570. if current_score > 0.1:
  571. total_score = prev_score + current_score
  572. if total_score > dp[i][end_j]:
  573. dp[i][end_j] = total_score
  574. path[(i, end_j)] = (prev_j, start_j)
  575. # --- 3. 回溯找最优路径 ---
  576. # 找到最后一行得分最高的结束位置
  577. best_end_j = -1
  578. max_score = -np.inf
  579. # 优先找最后一行,如果最后一行没匹配上,往前找
  580. found_end = False
  581. for i in range(n_html - 1, -1, -1):
  582. for j in range(n_paddle):
  583. if dp[i][j] > max_score:
  584. max_score = dp[i][j]
  585. best_end_j = j
  586. best_last_row = i
  587. if max_score > -np.inf:
  588. found_end = True
  589. break
  590. mapping = {}
  591. used_groups = set()
  592. if found_end:
  593. curr_i = best_last_row
  594. curr_j = best_end_j
  595. while curr_i >= 0:
  596. if (curr_i, curr_j) in path:
  597. prev_j, start_j = path[(curr_i, curr_j)]
  598. # 如果 start_j <= curr_j,说明消耗了 Paddle 组
  599. # 如果 start_j > curr_j,说明是跳过 HTML 行 (空范围)
  600. if start_j <= curr_j:
  601. indices = list(range(start_j, curr_j + 1))
  602. mapping[curr_i] = indices
  603. used_groups.update(indices)
  604. else:
  605. mapping[curr_i] = []
  606. curr_j = prev_j
  607. curr_i -= 1
  608. else:
  609. break
  610. # 填补未匹配的行
  611. for i in range(n_html):
  612. if i not in mapping:
  613. mapping[i] = []
  614. # --- 4. 后处理:未匹配组的归属 (Orphans) ---
  615. unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
  616. if unused_groups:
  617. print(f" ℹ️ 发现 {len(unused_groups)} 个未匹配的 paddle 组: {unused_groups}")
  618. for unused_idx in unused_groups:
  619. unused_group = grouped_boxes[unused_idx]
  620. unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
  621. unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
  622. above_idx = None
  623. below_idx = None
  624. above_distance = float('inf')
  625. below_distance = float('inf')
  626. for i in range(unused_idx - 1, -1, -1):
  627. if i in used_groups:
  628. above_idx = i
  629. above_group = grouped_boxes[i]
  630. max_y_box = max(above_group['boxes'], key=lambda b: b['bbox'][3])
  631. above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
  632. above_distance = abs(unused_y_min - above_y_center)
  633. break
  634. for i in range(unused_idx + 1, len(grouped_boxes)):
  635. if i in used_groups:
  636. below_idx = i
  637. below_group = grouped_boxes[i]
  638. min_y_box = min(below_group['boxes'], key=lambda b: b['bbox'][1])
  639. below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
  640. below_distance = abs(below_y_center - unused_y_max)
  641. break
  642. closest_used_idx = None
  643. merge_direction = ""
  644. if above_idx is not None and below_idx is not None:
  645. if above_distance < below_distance:
  646. closest_used_idx = above_idx
  647. merge_direction = "上方"
  648. else:
  649. closest_used_idx = below_idx
  650. merge_direction = "下方"
  651. elif above_idx is not None:
  652. closest_used_idx = above_idx
  653. merge_direction = "上方"
  654. elif below_idx is not None:
  655. closest_used_idx = below_idx
  656. merge_direction = "下方"
  657. if closest_used_idx is not None:
  658. target_html_row = None
  659. for html_row_idx, group_indices in mapping.items():
  660. if closest_used_idx in group_indices:
  661. target_html_row = html_row_idx
  662. break
  663. if target_html_row is not None:
  664. if unused_idx not in mapping[target_html_row]:
  665. mapping[target_html_row].append(unused_idx)
  666. mapping[target_html_row].sort()
  667. print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}({merge_direction}行)")
  668. used_groups.add(unused_idx)
  669. # 🔑 策略 4: 第三遍 - 按 y 坐标排序每行的组索引
  670. for row_idx in mapping:
  671. if mapping[row_idx]:
  672. mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
  673. return mapping
  674. def _calculate_similarity(self, text1: str, text2: str) -> float:
  675. """
  676. 计算两个文本的相似度,结合字符覆盖率和序列相似度 (性能优化版)
  677. """
  678. if not text1 or not text2:
  679. return 0.0
  680. len1, len2 = len(text1), len(text2)
  681. # ⚡️ 优化 1: 长度快速检查
  682. # 如果长度差异过大(例如一个 50 字符,一个 2 字符),直接认为不匹配
  683. if len1 > 0 and len2 > 0:
  684. min_l, max_l = min(len1, len2), max(len1, len2)
  685. if max_l > 10 and min_l / max_l < 0.2:
  686. return 0.0
  687. # 1. 字符覆盖率 (Character Overlap)
  688. from collections import Counter
  689. c1 = Counter(text1)
  690. c2 = Counter(text2)
  691. intersection = c1 & c2
  692. overlap_count = sum(intersection.values())
  693. coverage = overlap_count / len1 if len1 > 0 else 0
  694. # ⚡️ 优化 2: 覆盖率低时跳过昂贵的 fuzz 计算
  695. # 如果字符重叠率低于 30%,说明内容基本不相关,没必要算序列相似度
  696. if coverage < 0.3:
  697. return coverage * 0.7
  698. # 2. 序列相似度 (Sequence Similarity)
  699. # 使用 token_sort_ratio 来容忍一定的乱序
  700. seq_score = fuzz.token_sort_ratio(text1, text2) / 100.0
  701. return (coverage * 0.7) + (seq_score * 0.3)
  702. def _preprocess_text_for_matching(self, text: str) -> str:
  703. """
  704. 预处理文本:在不同类型的字符(如中文和数字/英文)之间插入空格,
  705. 以便于 token_sort_ratio 更准确地进行分词和匹配。
  706. """
  707. if not text:
  708. return ""
  709. import re
  710. # 1. 在中文和非中文(数字/字母)之间插入空格
  711. # 例如: "2024年" -> "2024 年", "ID号码123" -> "ID号码 123"
  712. text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z0-9])', r'\1 \2', text)
  713. text = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fa5])', r'\1 \2', text)
  714. return text
  715. def _calculate_subsequence_score(self, target: str, source: str) -> float:
  716. """
  717. 计算子序列匹配得分 (解决 OCR 噪音插入问题)
  718. 例如: Target="12345", Source="12(date)34(time)5" -> Score close to 100
  719. """
  720. # 1. 仅保留字母和数字,忽略符号干扰
  721. t_clean = "".join(c for c in target if c.isalnum())
  722. s_clean = "".join(c for c in source if c.isalnum())
  723. if not t_clean or not s_clean:
  724. return 0.0
  725. # 2. 贪婪匹配子序列
  726. t_idx, s_idx = 0, 0
  727. matches = 0
  728. while t_idx < len(t_clean) and s_idx < len(s_clean):
  729. if t_clean[t_idx] == s_clean[s_idx]:
  730. matches += 1
  731. t_idx += 1
  732. s_idx += 1
  733. else:
  734. # 跳过 source 中的噪音字符
  735. s_idx += 1
  736. # 3. 计算得分
  737. match_rate = matches / len(t_clean)
  738. # 如果匹配率太低,直接返回
  739. if match_rate < 0.8:
  740. return match_rate * 100
  741. # 4. 噪音惩罚 (防止 Target="1", Source="123456789" 这种误判)
  742. # 计算噪音长度
  743. noise_len = len(s_clean) - matches
  744. # 允许一定比例的噪音 (例如日期时间插入,通常占总长度的 30%-50%)
  745. # 如果噪音长度超过目标长度的 60%,开始扣分
  746. penalty = 0
  747. if noise_len > len(t_clean) * 0.6:
  748. excess_noise = noise_len - (len(t_clean) * 0.6)
  749. penalty = excess_noise * 0.5 # 每多一个噪音字符扣 0.5 分
  750. penalty = min(penalty, 20) # 最多扣 20 分
  751. final_score = (match_rate * 100) - penalty
  752. return max(0, final_score)
  753. def _match_cell_sequential(self, cell_text: str,
  754. boxes: List[Dict],
  755. col_boundaries: List[Tuple[int, int]],
  756. start_idx: int) -> Optional[Dict]:
  757. """
  758. 🎯 顺序匹配单元格:从指定位置开始,逐步合并 boxes 直到匹配
  759. """
  760. cell_text_normalized = self.text_matcher.normalize_text(cell_text)
  761. cell_text_processed = self._preprocess_text_for_matching(cell_text)
  762. if len(cell_text_normalized) < 1:
  763. return None
  764. # 🔑 找到第一个未使用的 box
  765. first_unused_idx = start_idx
  766. while first_unused_idx < len(boxes) and boxes[first_unused_idx].get('used'):
  767. first_unused_idx += 1
  768. if first_unused_idx >= len(boxes):
  769. return None
  770. # 🔑 策略 1: 单个 box 精确匹配
  771. for box in boxes[first_unused_idx:]:
  772. box_text = self.text_matcher.normalize_text(box['text'])
  773. if cell_text_normalized == box_text:
  774. return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
  775. # 🔑 策略 2: 多个 boxes 合并匹配
  776. unused_boxes = [b for b in boxes if not b.get('used')]
  777. # 合并同列的 boxes 合并
  778. merged_bboxes = []
  779. for col_idx in range(len(col_boundaries)):
  780. combo_boxes = self._get_boxes_in_column(unused_boxes, col_boundaries, col_idx)
  781. if len(combo_boxes) > 0:
  782. sorted_combo = sorted(combo_boxes, key=lambda b: (b['bbox'][1], b['bbox'][0]))
  783. # 🎯 改进:使用空格连接,以便于 token_sort_ratio 进行乱序匹配
  784. merged_text = ' '.join([b['text'] for b in sorted_combo])
  785. merged_bboxes.append({
  786. 'text': merged_text,
  787. 'sorted_combo': sorted_combo
  788. })
  789. for box in merged_bboxes:
  790. # 1. 精确匹配
  791. merged_text_normalized = self.text_matcher.normalize_text(box['text'])
  792. if cell_text_normalized == merged_text_normalized:
  793. last_sort_idx = boxes.index(box['sorted_combo'][-1])
  794. return self._build_match_result(box['sorted_combo'], box['text'], 100.0, last_sort_idx)
  795. # 2. 子串匹配
  796. is_substring = (cell_text_normalized in merged_text_normalized or
  797. merged_text_normalized in cell_text_normalized)
  798. # 3. 模糊匹配
  799. # 🎯 改进:使用预处理后的文本进行 token_sort_ratio 计算
  800. box_text_processed = self._preprocess_text_for_matching(box['text'])
  801. # token_sort_ratio: 自动分词并排序比较,解决 OCR 结果顺序与 HTML 不一致的问题
  802. token_sort_sim = fuzz.token_sort_ratio(cell_text_processed, box_text_processed)
  803. # partial_ratio: 子串模糊匹配,解决 OCR 识别错误
  804. partial_sim = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
  805. # 🎯 新增:token_set_ratio (集合匹配)
  806. # 专门解决:目标文本被 OCR 文本中的噪音隔开的情况
  807. # 例如 Target="A B", OCR="A noise B" -> token_set_ratio 会很高
  808. token_set_sim = fuzz.token_set_ratio(cell_text_processed, box_text_processed)
  809. # 🎯 策略 4: 重构匹配 (Reconstruction Match) - 解决 ID 被噪音打断的问题
  810. # 逻辑:提取 OCR 中所有属于 Target 子串的 token,拼起来再比
  811. reconstruct_sim = 0.0
  812. if len(cell_text_normalized) > 10: # 仅对长文本启用,防止短文本误判
  813. # 使用预处理后的文本分词 (已处理中文/数字间隔)
  814. box_tokens = box_text_processed.split()
  815. # 筛选出所有是目标文本子串的 token
  816. valid_tokens = []
  817. for token in box_tokens:
  818. # 忽略太短的 token (除非目标也很短),防止 "1" 这种误匹配
  819. if len(token) < 2 and len(cell_text_normalized) > 5:
  820. continue
  821. if token in cell_text_normalized:
  822. valid_tokens.append(token)
  823. if valid_tokens:
  824. # 拼接回原始形态
  825. reconstructed_text = "".join(valid_tokens)
  826. reconstruct_sim = fuzz.ratio(cell_text_normalized, reconstructed_text)
  827. if reconstruct_sim > 90:
  828. print(f" 🧩 重构匹配生效: '{reconstructed_text}' (sim={reconstruct_sim})")
  829. # 🎯 策略 5: 子序列匹配 (Subsequence Match) - 解决粘连噪音问题
  830. # 专门针对: '1544...1050' + '2024-08-10' + '0433...' 这种场景
  831. subseq_sim = 0.0
  832. if len(cell_text_normalized) > 8: # 仅对较长文本启用
  833. subseq_sim = self._calculate_subsequence_score(cell_text_normalized, merged_text_normalized)
  834. # 🛡️ 关键修复:长度和类型防御
  835. if subseq_sim > 80:
  836. len_cell = len(cell_text_normalized)
  837. len_box = len(merged_text_normalized)
  838. # 1. 长度差异过大 (Box 比 Cell 长很多)
  839. if len_box > len_cell * 1.5:
  840. # 2. 且 Cell 是数字/日期/时间类型 (容易在长ID中误配)
  841. import re
  842. # 匹配纯数字、日期时间格式
  843. if re.match(r'^[\d\-\:\.\s]+$', cell_text_normalized):
  844. print(f" ⚠️ 拒绝子序列匹配: 长度差异大且为数字类型 (sim={subseq_sim})")
  845. subseq_sim = 0.0
  846. if subseq_sim > 90:
  847. print(f" 🔗 子序列匹配生效: '{cell_text[:10]}...' (sim={subseq_sim:.1f})")
  848. # 综合得分:取五者最大值
  849. similarity = max(token_sort_sim, partial_sim, token_set_sim, reconstruct_sim, subseq_sim)
  850. # 🎯 子串匹配加分
  851. if is_substring:
  852. similarity = min(100, similarity + 10)
  853. # 🎯 长度惩罚:如果 box 内容比 cell 多太多(例如吞了下一个单元格),扣分
  854. # 注意:token_set_ratio 对长度不敏感,所以这里必须严格检查长度,防止误判
  855. # 只有当 similarity 很高时才检查,防止误杀
  856. if similarity > 80:
  857. len_cell = len(cell_text_normalized)
  858. len_box = len(merged_text_normalized)
  859. # 如果是 token_set_sim 贡献的高分,说明 OCR 里包含了很多噪音
  860. # 我们需要确保这些噪音不是“下一个单元格的内容”
  861. # 这里可以加一个更严格的长度检查,或者检查是否包含换行符等
  862. if len_box > len_cell * 2.0 + 10: # 放宽一点,因为 token_set 本来就是处理噪音的
  863. similarity -= 10 # 稍微扣一点分,表示虽然全找到了,但噪音太多不太完美
  864. if similarity >= self.text_matcher.similarity_threshold:
  865. print(f" ✓ 匹配成功: '{cell_text[:15]}' vs '{box['text'][:15]}' (相似度: {similarity})")
  866. # 由于是模糊匹配,返回第一个未使用的 box 作为 last_index
  867. for b in boxes:
  868. if not b.get('used'):
  869. last_idx = max(boxes.index(b)-1, 0)
  870. break
  871. return self._build_match_result(box['sorted_combo'], box['text'], similarity, max(start_idx, last_idx))
  872. print(f" ✗ 匹配失败: '{cell_text[:15]}'")
  873. return None
  874. def _build_match_result(self, boxes: List[Dict], text: str,
  875. score: float, last_index: int) -> Dict:
  876. """构建匹配结果(使用原始坐标)"""
  877. # 🔑 关键修复:使用 original_bbox(如果存在)
  878. def get_original_bbox(box: Dict) -> List[int]:
  879. return box.get('original_bbox', box['bbox'])
  880. original_bboxes = [get_original_bbox(b) for b in boxes]
  881. merged_bbox = [
  882. min(b[0] for b in original_bboxes),
  883. min(b[1] for b in original_bboxes),
  884. max(b[2] for b in original_bboxes),
  885. max(b[3] for b in original_bboxes)
  886. ]
  887. return {
  888. 'bbox': merged_bbox, # ✅ 使用原始坐标
  889. 'text': text,
  890. 'score': score,
  891. 'paddle_indices': [b['paddle_bbox_index'] for b in boxes],
  892. 'used_boxes': boxes,
  893. 'last_used_index': last_index
  894. }