table_cell_matcher_greedy.py 49 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173
  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from rapidfuzz import fuzz
  10. except ImportError:
  11. from fuzzywuzzy import fuzz
  12. try:
  13. from .text_matcher import TextMatcher
  14. from .bbox_extractor import BBoxExtractor
  15. except ImportError:
  16. from text_matcher import TextMatcher
  17. from bbox_extractor import BBoxExtractor
  18. class TableCellMatcher:
  19. """表格单元格匹配器"""
  20. def __init__(self, text_matcher: TextMatcher,
  21. x_tolerance: int = 3,
  22. y_tolerance: int = 10,
  23. inclination_threshold: float = 0.3):
  24. """
  25. Args:
  26. text_matcher: 文本匹配器
  27. x_tolerance: X轴容差(用于列边界判断)
  28. y_tolerance: Y轴容差(用于行分组)
  29. """
  30. self.text_matcher = text_matcher
  31. self.x_tolerance = x_tolerance
  32. self.y_tolerance = y_tolerance
  33. self.inclination_threshold = inclination_threshold # 倾斜校正阈值(度数)
  34. def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
  35. start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
  36. """
  37. 为 HTML 表格添加 bbox 信息(优化版:先筛选表格区域)
  38. 策略:
  39. 1. 根据 table_bbox 筛选出表格区域内的 paddle_text_boxes
  40. 2. 将筛选后的 boxes 按行分组
  41. 3. 智能匹配 HTML 行与 paddle 行组
  42. 4. 在匹配的组内查找单元格
  43. Args:
  44. html: HTML 表格
  45. paddle_text_boxes: 全部 paddle OCR 结果
  46. start_pointer: 开始位置
  47. table_bbox: 表格边界框 [x1, y1, x2, y2]
  48. """
  49. soup = BeautifulSoup(html, 'html.parser')
  50. cells = []
  51. # 🔑 第一步:筛选表格区域内的 paddle boxes
  52. table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
  53. paddle_text_boxes[start_pointer:],
  54. table_bbox,
  55. html
  56. )
  57. if not table_region_boxes:
  58. print(f"⚠️ 未在表格区域找到 paddle boxes")
  59. return str(soup), cells, start_pointer
  60. print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
  61. print(f" 边界: {actual_table_bbox}")
  62. # 🔑 第二步:将表格区域的 boxes 按行分组
  63. grouped_boxes = self._group_paddle_boxes_by_rows(
  64. table_region_boxes,
  65. y_tolerance=self.y_tolerance,
  66. auto_correct_skew=True,
  67. inclination_threshold=self.inclination_threshold
  68. )
  69. # 🔑 第三步:在每组内按 x 坐标排序
  70. for group in grouped_boxes:
  71. group['boxes'].sort(key=lambda x: x['bbox'][0])
  72. grouped_boxes.sort(key=lambda g: g['y_center'])
  73. print(f" 分组: {len(grouped_boxes)} 行")
  74. # 🔑 第四步:智能匹配 HTML 行与 paddle 行组
  75. html_rows = soup.find_all('tr')
  76. row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
  77. print(f" HTML行: {len(html_rows)} 行")
  78. print(f" 映射: {len([v for v in row_mapping.values() if v])} 个有效映射")
  79. # 🔑 第五步:遍历 HTML 表格,使用映射关系查找
  80. for row_idx, row in enumerate(html_rows):
  81. group_indices = row_mapping.get(row_idx, [])
  82. if not group_indices:
  83. continue
  84. # 合并多个组的 boxes
  85. current_boxes = []
  86. for group_idx in group_indices:
  87. if group_idx < len(grouped_boxes):
  88. current_boxes.extend(grouped_boxes[group_idx]['boxes'])
  89. current_boxes.sort(key=lambda x: x['bbox'][0])
  90. # 🎯 关键改进:提取 HTML 单元格并预先确定列边界
  91. html_cells = row.find_all(['td', 'th'])
  92. if not html_cells:
  93. continue
  94. # 🔑 预估列边界(基于 x 坐标分布)
  95. col_boundaries = self._estimate_column_boundaries(
  96. current_boxes,
  97. len(html_cells)
  98. )
  99. print(f" 行 {row_idx + 1}: {len(html_cells)} 列,边界: {col_boundaries}")
  100. # 🎯 关键改进:顺序指针匹配
  101. box_pointer = 0 # 当前行的 boxes 指针
  102. for col_idx, cell in enumerate(html_cells):
  103. cell_text = cell.get_text(strip=True)
  104. if not cell_text:
  105. continue
  106. # 🔑 从当前指针开始匹配
  107. matched_result = self._match_cell_sequential(
  108. cell_text,
  109. current_boxes,
  110. col_boundaries,
  111. box_pointer
  112. )
  113. if matched_result:
  114. merged_bbox = matched_result['bbox']
  115. merged_text = matched_result['text']
  116. cell['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
  117. cell['data-score'] = f"{matched_result['score']:.4f}"
  118. cell['data-paddle-indices'] = str(matched_result['paddle_indices'])
  119. cells.append({
  120. 'type': 'table_cell',
  121. 'text': cell_text,
  122. 'matched_text': merged_text,
  123. 'bbox': merged_bbox,
  124. 'row': row_idx + 1,
  125. 'col': col_idx + 1,
  126. 'score': matched_result['score'],
  127. 'paddle_bbox_indices': matched_result['paddle_indices']
  128. })
  129. # 标记已使用
  130. for box in matched_result['used_boxes']:
  131. box['used'] = True
  132. # 🎯 移动指针到最后使用的 box 之后
  133. box_pointer = matched_result['last_used_index'] + 1
  134. print(f" 列 {col_idx + 1}: '{cell_text[:20]}...' 匹配 {len(matched_result['used_boxes'])} 个box (指针: {box_pointer})")
  135. # 计算新的指针位置
  136. used_count = sum(1 for box in table_region_boxes if box.get('used'))
  137. new_pointer = start_pointer + used_count
  138. print(f" 匹配: {len(cells)} 个单元格")
  139. return str(soup), cells, new_pointer
  140. def _estimate_column_boundaries(self, boxes: List[Dict],
  141. num_cols: int) -> List[Tuple[int, int]]:
  142. """
  143. 估算列边界(改进版:处理同列多文本框)
  144. Args:
  145. boxes: 当前行的所有 boxes(已按 x 排序)
  146. num_cols: HTML 表格的列数
  147. Returns:
  148. 列边界列表 [(x_start, x_end), ...]
  149. """
  150. if not boxes:
  151. return []
  152. # 🔑 关键改进:先按 x 坐标聚类(合并同列的多个文本框)
  153. x_clusters = self._cluster_boxes_by_x(boxes, x_tolerance=self.x_tolerance)
  154. print(f" X聚类: {len(boxes)} 个boxes -> {len(x_clusters)} 个列簇")
  155. # 获取所有 x 坐标范围
  156. x_min = min(cluster['x_min'] for cluster in x_clusters)
  157. x_max = max(cluster['x_max'] for cluster in x_clusters)
  158. # 🎯 策略 1: 如果聚类数量<=列数接近
  159. if len(x_clusters) <= num_cols:
  160. # 直接使用聚类边界
  161. boundaries = [(cluster['x_min'], cluster['x_max'])
  162. for cluster in x_clusters]
  163. return boundaries
  164. # 🎯 策略 2: 聚类数多于列数(某些列有多个文本簇)
  165. if len(x_clusters) > num_cols:
  166. print(f" ℹ️ 聚类数 {len(x_clusters)} > 列数 {num_cols},合并相近簇")
  167. # 合并相近的簇
  168. merged_clusters = self._merge_close_clusters(x_clusters, num_cols)
  169. boundaries = [(cluster['x_min'], cluster['x_max'])
  170. for cluster in merged_clusters]
  171. return boundaries
  172. return []
  173. def _cluster_boxes_by_x(self, boxes: List[Dict],
  174. x_tolerance: int = 3) -> List[Dict]:
  175. """
  176. 按 x 坐标聚类(合并同列的多个文本框)
  177. Args:
  178. boxes: 文本框列表
  179. x_tolerance: X坐标容忍度
  180. Returns:
  181. 聚类列表 [{'x_min': int, 'x_max': int, 'boxes': List[Dict]}, ...]
  182. """
  183. if not boxes:
  184. return []
  185. # 按左边界 x 坐标排序
  186. sorted_boxes = sorted(boxes, key=lambda b: b['bbox'][0])
  187. clusters = []
  188. current_cluster = None
  189. for box in sorted_boxes:
  190. bbox = box['bbox']
  191. x_start = bbox[0]
  192. x_end = bbox[2]
  193. if current_cluster is None:
  194. # 开始新簇
  195. current_cluster = {
  196. 'x_min': x_start,
  197. 'x_max': x_end,
  198. 'boxes': [box]
  199. }
  200. else:
  201. # 🔑 检查是否属于当前簇(修正后的逻辑)
  202. # 1. x 坐标有重叠:x_start <= current_x_max 且 x_end >= current_x_min
  203. # 2. 或者距离在容忍度内
  204. has_overlap = (x_start <= current_cluster['x_max'] and
  205. x_end >= current_cluster['x_min'])
  206. is_close = abs(x_start - current_cluster['x_max']) <= x_tolerance
  207. if has_overlap or is_close:
  208. # 合并到当前簇
  209. current_cluster['boxes'].append(box)
  210. current_cluster['x_min'] = min(current_cluster['x_min'], x_start)
  211. current_cluster['x_max'] = max(current_cluster['x_max'], x_end)
  212. else:
  213. # 保存当前簇,开始新簇
  214. clusters.append(current_cluster)
  215. current_cluster = {
  216. 'x_min': x_start,
  217. 'x_max': x_end,
  218. 'boxes': [box]
  219. }
  220. # 添加最后一簇
  221. if current_cluster:
  222. clusters.append(current_cluster)
  223. return clusters
  224. def _merge_close_clusters(self, clusters: List[Dict],
  225. target_count: int) -> List[Dict]:
  226. """
  227. 合并相近的簇,直到数量等于目标列数
  228. Args:
  229. clusters: 聚类列表
  230. target_count: 目标列数
  231. Returns:
  232. 合并后的聚类列表
  233. """
  234. if len(clusters) <= target_count:
  235. return clusters
  236. # 复制一份,避免修改原数据
  237. working_clusters = [c.copy() for c in clusters]
  238. while len(working_clusters) > target_count:
  239. # 找到距离最近的两个簇
  240. min_distance = float('inf')
  241. merge_idx = 0
  242. for i in range(len(working_clusters) - 1):
  243. distance = working_clusters[i + 1]['x_min'] - working_clusters[i]['x_max']
  244. if distance < min_distance:
  245. min_distance = distance
  246. merge_idx = i
  247. # 合并
  248. cluster1 = working_clusters[merge_idx]
  249. cluster2 = working_clusters[merge_idx + 1]
  250. merged_cluster = {
  251. 'x_min': cluster1['x_min'],
  252. 'x_max': cluster2['x_max'],
  253. 'boxes': cluster1['boxes'] + cluster2['boxes']
  254. }
  255. # 替换
  256. working_clusters[merge_idx] = merged_cluster
  257. working_clusters.pop(merge_idx + 1)
  258. return working_clusters
  259. def _get_boxes_in_column(self, boxes: List[Dict],
  260. boundaries: List[Tuple[int, int]],
  261. col_idx: int) -> List[Dict]:
  262. """
  263. 获取指定列范围内的 boxes(改进版:包含重叠)
  264. Args:
  265. boxes: 当前行的所有 boxes
  266. boundaries: 列边界
  267. col_idx: 列索引
  268. Returns:
  269. 该列的 boxes
  270. """
  271. if col_idx >= len(boundaries):
  272. return []
  273. x_start, x_end = boundaries[col_idx]
  274. col_boxes = []
  275. for box in boxes:
  276. bbox = box['bbox']
  277. box_x_start = bbox[0]
  278. box_x_end = bbox[2]
  279. # 🔑 改进:检查是否有重叠(不只是中心点)
  280. overlap = not (box_x_start > x_end or box_x_end < x_start)
  281. if overlap:
  282. col_boxes.append(box)
  283. return col_boxes
  284. def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
  285. table_bbox: Optional[List[int]],
  286. html: str) -> Tuple[List[Dict], List[int]]:
  287. """
  288. 筛选表格区域内的 paddle boxes
  289. 策略:
  290. 1. 如果有 table_bbox,使用边界框筛选(扩展边界)
  291. 2. 如果没有 table_bbox,通过内容匹配推断区域
  292. Args:
  293. paddle_boxes: paddle OCR 结果
  294. table_bbox: 表格边界框 [x1, y1, x2, y2]
  295. html: HTML 内容(用于内容验证)
  296. Returns:
  297. (筛选后的 boxes, 实际表格边界框)
  298. """
  299. if not paddle_boxes:
  300. return [], [0, 0, 0, 0]
  301. # 🎯 策略 1: 使用提供的 table_bbox(扩展边界)
  302. if table_bbox and len(table_bbox) == 4:
  303. x1, y1, x2, y2 = table_bbox
  304. # 扩展边界(考虑边框外的文本)
  305. margin = 20
  306. expanded_bbox = [
  307. max(0, x1 - margin),
  308. max(0, y1 - margin),
  309. x2 + margin,
  310. y2 + margin
  311. ]
  312. filtered = []
  313. for box in paddle_boxes:
  314. bbox = box['bbox']
  315. box_center_x = (bbox[0] + bbox[2]) / 2
  316. box_center_y = (bbox[1] + bbox[3]) / 2
  317. # 中心点在扩展区域内
  318. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  319. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  320. filtered.append(box)
  321. if filtered:
  322. # 计算实际边界框
  323. actual_bbox = [
  324. min(b['bbox'][0] for b in filtered),
  325. min(b['bbox'][1] for b in filtered),
  326. max(b['bbox'][2] for b in filtered),
  327. max(b['bbox'][3] for b in filtered)
  328. ]
  329. return filtered, actual_bbox
  330. # 🎯 策略 2: 通过内容匹配推断区域
  331. print(" ℹ️ 无 table_bbox,使用内容匹配推断表格区域...")
  332. # 提取 HTML 中的所有文本
  333. from bs4 import BeautifulSoup
  334. soup = BeautifulSoup(html, 'html.parser')
  335. html_texts = set()
  336. for cell in soup.find_all(['td', 'th']):
  337. text = cell.get_text(strip=True)
  338. if text:
  339. html_texts.add(self.text_matcher.normalize_text(text))
  340. if not html_texts:
  341. return [], [0, 0, 0, 0]
  342. # 找出与 HTML 内容匹配的 boxes
  343. matched_boxes = []
  344. for box in paddle_boxes:
  345. normalized_text = self.text_matcher.normalize_text(box['text'])
  346. # 检查是否匹配
  347. if any(normalized_text in ht or ht in normalized_text
  348. for ht in html_texts):
  349. matched_boxes.append(box)
  350. if not matched_boxes:
  351. # 🔑 降级:如果精确匹配失败,使用模糊匹配
  352. print(" ℹ️ 精确匹配失败,尝试模糊匹配...")
  353. for box in paddle_boxes:
  354. normalized_text = self.text_matcher.normalize_text(box['text'])
  355. for ht in html_texts:
  356. similarity = fuzz.partial_ratio(normalized_text, ht)
  357. if similarity >= 70: # 降低阈值
  358. matched_boxes.append(box)
  359. break
  360. if matched_boxes:
  361. # 计算边界框
  362. actual_bbox = [
  363. min(b['bbox'][0] for b in matched_boxes),
  364. min(b['bbox'][1] for b in matched_boxes),
  365. max(b['bbox'][2] for b in matched_boxes),
  366. max(b['bbox'][3] for b in matched_boxes)
  367. ]
  368. # 🔑 扩展边界,包含可能遗漏的文本
  369. margin = 30
  370. expanded_bbox = [
  371. max(0, actual_bbox[0] - margin),
  372. max(0, actual_bbox[1] - margin),
  373. actual_bbox[2] + margin,
  374. actual_bbox[3] + margin
  375. ]
  376. # 重新筛选(包含边界上的文本)
  377. final_filtered = []
  378. for box in paddle_boxes:
  379. bbox = box['bbox']
  380. box_center_x = (bbox[0] + bbox[2]) / 2
  381. box_center_y = (bbox[1] + bbox[3]) / 2
  382. if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
  383. expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
  384. final_filtered.append(box)
  385. return final_filtered, actual_bbox
  386. # 🔑 最后的降级:返回所有 boxes
  387. print(" ⚠️ 无法确定表格区域,使用所有 paddle boxes")
  388. if paddle_boxes:
  389. actual_bbox = [
  390. min(b['bbox'][0] for b in paddle_boxes),
  391. min(b['bbox'][1] for b in paddle_boxes),
  392. max(b['bbox'][2] for b in paddle_boxes),
  393. max(b['bbox'][3] for b in paddle_boxes)
  394. ]
  395. return paddle_boxes, actual_bbox
  396. return [], [0, 0, 0, 0]
  397. def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
  398. y_tolerance: int = 10,
  399. auto_correct_skew: bool = True,
  400. inclination_threshold: float = 0.3) -> List[Dict]:
  401. """
  402. 将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
  403. Args:
  404. paddle_boxes: Paddle OCR 文字框列表
  405. y_tolerance: Y 坐标容忍度(像素)
  406. auto_correct_skew: 是否自动校正倾斜
  407. Returns:
  408. 分组列表,每组包含 {'y_center': float, 'boxes': List[Dict]}
  409. """
  410. if not paddle_boxes:
  411. return []
  412. # 🎯 步骤 1: 检测并校正倾斜(使用 BBoxExtractor)
  413. if auto_correct_skew:
  414. rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
  415. if abs(rotation_angle) > inclination_threshold:
  416. max_x = max(box['bbox'][2] for box in paddle_boxes)
  417. max_y = max(box['bbox'][3] for box in paddle_boxes)
  418. image_size = (max_x, max_y)
  419. print(f" 🔧 校正倾斜角度: {rotation_angle:.2f}°")
  420. paddle_boxes = BBoxExtractor.correct_boxes_skew(
  421. paddle_boxes, -rotation_angle, image_size
  422. )
  423. # 🎯 步骤 2: 按校正后的 y 坐标分组
  424. boxes_with_y = []
  425. for box in paddle_boxes:
  426. bbox = box['bbox']
  427. y_center = (bbox[1] + bbox[3]) / 2
  428. boxes_with_y.append({
  429. 'y_center': y_center,
  430. 'box': box
  431. })
  432. # 按 y 坐标排序
  433. boxes_with_y.sort(key=lambda x: x['y_center'])
  434. groups = []
  435. current_group = None
  436. for item in boxes_with_y:
  437. if current_group is None:
  438. # 开始新组
  439. current_group = {
  440. 'y_center': item['y_center'],
  441. 'boxes': [item['box']]
  442. }
  443. else:
  444. if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
  445. current_group['boxes'].append(item['box'])
  446. # 更新组的中心
  447. current_group['y_center'] = sum(
  448. (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
  449. ) / len(current_group['boxes'])
  450. else:
  451. groups.append(current_group)
  452. current_group = {
  453. 'y_center': item['y_center'],
  454. 'boxes': [item['box']]
  455. }
  456. if current_group:
  457. groups.append(current_group)
  458. print(f" ✓ 分组完成: {len(groups)} 行")
  459. return groups
  460. def _match_html_rows_to_paddle_groups(self, html_rows: List,
  461. grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
  462. """
  463. 智能匹配 HTML 行与 paddle 分组(增强版 DP:支持跳过 HTML 行,防止链条断裂)
  464. """
  465. if not html_rows or not grouped_boxes:
  466. return {}
  467. mapping = {}
  468. # 🎯 策略 1: 数量相等,简单 1:1 映射
  469. if len(html_rows) == len(grouped_boxes):
  470. for i in range(len(html_rows)):
  471. mapping[i] = [i]
  472. return mapping
  473. # --- 准备数据 ---
  474. # 提取 HTML 文本
  475. html_row_texts = []
  476. for row in html_rows:
  477. cells = row.find_all(['td', 'th'])
  478. texts = [self.text_matcher.normalize_text(c.get_text(strip=True)) for c in cells]
  479. html_row_texts.append("".join(texts))
  480. # 预计算所有组的文本
  481. group_texts = []
  482. for group in grouped_boxes:
  483. boxes = group['boxes']
  484. texts = [self.text_matcher.normalize_text(b['text']) for b in boxes]
  485. group_texts.append("".join(texts))
  486. n_html = len(html_row_texts)
  487. n_paddle = len(grouped_boxes)
  488. # ⚡️ 优化 3: 预计算合并文本
  489. MAX_MERGE = 4
  490. merged_cache = {}
  491. for j in range(n_paddle):
  492. current_t = ""
  493. for k in range(MAX_MERGE):
  494. if j + k < n_paddle:
  495. current_t += group_texts[j + k]
  496. merged_cache[(j, k + 1)] = current_t
  497. else:
  498. break
  499. # --- 动态规划 (DP) ---
  500. # dp[i][j] 表示:HTML 前 i 行 (0..i) 匹配到了 Paddle 的前 j 组 (0..j) 的最大得分
  501. # 初始化为负无穷
  502. dp = np.full((n_html, n_paddle), -np.inf)
  503. # 记录路径:path[i][j] = (prev_j, start_j)
  504. # prev_j: 上一行结束的 paddle index
  505. # start_j: 当前行开始的 paddle index (因为一行可能对应多个组)
  506. path = {}
  507. # 参数配置
  508. SEARCH_WINDOW = 15 # 向前搜索窗口
  509. SKIP_PADDLE_PENALTY = 0.1 # 跳过 Paddle 组的惩罚
  510. SKIP_HTML_PENALTY = 0.3 # 关键:跳过 HTML 行的惩罚
  511. # --- 1. 初始化第一行 ---
  512. # 选项 A: 匹配 Paddle 组
  513. for end_j in range(min(n_paddle, SEARCH_WINDOW + MAX_MERGE)):
  514. for count in range(1, MAX_MERGE + 1):
  515. start_j = end_j - count + 1
  516. if start_j < 0: continue
  517. current_text = merged_cache.get((start_j, count), "")
  518. similarity = self._calculate_similarity(html_row_texts[0], current_text)
  519. penalty = start_j * SKIP_PADDLE_PENALTY
  520. score = similarity - penalty
  521. # 只有得分尚可才作为有效状态
  522. if score > 0.1:
  523. if score > dp[0][end_j]:
  524. dp[0][end_j] = score
  525. path[(0, end_j)] = (-1, start_j)
  526. # 选项 B: 第一行就跳过 (虽然少见,但为了完整性)
  527. # 如果第一行跳过,相当于没有消耗任何 paddle 组,状态难以用 dp[0][j] 表达
  528. # 这里简化处理,假设第一行必须匹配点什么,或者由后续行修正
  529. # --- 2. 状态转移 ---
  530. for i in range(1, n_html):
  531. html_text = html_row_texts[i]
  532. # 获取上一行所有有效位置
  533. valid_prev_indices = [j for j in range(n_paddle) if dp[i-1][j] > -np.inf]
  534. # 剪枝
  535. if len(valid_prev_indices) > 30:
  536. valid_prev_indices.sort(key=lambda j: dp[i-1][j], reverse=True)
  537. valid_prev_indices = valid_prev_indices[:30]
  538. # 🛡️ 关键修复:允许跳过当前 HTML 行 (继承上一行的状态)
  539. # 如果跳过当前行,Paddle 指针 j 不变
  540. for prev_j in valid_prev_indices:
  541. score_skip = dp[i-1][prev_j] - SKIP_HTML_PENALTY
  542. if score_skip > dp[i][prev_j]:
  543. dp[i][prev_j] = score_skip
  544. # 记录路径:start_j = prev_j + 1 表示没有消耗新组 (空范围)
  545. path[(i, prev_j)] = (prev_j, prev_j + 1)
  546. # 如果是空行,直接跳过计算,仅保留继承的状态
  547. if not html_text:
  548. continue
  549. # 正常匹配逻辑
  550. for prev_j in valid_prev_indices:
  551. prev_score = dp[i-1][prev_j]
  552. max_gap = min(SEARCH_WINDOW, n_paddle - prev_j - 1)
  553. for gap in range(max_gap):
  554. start_j = prev_j + 1 + gap
  555. for count in range(1, MAX_MERGE + 1):
  556. end_j = start_j + count - 1
  557. if end_j >= n_paddle: break
  558. current_text = merged_cache.get((start_j, count), "")
  559. # 长度预筛选
  560. h_len = len(html_text)
  561. p_len = len(current_text)
  562. if h_len > 10 and p_len < h_len * 0.2:
  563. continue
  564. similarity = self._calculate_similarity(html_text, current_text)
  565. # 计算惩罚
  566. # 1. 跳过惩罚 (gap)
  567. # 2. 长度惩罚 (防止过度合并)
  568. len_penalty = 0.0
  569. if h_len > 0:
  570. ratio = p_len / h_len
  571. if ratio > 2.0: len_penalty = (ratio - 2.0) * 0.2
  572. current_score = similarity - (gap * SKIP_PADDLE_PENALTY) - len_penalty
  573. # 只有正收益才转移
  574. if current_score > 0.1:
  575. total_score = prev_score + current_score
  576. if total_score > dp[i][end_j]:
  577. dp[i][end_j] = total_score
  578. path[(i, end_j)] = (prev_j, start_j)
  579. # --- 3. 回溯找最优路径 ---
  580. # 找到最后一行得分最高的结束位置
  581. best_end_j = -1
  582. max_score = -np.inf
  583. # 优先找最后一行,如果最后一行没匹配上,往前找
  584. found_end = False
  585. for i in range(n_html - 1, -1, -1):
  586. for j in range(n_paddle):
  587. if dp[i][j] > max_score:
  588. max_score = dp[i][j]
  589. best_end_j = j
  590. best_last_row = i
  591. if max_score > -np.inf:
  592. found_end = True
  593. break
  594. mapping = {}
  595. used_groups = set()
  596. if found_end:
  597. curr_i = best_last_row
  598. curr_j = best_end_j
  599. while curr_i >= 0:
  600. if (curr_i, curr_j) in path:
  601. prev_j, start_j = path[(curr_i, curr_j)]
  602. # 如果 start_j <= curr_j,说明消耗了 Paddle 组
  603. # 如果 start_j > curr_j,说明是跳过 HTML 行 (空范围)
  604. if start_j <= curr_j:
  605. indices = list(range(start_j, curr_j + 1))
  606. mapping[curr_i] = indices
  607. used_groups.update(indices)
  608. else:
  609. mapping[curr_i] = []
  610. curr_j = prev_j
  611. curr_i -= 1
  612. else:
  613. break
  614. # 填补未匹配的行
  615. for i in range(n_html):
  616. if i not in mapping:
  617. mapping[i] = []
  618. # --- 4. 后处理:未匹配组的归属 (Orphans) ---
  619. unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
  620. if unused_groups:
  621. print(f" ℹ️ 发现 {len(unused_groups)} 个未匹配的 paddle 组: {unused_groups}")
  622. for unused_idx in unused_groups:
  623. unused_group = grouped_boxes[unused_idx]
  624. unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
  625. unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
  626. above_idx = None
  627. below_idx = None
  628. above_distance = float('inf')
  629. below_distance = float('inf')
  630. for i in range(unused_idx - 1, -1, -1):
  631. if i in used_groups:
  632. above_idx = i
  633. above_group = grouped_boxes[i]
  634. max_y_box = max(above_group['boxes'], key=lambda b: b['bbox'][3])
  635. above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
  636. above_distance = abs(unused_y_min - above_y_center)
  637. break
  638. for i in range(unused_idx + 1, len(grouped_boxes)):
  639. if i in used_groups:
  640. below_idx = i
  641. below_group = grouped_boxes[i]
  642. min_y_box = min(below_group['boxes'], key=lambda b: b['bbox'][1])
  643. below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
  644. below_distance = abs(below_y_center - unused_y_max)
  645. break
  646. closest_used_idx = None
  647. merge_direction = ""
  648. if above_idx is not None and below_idx is not None:
  649. if above_distance < below_distance:
  650. closest_used_idx = above_idx
  651. merge_direction = "上方"
  652. else:
  653. closest_used_idx = below_idx
  654. merge_direction = "下方"
  655. elif above_idx is not None:
  656. closest_used_idx = above_idx
  657. merge_direction = "上方"
  658. elif below_idx is not None:
  659. closest_used_idx = below_idx
  660. merge_direction = "下方"
  661. if closest_used_idx is not None:
  662. target_html_row = None
  663. for html_row_idx, group_indices in mapping.items():
  664. if closest_used_idx in group_indices:
  665. target_html_row = html_row_idx
  666. break
  667. if target_html_row is not None:
  668. if unused_idx not in mapping[target_html_row]:
  669. mapping[target_html_row].append(unused_idx)
  670. mapping[target_html_row].sort()
  671. print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}({merge_direction}行)")
  672. used_groups.add(unused_idx)
  673. # 🔑 策略 4: 第三遍 - 按 y 坐标排序每行的组索引
  674. for row_idx in mapping:
  675. if mapping[row_idx]:
  676. mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
  677. return mapping
  678. def _calculate_similarity(self, text1: str, text2: str) -> float:
  679. """
  680. 计算两个文本的相似度,结合字符覆盖率和序列相似度 (性能优化版)
  681. """
  682. if not text1 or not text2:
  683. return 0.0
  684. len1, len2 = len(text1), len(text2)
  685. # ⚡️ 优化 1: 长度快速检查
  686. # 如果长度差异过大(例如一个 50 字符,一个 2 字符),直接认为不匹配
  687. if len1 > 0 and len2 > 0:
  688. min_l, max_l = min(len1, len2), max(len1, len2)
  689. if max_l > 10 and min_l / max_l < 0.2:
  690. return 0.0
  691. # 1. 字符覆盖率 (Character Overlap)
  692. from collections import Counter
  693. c1 = Counter(text1)
  694. c2 = Counter(text2)
  695. intersection = c1 & c2
  696. overlap_count = sum(intersection.values())
  697. coverage = overlap_count / len1 if len1 > 0 else 0
  698. # ⚡️ 优化 2: 覆盖率低时跳过昂贵的 fuzz 计算
  699. # 如果字符重叠率低于 30%,说明内容基本不相关,没必要算序列相似度
  700. if coverage < 0.3:
  701. return coverage * 0.7
  702. # 2. 序列相似度 (Sequence Similarity)
  703. # 使用 token_sort_ratio 来容忍一定的乱序
  704. seq_score = fuzz.token_sort_ratio(text1, text2) / 100.0
  705. return (coverage * 0.7) + (seq_score * 0.3)
  706. def _preprocess_text_for_matching(self, text: str) -> str:
  707. """
  708. 预处理文本:在不同类型的字符(如中文和数字/英文)之间插入空格,
  709. 以便于 token_sort_ratio 更准确地进行分词和匹配。
  710. """
  711. if not text:
  712. return ""
  713. import re
  714. # 1. 在中文和非中文(数字/字母)之间插入空格
  715. # 例如: "2024年" -> "2024 年", "ID号码123" -> "ID号码 123"
  716. text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z0-9])', r'\1 \2', text)
  717. text = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fa5])', r'\1 \2', text)
  718. return text
  719. def _calculate_subsequence_score(self, target: str, source: str) -> float:
  720. """
  721. 计算子序列匹配得分 (解决 OCR 噪音插入问题)
  722. 例如: Target="12345", Source="12(date)34(time)5" -> Score close to 100
  723. """
  724. # 1. 仅保留字母和数字,忽略符号干扰
  725. t_clean = "".join(c for c in target if c.isalnum())
  726. s_clean = "".join(c for c in source if c.isalnum())
  727. if not t_clean or not s_clean:
  728. return 0.0
  729. # 2. 贪婪匹配子序列
  730. t_idx, s_idx = 0, 0
  731. matches = 0
  732. while t_idx < len(t_clean) and s_idx < len(s_clean):
  733. if t_clean[t_idx] == s_clean[s_idx]:
  734. matches += 1
  735. t_idx += 1
  736. s_idx += 1
  737. else:
  738. # 跳过 source 中的噪音字符
  739. s_idx += 1
  740. # 3. 计算得分
  741. match_rate = matches / len(t_clean)
  742. # 如果匹配率太低,直接返回
  743. if match_rate < 0.8:
  744. return match_rate * 100
  745. # 4. 噪音惩罚 (防止 Target="1", Source="123456789" 这种误判)
  746. # 计算噪音长度
  747. noise_len = len(s_clean) - matches
  748. # 允许一定比例的噪音 (例如日期时间插入,通常占总长度的 30%-50%)
  749. # 如果噪音长度超过目标长度的 60%,开始扣分
  750. penalty = 0
  751. if noise_len > len(t_clean) * 0.6:
  752. excess_noise = noise_len - (len(t_clean) * 0.6)
  753. penalty = excess_noise * 0.5 # 每多一个噪音字符扣 0.5 分
  754. penalty = min(penalty, 20) # 最多扣 20 分
  755. final_score = (match_rate * 100) - penalty
  756. return max(0, final_score)
  757. def _match_cell_sequential(self, cell_text: str,
  758. boxes: List[Dict],
  759. col_boundaries: List[Tuple[int, int]],
  760. start_idx: int) -> Optional[Dict]:
  761. """
  762. 🎯 顺序匹配单元格:从指定位置开始,逐步合并 boxes 直到匹配
  763. """
  764. cell_text_normalized = self.text_matcher.normalize_text(cell_text)
  765. cell_text_processed = self._preprocess_text_for_matching(cell_text)
  766. if len(cell_text_normalized) < 1:
  767. return None
  768. # 🔑 找到第一个未使用的 box
  769. first_unused_idx = start_idx
  770. while first_unused_idx < len(boxes) and boxes[first_unused_idx].get('used'):
  771. first_unused_idx += 1
  772. if first_unused_idx >= len(boxes):
  773. return None
  774. # 🔑 策略 1: 单个 box 精确匹配
  775. for box in boxes[first_unused_idx:]:
  776. box_text = self.text_matcher.normalize_text(box['text'])
  777. if cell_text_normalized == box_text:
  778. return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
  779. # 🔑 策略 2: 多个 boxes 合并匹配
  780. unused_boxes = [b for b in boxes[first_unused_idx:] if not b.get('used')]
  781. # 合并同列的 boxes 合并
  782. merged_bboxes = []
  783. for col_idx in range(len(col_boundaries)):
  784. combo_boxes = self._get_boxes_in_column(unused_boxes, col_boundaries, col_idx)
  785. if len(combo_boxes) > 0:
  786. sorted_combo = sorted(combo_boxes, key=lambda b: (b['bbox'][1], b['bbox'][0]))
  787. # 🎯 改进:使用空格连接,以便于 token_sort_ratio 进行乱序匹配
  788. merged_text = ' '.join([b['text'] for b in sorted_combo])
  789. merged_bboxes.append({
  790. 'text': merged_text,
  791. 'sorted_combo': sorted_combo
  792. })
  793. for box in merged_bboxes:
  794. # 1. 精确匹配
  795. merged_text_normalized = self.text_matcher.normalize_text(box['text'])
  796. if cell_text_normalized == merged_text_normalized:
  797. last_sort_idx = boxes.index(box['sorted_combo'][-1])
  798. return self._build_match_result(box['sorted_combo'], box['text'], 100.0, last_sort_idx)
  799. # 2. 子串匹配
  800. is_substring = (cell_text_normalized in merged_text_normalized or
  801. merged_text_normalized in cell_text_normalized)
  802. # 3. 模糊匹配
  803. # 🎯 改进:使用预处理后的文本进行 token_sort_ratio 计算
  804. box_text_processed = self._preprocess_text_for_matching(box['text'])
  805. # token_sort_ratio: 自动分词并排序比较,解决 OCR 结果顺序与 HTML 不一致的问题
  806. token_sort_sim = fuzz.token_sort_ratio(cell_text_processed, box_text_processed)
  807. # partial_ratio: 子串模糊匹配,解决 OCR 识别错误
  808. partial_sim = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
  809. # 🛡️ 增强版防御:防止“短文本”误匹配“长文本”
  810. if partial_sim > 80:
  811. len_cell = len(cell_text_normalized)
  812. len_box = len(merged_text_normalized)
  813. # 确定短方和长方
  814. if len_cell < len_box:
  815. len_short, len_long = len_cell, len_box
  816. text_short = cell_text_normalized
  817. text_long = merged_text_normalized
  818. else:
  819. len_short, len_long = len_box, len_cell
  820. text_short = merged_text_normalized
  821. text_long = cell_text_normalized
  822. # 🎯 修正:检测有效内容 (字母、数字、汉字)
  823. # 使用 Unicode 范围匹配汉字: \u4e00-\u9fa5
  824. import re
  825. def has_valid_content(text):
  826. return bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', text))
  827. short_has_content = has_valid_content(text_short)
  828. long_has_content = has_valid_content(text_long)
  829. # 🛑 拒绝条件 1: 短方是纯符号 (无有效内容),且长方有内容
  830. # 例如: Cell="-" vs Box="-200" (拦截)
  831. # 例如: Cell="中国银行" vs Box="中国银行储蓄卡" (不拦截,因为都有汉字)
  832. if not short_has_content and long_has_content:
  833. # 允许例外:如果长方也很短 (比如 Cell="-" Box="- "),可能只是多了个空格,不拦截
  834. if len_long > len_short + 2:
  835. print(f" ⚠️ 拒绝纯符号部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
  836. partial_sim = 0.0
  837. # 🛑 拒绝条件 2: 短方虽然有内容,但太短了 (信息量不足)
  838. elif short_has_content:
  839. # 如果短方只有 1 个字符,且长方超过 3 个字符 -> 拒绝
  840. if len_short == 1 and len_long > 3:
  841. print(f" ⚠️ 拒绝单字符部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
  842. partial_sim = 0.0
  843. # 如果短方只有 2 个字符,且长方超过 8 个字符 -> 拒绝
  844. elif len_short == 2 and len_long > 8:
  845. print(f" ⚠️ 拒绝微小碎片部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
  846. partial_sim = 0.0
  847. # 🆕 新增条件 3: 覆盖率过低 (防止 "2024" 匹配 "ID2024...")
  848. # 场景: Cell 是长文本, Box 是短文本, 恰好包含在 Cell 中
  849. # 逻辑: 如果覆盖率 < 30% 且 整体相似度(token_sort) < 45,说明 Box 缺失了 Cell 的绝大部分内容
  850. else:
  851. coverage = len_short / len_long if len_long > 0 else 0
  852. if coverage < 0.3 and token_sort_sim < 45:
  853. print(f" ⚠️ 拒绝低覆盖率部分匹配: '{text_short}' in '{text_long}' (cov={coverage:.2f})")
  854. partial_sim = 0.0
  855. # 🎯 新增:token_set_ratio (集合匹配)
  856. # 专门解决:目标文本被 OCR 文本中的噪音隔开的情况
  857. # 例如 Target="A B", OCR="A noise B" -> token_set_ratio 会很高
  858. token_set_sim = fuzz.token_set_ratio(cell_text_processed, box_text_processed)
  859. # 🎯 策略 4: 重构匹配 (Reconstruction Match) - 解决 ID 被噪音打断的问题
  860. # 逻辑:提取 OCR 中所有属于 Target 子串的 token,拼起来再比
  861. reconstruct_sim = 0.0
  862. if len(cell_text_normalized) > 10: # 仅对长文本启用,防止短文本误判
  863. # 使用预处理后的文本分词 (已处理中文/数字间隔)
  864. box_tokens = box_text_processed.split()
  865. # 筛选出所有是目标文本子串的 token
  866. valid_tokens = []
  867. for token in box_tokens:
  868. # 忽略太短的 token (除非目标也很短),防止 "1" 这种误匹配
  869. if len(token) < 2 and len(cell_text_normalized) > 5:
  870. continue
  871. if token in cell_text_normalized:
  872. valid_tokens.append(token)
  873. if valid_tokens:
  874. # 拼接回原始形态
  875. reconstructed_text = "".join(valid_tokens)
  876. reconstruct_sim = fuzz.ratio(cell_text_normalized, reconstructed_text)
  877. if reconstruct_sim > 90:
  878. print(f" 🧩 重构匹配生效: '{reconstructed_text}' (sim={reconstruct_sim})")
  879. # 🎯 策略 5: 子序列匹配 (Subsequence Match) - 解决粘连噪音问题
  880. # 专门针对: '1544...1050' + '2024-08-10' + '0433...' 这种场景
  881. subseq_sim = 0.0
  882. if len(cell_text_normalized) > 8: # 仅对较长文本启用
  883. subseq_sim = self._calculate_subsequence_score(cell_text_normalized, merged_text_normalized)
  884. # 🛡️ 关键修复:长度和类型防御
  885. if subseq_sim > 80:
  886. len_cell = len(cell_text_normalized)
  887. len_box = len(merged_text_normalized)
  888. # 1. 长度差异过大 (Box 比 Cell 长很多)
  889. if len_box > len_cell * 1.5:
  890. # 2. 且 Cell 是数字/日期/时间类型
  891. import re
  892. if re.match(r'^[\d\-\:\.\s]+$', cell_text_normalized):
  893. # 🧠 智能豁免:如果 Cell 本身很长 (例如 > 12字符),说明是长ID
  894. # 长ID即使夹杂了噪音 (如 "ID...日期...文字"),只要子序列匹配高,通常也是对的
  895. # 只有短文本 (如 "2024") 才需要严格防御
  896. if len_cell < 12:
  897. print(f" ⚠️ 拒绝子序列匹配: 长度差异大且为短数字类型 (sim={subseq_sim})")
  898. subseq_sim = 0.0
  899. else:
  900. print(f" ✅ 接受长ID子序列匹配: 尽管长度差异大,但特征显著 (len={len_cell})")
  901. if subseq_sim > 90:
  902. print(f" 🔗 子序列匹配生效: '{cell_text[:10]}...' (sim={subseq_sim:.1f})")
  903. # 综合得分:取五者最大值
  904. similarity = max(token_sort_sim, partial_sim, token_set_sim, reconstruct_sim, subseq_sim)
  905. # 🎯 子串匹配加分
  906. if is_substring:
  907. similarity = min(100, similarity + 10)
  908. # 🎯 长度惩罚:如果 box 内容比 cell 多太多(例如吞了下一个单元格),扣分
  909. # 注意:token_set_ratio 对长度不敏感,所以这里必须严格检查长度,防止误判
  910. # 只有当 similarity 很高时才检查,防止误杀
  911. if similarity > 80:
  912. len_cell = len(cell_text_normalized)
  913. len_box = len(merged_text_normalized)
  914. # 如果是 token_set_sim 贡献的高分,说明 OCR 里包含了很多噪音
  915. # 我们需要确保这些噪音不是“下一个单元格的内容”
  916. # 这里可以加一个更严格的长度检查,或者检查是否包含换行符等
  917. if len_box > len_cell * 2.0 + 10: # 放宽一点,因为 token_set 本来就是处理噪音的
  918. similarity -= 10 # 稍微扣一点分,表示虽然全找到了,但噪音太多不太完美
  919. if similarity >= self.text_matcher.similarity_threshold:
  920. print(f" ✓ 匹配成功: '{cell_text[:15]}' vs '{box['text'][:15]}' (相似度: {similarity})")
  921. # 由于是模糊匹配,返回第一个未使用的 box 作为 last_index
  922. for b in boxes:
  923. if not b.get('used'):
  924. last_idx = max(boxes.index(b)-1, 0)
  925. break
  926. return self._build_match_result(box['sorted_combo'], box['text'], similarity, max(start_idx, last_idx))
  927. print(f" ✗ 匹配失败: '{cell_text[:15]}'")
  928. return None
  929. def _build_match_result(self, boxes: List[Dict], text: str,
  930. score: float, last_index: int) -> Dict:
  931. """构建匹配结果(使用原始坐标)"""
  932. # 🔑 关键修复:使用 original_bbox(如果存在)
  933. def get_original_bbox(box: Dict) -> List[int]:
  934. return box.get('original_bbox', box['bbox'])
  935. original_bboxes = [get_original_bbox(b) for b in boxes]
  936. merged_bbox = [
  937. min(b[0] for b in original_bboxes),
  938. min(b[1] for b in original_bboxes),
  939. max(b[2] for b in original_bboxes),
  940. max(b[3] for b in original_bboxes)
  941. ]
  942. return {
  943. 'bbox': merged_bbox, # ✅ 使用原始坐标
  944. 'text': text,
  945. 'score': score,
  946. 'paddle_indices': [b['paddle_bbox_index'] for b in boxes],
  947. 'used_boxes': boxes,
  948. 'last_used_index': last_index
  949. }