# table_cell_matcher.py
  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from .text_matcher import TextMatcher
  10. except ImportError:
  11. from text_matcher import TextMatcher
class TableCellMatcher:
    """Matches HTML table cells against PaddleOCR bounding boxes."""

    def __init__(self, text_matcher: TextMatcher,
                 x_tolerance: int = 3,
                 y_tolerance: int = 10):
        """
        Args:
            text_matcher: text matcher used for normalization and similarity checks
            x_tolerance: X-axis tolerance (used when judging column boundaries)
            y_tolerance: Y-axis tolerance (used when grouping boxes into rows)
        """
        self.text_matcher = text_matcher
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
    def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                     start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
        """
        Attach bbox info to an HTML table (optimized: pre-filter the table region).

        Strategy:
        1. Filter paddle_text_boxes down to the table region using table_bbox.
        2. Group the filtered boxes into visual rows.
        3. Smart-match HTML rows against the paddle row groups.
        4. Match individual cells inside each mapped group.

        Args:
            html: HTML table markup
            paddle_text_boxes: full Paddle OCR results
            start_pointer: index into paddle_text_boxes where scanning starts
            table_bbox: table bounding box [x1, y1, x2, y2] (optional)

        Returns:
            (enhanced HTML string, matched cell records, new pointer position)
        """
        soup = BeautifulSoup(html, 'html.parser')
        cells = []
        # Step 1: keep only the paddle boxes that fall inside the table region.
        table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
            paddle_text_boxes[start_pointer:],
            table_bbox,
            html
        )
        if not table_region_boxes:
            print(f"⚠️ 未在表格区域找到 paddle boxes")
            return str(soup), cells, start_pointer
        print(f"📊 表格区域: {len(table_region_boxes)} 个文本框")
        print(f" 边界: {actual_table_bbox}")
        # Step 2: group the region's boxes into rows by y coordinate.
        grouped_boxes = self._group_paddle_boxes_by_rows(
            table_region_boxes,
            y_tolerance=self.y_tolerance,
            auto_correct_skew=True
        )
        # Step 3: sort boxes left-to-right within each row group,
        # and the groups themselves top-to-bottom.
        for group in grouped_boxes:
            group['boxes'].sort(key=lambda x: x['bbox'][0])
        grouped_boxes.sort(key=lambda g: g['y_center'])
        print(f" 分组: {len(grouped_boxes)} 行")
        # Step 4: smart-match HTML rows to paddle row groups.
        html_rows = soup.find_all('tr')
        row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
        print(f" HTML行: {len(html_rows)} 行")
        print(f" 映射: {len([v for v in row_mapping.values() if v])} 个有效映射")
        # Step 5: walk the HTML table, resolving cells through the row mapping.
        for row_idx, row in enumerate(html_rows):
            group_indices = row_mapping.get(row_idx, [])
            if not group_indices:
                continue
            # Merge the boxes of every paddle group mapped to this HTML row.
            current_boxes = []
            for group_idx in group_indices:
                if group_idx < len(grouped_boxes):
                    current_boxes.extend(grouped_boxes[group_idx]['boxes'])
            current_boxes.sort(key=lambda x: x['bbox'][0])
            # Extract the row's HTML cells and pre-compute column boundaries.
            html_cells = row.find_all(['td', 'th'])
            if not html_cells:
                continue
            # Estimate column boundaries from the x-coordinate distribution.
            col_boundaries = self._estimate_column_boundaries(
                current_boxes,
                len(html_cells)
            )
            print(f" 行 {row_idx + 1}: {len(html_cells)} 列,边界: {col_boundaries}")
            # Sequential pointer matching: cells consume boxes left to right.
            box_pointer = 0  # pointer into this row's boxes
            for col_idx, cell in enumerate(html_cells):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue
                # Match starting from the current pointer position.
                matched_result = self._match_cell_sequential(
                    cell_text,
                    current_boxes,
                    col_boundaries,
                    box_pointer
                )
                if matched_result:
                    merged_bbox = matched_result['bbox']
                    merged_text = matched_result['text']
                    cell['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
                    cell['data-score'] = f"{matched_result['score']:.4f}"
                    cell['data-paddle-indices'] = str(matched_result['paddle_indices'])
                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'matched_text': merged_text,
                        'bbox': merged_bbox,
                        'row': row_idx + 1,
                        'col': col_idx + 1,
                        'score': matched_result['score'],
                        'paddle_bbox_indices': matched_result['paddle_indices']
                    })
                    # Mark consumed boxes so they cannot be matched twice.
                    for box in matched_result['used_boxes']:
                        box['used'] = True
                    # Advance the pointer past the last consumed box.
                    box_pointer = matched_result['last_used_index'] + 1
                    print(f" 列 {col_idx + 1}: '{cell_text[:20]}...' 匹配 {len(matched_result['used_boxes'])} 个box (指针: {box_pointer})")
        # New pointer = start + number of boxes consumed in this table.
        used_count = sum(1 for box in table_region_boxes if box.get('used'))
        new_pointer = start_pointer + used_count
        print(f" 匹配: {len(cells)} 个单元格")
        return str(soup), cells, new_pointer
  131. def _estimate_column_boundaries(self, boxes: List[Dict],
  132. num_cols: int) -> List[Tuple[int, int]]:
  133. """
  134. 估算列边界(改进版:处理同列多文本框)
  135. Args:
  136. boxes: 当前行的所有 boxes(已按 x 排序)
  137. num_cols: HTML 表格的列数
  138. Returns:
  139. 列边界列表 [(x_start, x_end), ...]
  140. """
  141. if not boxes:
  142. return []
  143. # 🔑 关键改进:先按 x 坐标聚类(合并同列的多个文本框)
  144. x_clusters = self._cluster_boxes_by_x(boxes, x_tolerance=self.x_tolerance)
  145. print(f" X聚类: {len(boxes)} 个boxes -> {len(x_clusters)} 个列簇")
  146. # 获取所有 x 坐标范围
  147. x_min = min(cluster['x_min'] for cluster in x_clusters)
  148. x_max = max(cluster['x_max'] for cluster in x_clusters)
  149. # 🎯 策略 1: 如果聚类数量<=列数接近
  150. if len(x_clusters) <= num_cols:
  151. # 直接使用聚类边界
  152. boundaries = [(cluster['x_min'], cluster['x_max'])
  153. for cluster in x_clusters]
  154. return boundaries
  155. # 🎯 策略 2: 聚类数多于列数(某些列有多个文本簇)
  156. if len(x_clusters) > num_cols:
  157. print(f" ℹ️ 聚类数 {len(x_clusters)} > 列数 {num_cols},合并相近簇")
  158. # 合并相近的簇
  159. merged_clusters = self._merge_close_clusters(x_clusters, num_cols)
  160. boundaries = [(cluster['x_min'], cluster['x_max'])
  161. for cluster in merged_clusters]
  162. return boundaries
  163. return []
  164. def _cluster_boxes_by_x(self, boxes: List[Dict],
  165. x_tolerance: int = 3) -> List[Dict]:
  166. """
  167. 按 x 坐标聚类(合并同列的多个文本框)
  168. Args:
  169. boxes: 文本框列表
  170. x_tolerance: X坐标容忍度
  171. Returns:
  172. 聚类列表 [{'x_min': int, 'x_max': int, 'boxes': List[Dict]}, ...]
  173. """
  174. if not boxes:
  175. return []
  176. # 按左边界 x 坐标排序
  177. sorted_boxes = sorted(boxes, key=lambda b: b['bbox'][0])
  178. clusters = []
  179. current_cluster = None
  180. for box in sorted_boxes:
  181. bbox = box['bbox']
  182. x_start = bbox[0]
  183. x_end = bbox[2]
  184. if current_cluster is None:
  185. # 开始新簇
  186. current_cluster = {
  187. 'x_min': x_start,
  188. 'x_max': x_end,
  189. 'boxes': [box]
  190. }
  191. else:
  192. # 🔑 检查是否属于当前簇(修正后的逻辑)
  193. # 1. x 坐标有重叠:x_start <= current_x_max 且 x_end >= current_x_min
  194. # 2. 或者距离在容忍度内
  195. has_overlap = (x_start <= current_cluster['x_max'] and
  196. x_end >= current_cluster['x_min'])
  197. is_close = abs(x_start - current_cluster['x_max']) <= x_tolerance
  198. if has_overlap or is_close:
  199. # 合并到当前簇
  200. current_cluster['boxes'].append(box)
  201. current_cluster['x_min'] = min(current_cluster['x_min'], x_start)
  202. current_cluster['x_max'] = max(current_cluster['x_max'], x_end)
  203. else:
  204. # 保存当前簇,开始新簇
  205. clusters.append(current_cluster)
  206. current_cluster = {
  207. 'x_min': x_start,
  208. 'x_max': x_end,
  209. 'boxes': [box]
  210. }
  211. # 添加最后一簇
  212. if current_cluster:
  213. clusters.append(current_cluster)
  214. return clusters
  215. def _merge_close_clusters(self, clusters: List[Dict],
  216. target_count: int) -> List[Dict]:
  217. """
  218. 合并相近的簇,直到数量等于目标列数
  219. Args:
  220. clusters: 聚类列表
  221. target_count: 目标列数
  222. Returns:
  223. 合并后的聚类列表
  224. """
  225. if len(clusters) <= target_count:
  226. return clusters
  227. # 复制一份,避免修改原数据
  228. working_clusters = [c.copy() for c in clusters]
  229. while len(working_clusters) > target_count:
  230. # 找到距离最近的两个簇
  231. min_distance = float('inf')
  232. merge_idx = 0
  233. for i in range(len(working_clusters) - 1):
  234. distance = working_clusters[i + 1]['x_min'] - working_clusters[i]['x_max']
  235. if distance < min_distance:
  236. min_distance = distance
  237. merge_idx = i
  238. # 合并
  239. cluster1 = working_clusters[merge_idx]
  240. cluster2 = working_clusters[merge_idx + 1]
  241. merged_cluster = {
  242. 'x_min': cluster1['x_min'],
  243. 'x_max': cluster2['x_max'],
  244. 'boxes': cluster1['boxes'] + cluster2['boxes']
  245. }
  246. # 替换
  247. working_clusters[merge_idx] = merged_cluster
  248. working_clusters.pop(merge_idx + 1)
  249. return working_clusters
  250. def _get_boxes_in_column(self, boxes: List[Dict],
  251. boundaries: List[Tuple[int, int]],
  252. col_idx: int) -> List[Dict]:
  253. """
  254. 获取指定列范围内的 boxes(改进版:包含重叠)
  255. Args:
  256. boxes: 当前行的所有 boxes
  257. boundaries: 列边界
  258. col_idx: 列索引
  259. Returns:
  260. 该列的 boxes
  261. """
  262. if col_idx >= len(boundaries):
  263. return []
  264. x_start, x_end = boundaries[col_idx]
  265. col_boxes = []
  266. for box in boxes:
  267. bbox = box['bbox']
  268. box_x_start = bbox[0]
  269. box_x_end = bbox[2]
  270. # 🔑 改进:检查是否有重叠(不只是中心点)
  271. overlap = not (box_x_start > x_end or box_x_end < x_start)
  272. if overlap:
  273. col_boxes.append(box)
  274. return col_boxes
    def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
                                      table_bbox: Optional[List[int]],
                                      html: str) -> Tuple[List[Dict], List[int]]:
        """
        Filter the paddle boxes down to the table region.

        Strategy:
        1. With a table_bbox: filter by the (expanded) bounding box.
        2. Without one: infer the region by matching box text to the HTML content.

        Args:
            paddle_boxes: paddle OCR results
            table_bbox: table bounding box [x1, y1, x2, y2]
            html: HTML content (used for content-based inference)

        Returns:
            (filtered boxes, actual table bounding box)
        """
        if not paddle_boxes:
            return [], [0, 0, 0, 0]
        # Strategy 1: use the provided table_bbox, expanded by a margin.
        if table_bbox and len(table_bbox) == 4:
            x1, y1, x2, y2 = table_bbox
            # Expand the boundary to catch text just outside the border.
            margin = 20
            expanded_bbox = [
                max(0, x1 - margin),
                max(0, y1 - margin),
                x2 + margin,
                y2 + margin
            ]
            filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                # Keep boxes whose center lies inside the expanded region.
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    filtered.append(box)
            if filtered:
                # Tight bounding box around the boxes actually kept.
                actual_bbox = [
                    min(b['bbox'][0] for b in filtered),
                    min(b['bbox'][1] for b in filtered),
                    max(b['bbox'][2] for b in filtered),
                    max(b['bbox'][3] for b in filtered)
                ]
                return filtered, actual_bbox
        # Strategy 2: infer the region by content matching.
        print(" ℹ️ 无 table_bbox,使用内容匹配推断表格区域...")
        # Collect the normalized text of every HTML cell.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        html_texts = set()
        for cell in soup.find_all(['td', 'th']):
            text = cell.get_text(strip=True)
            if text:
                html_texts.add(self.text_matcher.normalize_text(text))
        if not html_texts:
            return [], [0, 0, 0, 0]
        # Keep boxes whose text matches some HTML cell text.
        matched_boxes = []
        for box in paddle_boxes:
            normalized_text = self.text_matcher.normalize_text(box['text'])
            # Substring containment in either direction counts as a hit.
            if any(normalized_text in ht or ht in normalized_text
                   for ht in html_texts):
                matched_boxes.append(box)
        if not matched_boxes:
            # Fallback: fuzzy matching when exact matching found nothing.
            print(" ℹ️ 精确匹配失败,尝试模糊匹配...")
            from fuzzywuzzy import fuzz
            for box in paddle_boxes:
                normalized_text = self.text_matcher.normalize_text(box['text'])
                for ht in html_texts:
                    similarity = fuzz.partial_ratio(normalized_text, ht)
                    if similarity >= 70:  # lowered threshold for the fallback
                        matched_boxes.append(box)
                        break
        if matched_boxes:
            # Bounding box of the matched boxes.
            actual_bbox = [
                min(b['bbox'][0] for b in matched_boxes),
                min(b['bbox'][1] for b in matched_boxes),
                max(b['bbox'][2] for b in matched_boxes),
                max(b['bbox'][3] for b in matched_boxes)
            ]
            # Expand the region to include text the matching may have missed.
            margin = 30
            expanded_bbox = [
                max(0, actual_bbox[0] - margin),
                max(0, actual_bbox[1] - margin),
                actual_bbox[2] + margin,
                actual_bbox[3] + margin
            ]
            # Re-filter so boxes on the boundary are included too.
            final_filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    final_filtered.append(box)
            return final_filtered, actual_bbox
        # Last resort: give back every box.
        print(" ⚠️ 无法确定表格区域,使用所有 paddle boxes")
        if paddle_boxes:
            actual_bbox = [
                min(b['bbox'][0] for b in paddle_boxes),
                min(b['bbox'][1] for b in paddle_boxes),
                max(b['bbox'][2] for b in paddle_boxes),
                max(b['bbox'][3] for b in paddle_boxes)
            ]
            return paddle_boxes, actual_bbox
        return [], [0, 0, 0, 0]
    def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
                                    y_tolerance: int = 10,
                                    auto_correct_skew: bool = True) -> List[Dict]:
        """
        Group paddle_text_boxes into rows by y coordinate (clustering) - enhanced.

        Args:
            paddle_boxes: Paddle OCR text boxes
            y_tolerance: y-coordinate tolerance in pixels
            auto_correct_skew: whether to detect and correct document skew first

        Returns:
            List of groups, each {'y_center': float, 'boxes': List[Dict]}
        """
        if not paddle_boxes:
            return []
        # Step 1: detect and correct skew so rows line up horizontally.
        if auto_correct_skew:
            rotation_angle = self._calculate_rotation_angle_from_polys(paddle_boxes)
            if abs(rotation_angle) > 0.5:  # only correct skew above 0.5 degrees
                # Estimate the image size from the box extents.
                max_x = max(box['bbox'][2] for box in paddle_boxes)
                max_y = max(box['bbox'][3] for box in paddle_boxes)
                image_size = (max_x, max_y)
                print(f" 🔧 校正倾斜角度: {rotation_angle:.2f}°")
                paddle_boxes = self._correct_bbox_skew(paddle_boxes, -rotation_angle, image_size)
        # Step 2: cluster boxes by their (corrected) vertical centers.
        boxes_with_y = []
        for box in paddle_boxes:
            bbox = box['bbox']
            y_center = (bbox[1] + bbox[3]) / 2
            boxes_with_y.append({
                'y_center': y_center,
                'box': box
            })
        # Sort top-to-bottom before clustering.
        boxes_with_y.sort(key=lambda x: x['y_center'])
        # Single-pass clustering with the given tolerance.
        groups = []
        current_group = None
        # Tolerance could be adapted dynamically (stricter after skew correction):
        # effective_tolerance = y_tolerance if auto_correct_skew else y_tolerance * 1.5
        for item in boxes_with_y:
            if current_group is None:
                # start a new group
                current_group = {
                    'y_center': item['y_center'],
                    'boxes': [item['box']]
                }
            else:
                if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
                    current_group['boxes'].append(item['box'])
                    # Re-center the group on the mean of its member centers.
                    current_group['y_center'] = sum(
                        (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
                    ) / len(current_group['boxes'])
                else:
                    groups.append(current_group)
                    current_group = {
                        'y_center': item['y_center'],
                        'boxes': [item['box']]
                    }
        if current_group:
            groups.append(current_group)
        print(f" ✓ 分组完成: {len(groups)} 行")
        return groups
    def _calculate_rotation_angle_from_polys(self, paddle_boxes: List[Dict],
                                             sample_ratio: float = 0.5,
                                             outlier_threshold: float = 0.3) -> float:
        """
        Estimate the document skew angle from dt_polys (robust version).

        Args:
            paddle_boxes: Paddle OCR boxes; each may carry a 4-point 'poly'
            sample_ratio: fraction of lines (middle of the page) used for sampling
            outlier_threshold: max deviation (radians) from the median to keep a sample

        Returns:
            Estimated skew angle in degrees (0.0 when reliable samples are scarce)
        """
        if not paddle_boxes:
            return 0.0
        # Step 1: collect per-text-line angles from the top edge of each poly.
        line_angles = []
        for box in paddle_boxes:
            poly = box.get('poly', [])
            if len(poly) < 4:
                continue
            # Two endpoints of the top edge.
            x1, y1 = poly[0]
            x2, y2 = poly[1]
            # Width and height of the text line.
            width = abs(x2 - x1)
            height = abs(poly[2][1] - y1)
            # Filters: short lines are unreliable; vertical text is skipped.
            if width < 50:
                continue
            if width < height * 0.5:
                continue
            # Image coordinates: the y axis points down, hence the sign flip below.
            dx = x2 - x1
            dy = y2 - y1
            if abs(dx) > 10:
                # -arctan2 compensates for the inverted y axis:
                # a line sloping down-right (dy > 0) yields a negative angle.
                angle_rad = -np.arctan2(dy, dx)
                # Keep only small tilts (-15° .. +15°).
                if abs(angle_rad) < np.radians(15):
                    line_angles.append({
                        'angle': angle_rad,
                        'weight': width,  # longer text lines weigh more
                        'y_center': (y1 + poly[2][1]) / 2
                    })
        if len(line_angles) < 5:
            print(" ⚠️ 有效样本不足,跳过倾斜校正")
            return 0.0
        # Step 2: sort by y and keep only the middle band of the page.
        line_angles.sort(key=lambda x: x['y_center'])
        start_idx = int(len(line_angles) * (1 - sample_ratio) / 2)
        end_idx = int(len(line_angles) * (1 + sample_ratio) / 2)
        sampled_angles = line_angles[start_idx:end_idx]
        # Step 3: median angle as the initial estimate.
        raw_angles = [item['angle'] for item in sampled_angles]
        median_angle = np.median(raw_angles)
        # Step 4: drop outliers that deviate too far from the median.
        filtered_angles = []
        for item in sampled_angles:
            if abs(item['angle'] - median_angle) < outlier_threshold:
                filtered_angles.append(item)
        if len(filtered_angles) < 3:
            print(" ⚠️ 过滤后样本不足")
            return np.degrees(median_angle)
        # Step 5: weighted mean (longer lines weigh more).
        total_weight = sum(item['weight'] for item in filtered_angles)
        weighted_angle = sum(
            item['angle'] * item['weight'] for item in filtered_angles
        ) / total_weight
        angle_deg = np.degrees(weighted_angle)
        print(f" 📐 倾斜角度检测:")
        print(f" • 原始样本: {len(line_angles)} 个")
        print(f" • 中间采样: {len(sampled_angles)} 个")
        print(f" • 过滤后: {len(filtered_angles)} 个")
        print(f" • 中位数角度: {np.degrees(median_angle):.3f}°")
        print(f" • 加权平均: {angle_deg:.3f}°")
        return angle_deg
  524. def _rotate_point(self, point: Tuple[float, float],
  525. angle_deg: float,
  526. center: Tuple[float, float] = (0, 0)) -> Tuple[float, float]:
  527. """
  528. 旋转点坐标
  529. Args:
  530. point: 原始点 (x, y)
  531. angle_deg: 旋转角度(度数,正值表示逆时针)
  532. center: 旋转中心
  533. Returns:
  534. 旋转后的点 (x', y')
  535. """
  536. x, y = point
  537. cx, cy = center
  538. # 转换为弧度
  539. angle_rad = np.radians(angle_deg)
  540. # 平移到原点
  541. x -= cx
  542. y -= cy
  543. # 旋转
  544. x_new = x * np.cos(angle_rad) - y * np.sin(angle_rad)
  545. y_new = x * np.sin(angle_rad) + y * np.cos(angle_rad)
  546. # 平移回去
  547. x_new += cx
  548. y_new += cy
  549. return (x_new, y_new)
  550. def _correct_bbox_skew(self, paddle_boxes: List[Dict],
  551. rotation_angle: float,
  552. image_size: Tuple[int, int]) -> List[Dict]:
  553. """
  554. 校正文本框的倾斜
  555. Args:
  556. paddle_boxes: Paddle OCR 结果
  557. rotation_angle: 倾斜角度
  558. image_size: 图像尺寸 (width, height)
  559. Returns:
  560. 校正后的文本框列表
  561. """
  562. if abs(rotation_angle) < 0.1: # 倾斜角度很小,不需要校正
  563. return paddle_boxes
  564. width, height = image_size
  565. center = (width / 2, height / 2)
  566. corrected_boxes = []
  567. for box in paddle_boxes:
  568. poly = box.get('poly', [])
  569. if len(poly) < 4:
  570. corrected_boxes.append(box)
  571. continue
  572. # 🎯 旋转多边形的四个角点
  573. rotated_poly = [
  574. self._rotate_point(point, -rotation_angle, center)
  575. for point in poly
  576. ]
  577. # 重新计算 bbox
  578. x_coords = [p[0] for p in rotated_poly]
  579. y_coords = [p[1] for p in rotated_poly]
  580. corrected_bbox = [
  581. min(x_coords),
  582. min(y_coords),
  583. max(x_coords),
  584. max(y_coords)
  585. ]
  586. # 创建校正后的 box
  587. corrected_box = box.copy()
  588. corrected_box['bbox'] = corrected_bbox
  589. corrected_box['poly'] = rotated_poly
  590. corrected_box['original_bbox'] = box['bbox'] # 保存原始坐标
  591. corrected_boxes.append(corrected_box)
  592. return corrected_boxes
    def _match_html_rows_to_paddle_groups(self, html_rows: List,
                                          grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
        """
        Smart-match HTML rows to paddle row groups (supports skipping noise
        groups, with anti-greedy penalties).

        Returns:
            Mapping of HTML row index -> list of paddle group indices.
        """
        if not html_rows or not grouped_boxes:
            return {}
        mapping = {}
        # Pass 1a: equal counts — trivial 1:1 mapping.
        if len(html_rows) == len(grouped_boxes):
            for i in range(len(html_rows)):
                mapping[i] = [i]
            return mapping
        # Pass 1b: content-based monotonic matching with a skip window.
        from fuzzywuzzy import fuzz
        used_groups = set()
        next_group_to_check = 0
        for row_idx, row in enumerate(html_rows):
            row_cells = row.find_all(['td', 'th'])
            row_texts = [cell.get_text(strip=True) for cell in row_cells]
            row_texts = [t for t in row_texts if t]
            # First cell text (usually the item name) gets extra weight.
            row_header = row_texts[0] if row_texts else ""
            if not row_texts:
                mapping[row_idx] = []
                continue
            row_text_normalized = [self.text_matcher.normalize_text(t) for t in row_texts]
            row_combined_text = ''.join(row_text_normalized)
            best_groups = []
            best_score = 0
            # Dynamic skip window: the first row may skip many groups
            # (document titles above the table), later rows only a few (noise).
            max_skip = 15 if row_idx == 0 else 5
            # Try each allowed number of skipped groups.
            for skip in range(max_skip + 1):
                start_group = next_group_to_check + skip
                if start_group >= len(grouped_boxes):
                    break
                # Try merging 1..5 consecutive groups into one HTML row.
                max_merge_window = 5
                for group_count in range(1, max_merge_window + 1):
                    end_group = start_group + group_count
                    if end_group > len(grouped_boxes):
                        break
                    combined_group_indices = list(range(start_group, end_group))
                    # Collect the unused texts of the candidate groups.
                    combined_texts = []
                    for g_idx in combined_group_indices:
                        group_boxes = grouped_boxes[g_idx].get('boxes', [])
                        for box in group_boxes:
                            if box.get('used'):
                                continue
                            normalized_text = self.text_matcher.normalize_text(box.get('text', ''))
                            if normalized_text:
                                combined_texts.append(normalized_text)
                    if not combined_texts:
                        continue
                    paddle_combined_text = ''.join(combined_texts)
                    # --- scoring ---
                    match_count = 0
                    # 1. cell coverage: how many HTML cells appear in the group text
                    for rt in row_text_normalized:
                        if len(rt) < 2:
                            continue
                        if rt in paddle_combined_text:
                            match_count += 1
                            continue
                        for ct in combined_texts:
                            if fuzz.partial_ratio(rt, ct) >= 80:
                                match_count += 1
                                break
                    coverage = match_count / len(row_texts) if row_texts else 0
                    # 2. whole-row similarity
                    row_similarity = fuzz.partial_ratio(row_combined_text, paddle_combined_text) / 100.0
                    # 3. header match (weighted highest)
                    header_score = 0
                    if len(row_header) > 1:
                        if row_header in paddle_combined_text:
                            header_score = 1.0
                        else:
                            header_sim = fuzz.partial_ratio(row_header, paddle_combined_text)
                            if header_sim > 80:
                                header_score = 0.8
                            else:
                                header_score = 0.5
                    final_score = (coverage * 0.3) + (row_similarity * 0.3) + (header_score * 0.4)
                    # Penalties: prefer not skipping over skipping, and fewer
                    # merged groups over more.
                    merge_penalty = (group_count - 1) * 0.05
                    skip_penalty = skip * 0.02
                    adjusted_score = final_score - merge_penalty - skip_penalty
                    if adjusted_score > best_score:
                        best_score = adjusted_score
                        best_groups = combined_group_indices
                    # Early stop: a near-perfect single group needs no merging.
                    if group_count == 1 and final_score > 0.85:
                        break
                # Early stop: a very good match at this skip means larger skips
                # would only risk stealing a later row's groups.
                if best_score > 0.85:
                    break
            # Accept the best candidate when it clears the threshold.
            if best_groups and best_score >= 0.4:
                mapping[row_idx] = best_groups
                used_groups.update(best_groups)
                next_group_to_check = max(best_groups) + 1
                print(f" ✓ 行 {row_idx} ('{row_header[:10]}...'): 匹配组 {best_groups} (得分: {best_score:.2f})")
            else:
                mapping[row_idx] = []
                # No match: keep next_group_to_check unchanged so the next row
                # still gets a chance at these groups.
                print(f" ✗ 行 {row_idx} ('{row_header[:10]}...'): 无匹配 (最佳得分: {best_score:.2f})")
        # Pass 2: attach leftover (unmatched) paddle groups to neighbor rows.
        unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
        if unused_groups:
            print(f" ℹ️ 发现 {len(unused_groups)} 个未匹配的 paddle 组: {unused_groups}")
            # Merge each unused group into the nearest matched neighbor row.
            for unused_idx in unused_groups:
                # Edge distances to the nearest used groups above and below.
                unused_group = grouped_boxes[unused_idx]
                unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
                unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
                above_idx = None
                below_idx = None
                above_distance = float('inf')
                below_distance = float('inf')
                # Search upward for the closest used group.
                for i in range(unused_idx - 1, -1, -1):
                    if i in used_groups:
                        above_idx = i
                        # Distance: unused top edge vs the bottom-most box above.
                        above_group = grouped_boxes[i]
                        max_y_box = max(
                            above_group['boxes'],
                            key=lambda b: b['bbox'][3]
                        )
                        above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
                        above_distance = abs(unused_y_min - above_y_center)
                        print(f" • 组 {unused_idx} 与上方组 {i} 距离: {above_distance:.1f}px")
                        break
                # Search downward for the closest used group.
                for i in range(unused_idx + 1, len(grouped_boxes)):
                    if i in used_groups:
                        below_idx = i
                        # Distance: the top-most box below vs unused bottom edge.
                        below_group = grouped_boxes[i]
                        min_y_box = min(
                            below_group['boxes'],
                            key=lambda b: b['bbox'][1]
                        )
                        below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
                        below_distance = abs(below_y_center - unused_y_max)
                        print(f" • 组 {unused_idx} 与下方组 {i} 距离: {below_distance:.1f}px")
                        break
                # Pick the closer side.
                if above_idx is not None and below_idx is not None:
                    if above_distance < below_distance:
                        closest_used_idx = above_idx
                        merge_direction = "上方"
                    else:
                        closest_used_idx = below_idx
                        merge_direction = "下方"
                    print(f" ✓ 组 {unused_idx} 选择合并到{merge_direction}组 {closest_used_idx}")
                elif above_idx is not None:
                    closest_used_idx = above_idx
                    merge_direction = "上方"
                elif below_idx is not None:
                    closest_used_idx = below_idx
                    merge_direction = "下方"
                else:
                    print(f" ⚠️ 组 {unused_idx} 无相邻已使用组,跳过")
                    continue
                # Find the HTML row that owns the chosen neighbor group.
                target_html_row = None
                for html_row_idx, group_indices in mapping.items():
                    if closest_used_idx in group_indices:
                        target_html_row = html_row_idx
                        break
                if target_html_row is not None:
                    # Append this group to the owning HTML row.
                    if merge_direction == "上方":
                        if target_html_row in mapping:
                            if unused_idx not in mapping[target_html_row]:
                                mapping[target_html_row].append(unused_idx)
                                print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}(上方行)")
                    else:
                        if target_html_row in mapping:
                            if unused_idx not in mapping[target_html_row]:
                                mapping[target_html_row].append(unused_idx)
                                print(f" • 组 {unused_idx} 合并到 HTML 行 {target_html_row}(下方行)")
                    used_groups.add(unused_idx)
        # Pass 3: keep each row's group indices sorted top-to-bottom.
        for row_idx in mapping:
            if mapping[row_idx]:
                mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
        return mapping
  791. def _preprocess_close_groups(self, grouped_boxes: List[Dict],
  792. y_gap_threshold: int = 10) -> List[List[int]]:
  793. """
  794. 🆕 预处理:将 y 间距很小的组预合并
  795. Args:
  796. grouped_boxes: 原始分组
  797. y_gap_threshold: Y 间距阈值(小于此值认为是同一行)
  798. Returns:
  799. 预处理后的组索引列表 [[0,1], [2], [3,4,5], ...]
  800. """
  801. if not grouped_boxes:
  802. return []
  803. preprocessed = []
  804. current_group = [0]
  805. for i in range(1, len(grouped_boxes)):
  806. prev_group = grouped_boxes[i - 1]
  807. curr_group = grouped_boxes[i]
  808. # 计算间距
  809. prev_y_max = max(b['bbox'][3] for b in prev_group['boxes'])
  810. curr_y_min = min(b['bbox'][1] for b in curr_group['boxes'])
  811. gap = abs(curr_y_min - prev_y_max)
  812. if gap <= y_gap_threshold:
  813. # 间距很小,合并
  814. current_group.append(i)
  815. print(f" 预合并: 组 {i-1} 和 {i} (间距: {gap}px)")
  816. else:
  817. # 间距较大,开始新组
  818. preprocessed.append(current_group)
  819. current_group = [i]
  820. # 添加最后一组
  821. if current_group:
  822. preprocessed.append(current_group)
  823. return preprocessed
  824. def _match_cell_sequential(self, cell_text: str,
  825. boxes: List[Dict],
  826. col_boundaries: List[Tuple[int, int]],
  827. start_idx: int) -> Optional[Dict]:
  828. """
  829. 🎯 顺序匹配单元格:从指定位置开始,逐步合并 boxes 直到匹配
  830. 策略:
  831. 1. 找到第一个未使用的 box
  832. 2. 尝试单个 box 精确匹配
  833. 3. 如果失败,尝试合并多个 boxes
  834. Args:
  835. cell_text: HTML 单元格文本
  836. boxes: 候选 boxes(已按 x 坐标排序)
  837. col_boundaries: 列边界列表
  838. start_idx: 起始索引
  839. Returns:
  840. {'bbox': [x1,y1,x2,y2], 'text': str, 'score': float,
  841. 'paddle_indices': [idx1, idx2], 'used_boxes': [box1, box2],
  842. 'last_used_index': int}
  843. """
  844. from fuzzywuzzy import fuzz
  845. cell_text_normalized = self.text_matcher.normalize_text(cell_text)
  846. if len(cell_text_normalized) < 2:
  847. return None
  848. # 🔑 找到第一个未使用的 box
  849. first_unused_idx = start_idx
  850. while first_unused_idx < len(boxes) and boxes[first_unused_idx].get('used'):
  851. first_unused_idx += 1
  852. if first_unused_idx >= len(boxes):
  853. return None
  854. # 🔑 策略 1: 单个 box 精确匹配
  855. for box in boxes[first_unused_idx:]:
  856. if box.get('used'):
  857. continue
  858. box_text = self.text_matcher.normalize_text(box['text'])
  859. if cell_text_normalized == box_text:
  860. return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
  861. # 🔑 策略 2: 多个 boxes 合并匹配
  862. unused_boxes = [b for b in boxes if not b.get('used')]
  863. # 合并同列的 boxes 合并
  864. merged_bboxes = []
  865. for col_idx in range(len(col_boundaries)):
  866. combo_boxes = self._get_boxes_in_column(unused_boxes, col_boundaries, col_idx)
  867. if len(combo_boxes) > 0:
  868. sorted_combo = sorted(combo_boxes, key=lambda b: (b['bbox'][1], b['bbox'][0]))
  869. merged_text = ''.join([b['text'] for b in sorted_combo])
  870. merged_bboxes.append({
  871. 'text': merged_text,
  872. 'sorted_combo': sorted_combo
  873. })
  874. for box in merged_bboxes:
  875. # 1. 精确匹配
  876. merged_text_normalized = self.text_matcher.normalize_text(box['text'])
  877. if cell_text_normalized == merged_text_normalized:
  878. last_sort_idx = boxes.index(box['sorted_combo'][-1])
  879. return self._build_match_result(box['sorted_combo'], box['text'], 100.0, last_sort_idx)
  880. # 2. 子串匹配
  881. is_substring = (cell_text_normalized in merged_text_normalized or
  882. merged_text_normalized in cell_text_normalized)
  883. # 3. 模糊匹配
  884. similarity = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
  885. # 🎯 子串匹配加分
  886. if is_substring:
  887. similarity = min(100, similarity + 10)
  888. if similarity >= self.text_matcher.similarity_threshold:
  889. print(f" ✓ 匹配成功: '{cell_text[:15]}' vs '{merged_text[:15]}' (相似度: {similarity})")
  890. return self._build_match_result(box['sorted_combo'], box['text'], similarity, start_idx)
  891. print(f" ✗ 匹配失败: '{cell_text[:15]}'")
  892. return None
  893. def _build_match_result(self, boxes: List[Dict], text: str,
  894. score: float, last_index: int) -> Dict:
  895. """构建匹配结果(使用原始坐标)"""
  896. # 🔑 关键修复:使用 original_bbox(如果存在)
  897. def get_original_bbox(box: Dict) -> List[int]:
  898. return box.get('original_bbox', box['bbox'])
  899. original_bboxes = [get_original_bbox(b) for b in boxes]
  900. merged_bbox = [
  901. min(b[0] for b in original_bboxes),
  902. min(b[1] for b in original_bboxes),
  903. max(b[2] for b in original_bboxes),
  904. max(b[3] for b in original_bboxes)
  905. ]
  906. return {
  907. 'bbox': merged_bbox, # ✅ 使用原始坐标
  908. 'text': text,
  909. 'score': score,
  910. 'paddle_indices': [b['paddle_bbox_index'] for b in boxes],
  911. 'used_boxes': boxes,
  912. 'last_used_index': last_index
  913. }