  1. """
  2. 表格单元格匹配器
  3. 负责将 HTML 表格单元格与 PaddleOCR bbox 进行匹配
  4. """
  5. from typing import List, Dict, Tuple, Optional
  6. from bs4 import BeautifulSoup
  7. import numpy as np
  8. try:
  9. from .text_matcher import TextMatcher
  10. except ImportError:
  11. from text_matcher import TextMatcher
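
# NOTE: the structure of each `paddle_text_boxes` entry is not defined in this
# file; based on how the boxes are accessed below, each entry is assumed to be
# a dict shaped roughly like:
#     {
#         'text': '...',                 # recognized text
#         'bbox': [x1, y1, x2, y2],      # axis-aligned bounding box
#         'poly': [(x, y), ...],         # 4 corner points (dt_polys)
#         'paddle_bbox_index': 0,        # index into the full OCR result list
#     }
# The keys 'used' and 'original_bbox' are added by this module during matching.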


class TableCellMatcher:
    """Table cell matcher."""

    def __init__(self, text_matcher: TextMatcher,
                 x_tolerance: int = 3,
                 y_tolerance: int = 10):
        """
        Args:
            text_matcher: text matcher
            x_tolerance: X-axis tolerance (used for column boundary checks)
            y_tolerance: Y-axis tolerance (used for row grouping)
        """
        self.text_matcher = text_matcher
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance

    def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                     start_pointer: int,
                                     table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
        """
        Add bbox information to an HTML table (optimized: filter the table region first).

        Strategy:
        1. Use table_bbox to filter paddle_text_boxes down to the table region.
        2. Group the filtered boxes into rows.
        3. Match HTML rows to paddle row groups.
        4. Look up cells within the matched groups.

        Args:
            html: HTML table
            paddle_text_boxes: all paddle OCR results
            start_pointer: start position
            table_bbox: table bounding box [x1, y1, x2, y2]
        """
        soup = BeautifulSoup(html, 'html.parser')
        cells = []

        # 🔑 Step 1: filter paddle boxes inside the table region
        table_region_boxes, actual_table_bbox = self._filter_boxes_in_table_region(
            paddle_text_boxes[start_pointer:],
            table_bbox,
            html
        )
        if not table_region_boxes:
            print("⚠️ No paddle boxes found in the table region")
            return str(soup), cells, start_pointer
        print(f"📊 Table region: {len(table_region_boxes)} text boxes")
        print(f"   Bounds: {actual_table_bbox}")

        # 🔑 Step 2: group the table-region boxes into rows
        grouped_boxes = self._group_paddle_boxes_by_rows(
            table_region_boxes,
            y_tolerance=self.y_tolerance,
            auto_correct_skew=True
        )

        # 🔑 Step 3: sort each group by x coordinate
        for group in grouped_boxes:
            group['boxes'].sort(key=lambda x: x['bbox'][0])
        grouped_boxes.sort(key=lambda g: g['y_center'])
        print(f"   Groups: {len(grouped_boxes)} rows")

        # 🔑 Step 4: match HTML rows to paddle row groups
        html_rows = soup.find_all('tr')
        row_mapping = self._match_html_rows_to_paddle_groups(html_rows, grouped_boxes)
        print(f"   HTML rows: {len(html_rows)}")
        print(f"   Mapping: {len([v for v in row_mapping.values() if v])} valid mappings")

        # 🔑 Step 5: walk the HTML table and look up cells via the mapping
        for row_idx, row in enumerate(html_rows):
            group_indices = row_mapping.get(row_idx, [])
            if not group_indices:
                continue

            # Merge boxes from all mapped groups
            current_boxes = []
            for group_idx in group_indices:
                if group_idx < len(grouped_boxes):
                    current_boxes.extend(grouped_boxes[group_idx]['boxes'])
            current_boxes.sort(key=lambda x: x['bbox'][0])

            # 🎯 Extract HTML cells and pre-compute column boundaries
            html_cells = row.find_all(['td', 'th'])
            if not html_cells:
                continue

            # 🔑 Estimate column boundaries from the x-coordinate distribution
            col_boundaries = self._estimate_column_boundaries(
                current_boxes,
                len(html_cells)
            )
            print(f"   Row {row_idx + 1}: {len(html_cells)} columns, boundaries: {col_boundaries}")

            # 🎯 Sequential pointer matching
            box_pointer = 0  # pointer into the current row's boxes
            for col_idx, cell in enumerate(html_cells):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue

                # 🔑 Match starting from the current pointer
                matched_result = self._match_cell_sequential(
                    cell_text,
                    current_boxes,
                    col_boundaries,
                    box_pointer
                )
                if matched_result:
                    merged_bbox = matched_result['bbox']
                    merged_text = matched_result['text']
                    cell['data-bbox'] = f"[{merged_bbox[0]},{merged_bbox[1]},{merged_bbox[2]},{merged_bbox[3]}]"
                    cell['data-score'] = f"{matched_result['score']:.4f}"
                    cell['data-paddle-indices'] = str(matched_result['paddle_indices'])
                    cells.append({
                        'type': 'table_cell',
                        'text': cell_text,
                        'matched_text': merged_text,
                        'bbox': merged_bbox,
                        'row': row_idx + 1,
                        'col': col_idx + 1,
                        'score': matched_result['score'],
                        'paddle_bbox_indices': matched_result['paddle_indices']
                    })
                    # Mark the boxes as used
                    for box in matched_result['used_boxes']:
                        box['used'] = True
                    # 🎯 Advance the pointer past the last used box
                    box_pointer = matched_result['last_used_index'] + 1
                    print(f"      Col {col_idx + 1}: '{cell_text[:20]}...' matched "
                          f"{len(matched_result['used_boxes'])} boxes (pointer: {box_pointer})")

        # Compute the new pointer position
        used_count = sum(1 for box in table_region_boxes if box.get('used'))
        new_pointer = start_pointer + used_count
        print(f"   Matched: {len(cells)} cells")
        return str(soup), cells, new_pointer

    def _estimate_column_boundaries(self, boxes: List[Dict],
                                    num_cols: int) -> List[Tuple[int, int]]:
        """
        Estimate column boundaries (improved: handles multiple text boxes per column).

        Args:
            boxes: all boxes in the current row (already sorted by x)
            num_cols: number of columns in the HTML table

        Returns:
            List of column boundaries [(x_start, x_end), ...]
        """
        if not boxes:
            return []

        # 🔑 Cluster by x coordinate first (merges multiple text boxes in the same column)
        x_clusters = self._cluster_boxes_by_x(boxes, x_tolerance=self.x_tolerance)
        print(f"      X clustering: {len(boxes)} boxes -> {len(x_clusters)} column clusters")

        # Overall x range of the row (kept for reference)
        x_min = min(cluster['x_min'] for cluster in x_clusters)
        x_max = max(cluster['x_max'] for cluster in x_clusters)

        # 🎯 Strategy 1: cluster count <= column count, use the cluster bounds directly
        if len(x_clusters) <= num_cols:
            boundaries = [(cluster['x_min'], cluster['x_max'])
                          for cluster in x_clusters]
            return boundaries

        # 🎯 Strategy 2: more clusters than columns (some columns contain several text clusters)
        if len(x_clusters) > num_cols:
            print(f"      ℹ️ {len(x_clusters)} clusters > {num_cols} columns, merging close clusters")
            merged_clusters = self._merge_close_clusters(x_clusters, num_cols)
            boundaries = [(cluster['x_min'], cluster['x_max'])
                          for cluster in merged_clusters]
            return boundaries

        return []

    def _cluster_boxes_by_x(self, boxes: List[Dict],
                            x_tolerance: int = 3) -> List[Dict]:
        """
        Cluster boxes by x coordinate (merges multiple text boxes in the same column).

        Args:
            boxes: list of text boxes
            x_tolerance: x-coordinate tolerance

        Returns:
            List of clusters [{'x_min': int, 'x_max': int, 'boxes': List[Dict]}, ...]
        """
        if not boxes:
            return []

        # Sort by the left x coordinate
        sorted_boxes = sorted(boxes, key=lambda b: b['bbox'][0])
        clusters = []
        current_cluster = None
        for box in sorted_boxes:
            bbox = box['bbox']
            x_start = bbox[0]
            x_end = bbox[2]
            if current_cluster is None:
                # Start a new cluster
                current_cluster = {
                    'x_min': x_start,
                    'x_max': x_end,
                    'boxes': [box]
                }
            else:
                # 🔑 Does the box belong to the current cluster?
                # 1. x ranges overlap: x_start <= current x_max and x_end >= current x_min
                # 2. or the gap is within the tolerance
                has_overlap = (x_start <= current_cluster['x_max'] and
                               x_end >= current_cluster['x_min'])
                is_close = abs(x_start - current_cluster['x_max']) <= x_tolerance
                if has_overlap or is_close:
                    # Merge into the current cluster
                    current_cluster['boxes'].append(box)
                    current_cluster['x_min'] = min(current_cluster['x_min'], x_start)
                    current_cluster['x_max'] = max(current_cluster['x_max'], x_end)
                else:
                    # Close the current cluster and start a new one
                    clusters.append(current_cluster)
                    current_cluster = {
                        'x_min': x_start,
                        'x_max': x_end,
                        'boxes': [box]
                    }
        # Append the last cluster
        if current_cluster:
            clusters.append(current_cluster)
        return clusters

    def _merge_close_clusters(self, clusters: List[Dict],
                              target_count: int) -> List[Dict]:
        """
        Merge the closest clusters until the count equals the target column count.

        Args:
            clusters: list of clusters
            target_count: target column count

        Returns:
            Merged cluster list
        """
        if len(clusters) <= target_count:
            return clusters

        # Work on a copy to avoid mutating the input
        working_clusters = [c.copy() for c in clusters]
        while len(working_clusters) > target_count:
            # Find the two closest neighbouring clusters
            min_distance = float('inf')
            merge_idx = 0
            for i in range(len(working_clusters) - 1):
                distance = working_clusters[i + 1]['x_min'] - working_clusters[i]['x_max']
                if distance < min_distance:
                    min_distance = distance
                    merge_idx = i
            # Merge them
            cluster1 = working_clusters[merge_idx]
            cluster2 = working_clusters[merge_idx + 1]
            merged_cluster = {
                'x_min': cluster1['x_min'],
                'x_max': cluster2['x_max'],
                'boxes': cluster1['boxes'] + cluster2['boxes']
            }
            # Replace the pair with the merged cluster
            working_clusters[merge_idx] = merged_cluster
            working_clusters.pop(merge_idx + 1)
        return working_clusters

    def _get_boxes_in_column(self, boxes: List[Dict],
                             boundaries: List[Tuple[int, int]],
                             col_idx: int) -> List[Dict]:
        """
        Get the boxes that fall within a column range (improved: includes overlaps).

        Args:
            boxes: all boxes in the current row
            boundaries: column boundaries
            col_idx: column index

        Returns:
            Boxes belonging to that column
        """
        if col_idx >= len(boundaries):
            return []
        x_start, x_end = boundaries[col_idx]
        col_boxes = []
        for box in boxes:
            bbox = box['bbox']
            box_x_start = bbox[0]
            box_x_end = bbox[2]
            # 🔑 Check for any overlap, not just the centre point
            overlap = not (box_x_start > x_end or box_x_end < x_start)
            if overlap:
                col_boxes.append(box)
        return col_boxes

    def _filter_boxes_in_table_region(self, paddle_boxes: List[Dict],
                                      table_bbox: Optional[List[int]],
                                      html: str) -> Tuple[List[Dict], List[int]]:
        """
        Filter paddle boxes down to the table region.

        Strategy:
        1. If table_bbox is available, filter by the (expanded) bounding box.
        2. Otherwise, infer the region by matching content against the HTML.

        Args:
            paddle_boxes: paddle OCR results
            table_bbox: table bounding box [x1, y1, x2, y2]
            html: HTML content (used for content-based validation)

        Returns:
            (filtered boxes, actual table bounding box)
        """
        if not paddle_boxes:
            return [], [0, 0, 0, 0]

        # 🎯 Strategy 1: use the provided table_bbox (with an expanded margin)
        if table_bbox and len(table_bbox) == 4:
            x1, y1, x2, y2 = table_bbox
            # Expand the bounds to catch text just outside the border
            margin = 20
            expanded_bbox = [
                max(0, x1 - margin),
                max(0, y1 - margin),
                x2 + margin,
                y2 + margin
            ]
            filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                # Keep boxes whose centre lies inside the expanded region
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    filtered.append(box)
            if filtered:
                # Actual bounding box of the kept boxes
                actual_bbox = [
                    min(b['bbox'][0] for b in filtered),
                    min(b['bbox'][1] for b in filtered),
                    max(b['bbox'][2] for b in filtered),
                    max(b['bbox'][3] for b in filtered)
                ]
                return filtered, actual_bbox

        # 🎯 Strategy 2: infer the region by content matching
        print("   ℹ️ No table_bbox, inferring the table region by content matching...")
        # Collect all cell texts from the HTML
        soup = BeautifulSoup(html, 'html.parser')
        html_texts = set()
        for cell in soup.find_all(['td', 'th']):
            text = cell.get_text(strip=True)
            if text:
                html_texts.add(self.text_matcher.normalize_text(text))
        if not html_texts:
            return [], [0, 0, 0, 0]

        # Find boxes whose text matches the HTML content
        matched_boxes = []
        for box in paddle_boxes:
            normalized_text = self.text_matcher.normalize_text(box['text'])
            if any(normalized_text in ht or ht in normalized_text
                   for ht in html_texts):
                matched_boxes.append(box)

        if not matched_boxes:
            # 🔑 Fallback: if exact matching fails, try fuzzy matching
            print("   ℹ️ Exact matching failed, trying fuzzy matching...")
            from fuzzywuzzy import fuzz
            for box in paddle_boxes:
                normalized_text = self.text_matcher.normalize_text(box['text'])
                for ht in html_texts:
                    similarity = fuzz.partial_ratio(normalized_text, ht)
                    if similarity >= 70:  # lowered threshold
                        matched_boxes.append(box)
                        break

        if matched_boxes:
            # Bounding box of the matched boxes
            actual_bbox = [
                min(b['bbox'][0] for b in matched_boxes),
                min(b['bbox'][1] for b in matched_boxes),
                max(b['bbox'][2] for b in matched_boxes),
                max(b['bbox'][3] for b in matched_boxes)
            ]
            # 🔑 Expand the bounds to include text that may have been missed
            margin = 30
            expanded_bbox = [
                max(0, actual_bbox[0] - margin),
                max(0, actual_bbox[1] - margin),
                actual_bbox[2] + margin,
                actual_bbox[3] + margin
            ]
            # Re-filter, including text on the boundary
            final_filtered = []
            for box in paddle_boxes:
                bbox = box['bbox']
                box_center_x = (bbox[0] + bbox[2]) / 2
                box_center_y = (bbox[1] + bbox[3]) / 2
                if (expanded_bbox[0] <= box_center_x <= expanded_bbox[2] and
                        expanded_bbox[1] <= box_center_y <= expanded_bbox[3]):
                    final_filtered.append(box)
            return final_filtered, actual_bbox

        # 🔑 Last resort: return all boxes
        print("   ⚠️ Could not determine the table region, using all paddle boxes")
        if paddle_boxes:
            actual_bbox = [
                min(b['bbox'][0] for b in paddle_boxes),
                min(b['bbox'][1] for b in paddle_boxes),
                max(b['bbox'][2] for b in paddle_boxes),
                max(b['bbox'][3] for b in paddle_boxes)
            ]
            return paddle_boxes, actual_bbox
        return [], [0, 0, 0, 0]

    def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
                                    y_tolerance: int = 10,
                                    auto_correct_skew: bool = True) -> List[Dict]:
        """
        Group paddle_text_boxes by y coordinate (clustering) - enhanced version.

        Args:
            paddle_boxes: Paddle OCR text box list
            y_tolerance: y-coordinate tolerance in pixels
            auto_correct_skew: whether to automatically correct skew

        Returns:
            List of groups, each {'y_center': float, 'boxes': List[Dict]}
        """
        if not paddle_boxes:
            return []

        # 🎯 Step 1: detect and correct skew
        if auto_correct_skew:
            rotation_angle = self._calculate_rotation_angle_from_polys(paddle_boxes)
            if abs(rotation_angle) > 0.5:  # only correct when the skew exceeds 0.5 degrees
                # Estimate the image size from the boxes
                max_x = max(box['bbox'][2] for box in paddle_boxes)
                max_y = max(box['bbox'][3] for box in paddle_boxes)
                image_size = (max_x, max_y)
                print(f"   🔧 Correcting skew angle: {rotation_angle:.2f}°")
                paddle_boxes = self._correct_bbox_skew(paddle_boxes, -rotation_angle, image_size)

        # 🎯 Step 2: group by the (corrected) y coordinate
        boxes_with_y = []
        for box in paddle_boxes:
            bbox = box['bbox']
            y_center = (bbox[1] + bbox[3]) / 2
            boxes_with_y.append({
                'y_center': y_center,
                'box': box
            })
        # Sort by y coordinate
        boxes_with_y.sort(key=lambda x: x['y_center'])

        # Cluster with the given tolerance
        groups = []
        current_group = None
        # 🔑 The tolerance could be tuned dynamically (stricter after skew correction):
        # effective_tolerance = y_tolerance if auto_correct_skew else y_tolerance * 1.5
        for item in boxes_with_y:
            if current_group is None:
                # Start a new group
                current_group = {
                    'y_center': item['y_center'],
                    'boxes': [item['box']]
                }
            else:
                if abs(item['y_center'] - current_group['y_center']) <= y_tolerance:
                    current_group['boxes'].append(item['box'])
                    # Update the group centre
                    current_group['y_center'] = sum(
                        (b['bbox'][1] + b['bbox'][3]) / 2 for b in current_group['boxes']
                    ) / len(current_group['boxes'])
                else:
                    groups.append(current_group)
                    current_group = {
                        'y_center': item['y_center'],
                        'boxes': [item['box']]
                    }
        if current_group:
            groups.append(current_group)
        print(f"   ✓ Grouping done: {len(groups)} rows")
        return groups

    def _calculate_rotation_angle_from_polys(self, paddle_boxes: List[Dict],
                                             sample_ratio: float = 0.5,
                                             outlier_threshold: float = 0.3) -> float:
        """
        Compute the document skew angle from dt_polys (improved, more robust version).
        """
        if not paddle_boxes:
            return 0.0

        # 🎯 Step 1: collect the skew angle of each text line
        line_angles = []
        for box in paddle_boxes:
            poly = box.get('poly', [])
            if len(poly) < 4:
                continue
            # The two points of the top edge
            x1, y1 = poly[0]
            x2, y2 = poly[1]
            # Width and height of the text line
            width = abs(x2 - x1)
            height = abs(poly[2][1] - y1)
            # 🔑 Filters
            if width < 50:  # very short text is unreliable
                continue
            if width < height * 0.5:  # vertical text
                continue
            # ⚠️ Account for the image coordinate system (y axis points down)
            dx = x2 - x1
            dy = y2 - y1
            if abs(dx) > 10:
                # 🔧 Use -arctan2 to flip the coordinate-system direction:
                # a line sloping down to the right (dy > 0) should yield a negative angle
                angle_rad = -np.arctan2(dy, dx)
                # Keep only small skews (-15° to +15°)
                if abs(angle_rad) < np.radians(15):
                    line_angles.append({
                        'angle': angle_rad,
                        'weight': width,  # longer text lines get more weight
                        'y_center': (y1 + poly[2][1]) / 2
                    })

        if len(line_angles) < 5:
            print("   ⚠️ Not enough valid samples, skipping skew correction")
            return 0.0

        # 🎯 Step 2: sort by y coordinate and sample only the middle region
        line_angles.sort(key=lambda x: x['y_center'])
        start_idx = int(len(line_angles) * (1 - sample_ratio) / 2)
        end_idx = int(len(line_angles) * (1 + sample_ratio) / 2)
        sampled_angles = line_angles[start_idx:end_idx]

        # 🎯 Step 3: median angle as an initial estimate
        raw_angles = [item['angle'] for item in sampled_angles]
        median_angle = np.median(raw_angles)

        # 🎯 Step 4: drop outliers that deviate too far from the median
        filtered_angles = []
        for item in sampled_angles:
            if abs(item['angle'] - median_angle) < outlier_threshold:
                filtered_angles.append(item)
        if len(filtered_angles) < 3:
            print("   ⚠️ Not enough samples left after filtering")
            return np.degrees(median_angle)

        # 🎯 Step 5: weighted average (longer lines weigh more)
        total_weight = sum(item['weight'] for item in filtered_angles)
        weighted_angle = sum(
            item['angle'] * item['weight'] for item in filtered_angles
        ) / total_weight
        angle_deg = np.degrees(weighted_angle)
        print("   📐 Skew detection:")
        print(f"      • raw samples: {len(line_angles)}")
        print(f"      • middle sample: {len(sampled_angles)}")
        print(f"      • after filtering: {len(filtered_angles)}")
        print(f"      • median angle: {np.degrees(median_angle):.3f}°")
        print(f"      • weighted average: {angle_deg:.3f}°")
        return angle_deg

    def _rotate_point(self, point: Tuple[float, float],
                      angle_deg: float,
                      center: Tuple[float, float] = (0, 0)) -> Tuple[float, float]:
        """
        Rotate a point.

        Args:
            point: original point (x, y)
            angle_deg: rotation angle in degrees (positive = counter-clockwise)
            center: rotation centre

        Returns:
            Rotated point (x', y')
        """
        x, y = point
        cx, cy = center
        # Convert to radians
        angle_rad = np.radians(angle_deg)
        # Translate to the origin
        x -= cx
        y -= cy
        # Rotate
        x_new = x * np.cos(angle_rad) - y * np.sin(angle_rad)
        y_new = x * np.sin(angle_rad) + y * np.cos(angle_rad)
        # Translate back
        x_new += cx
        y_new += cy
        return (x_new, y_new)

    def _correct_bbox_skew(self, paddle_boxes: List[Dict],
                           rotation_angle: float,
                           image_size: Tuple[int, int]) -> List[Dict]:
        """
        Correct the skew of the text boxes.

        Args:
            paddle_boxes: Paddle OCR results
            rotation_angle: skew angle
            image_size: image size (width, height)

        Returns:
            Corrected text box list
        """
        if abs(rotation_angle) < 0.1:  # skew is negligible, no correction needed
            return paddle_boxes

        width, height = image_size
        center = (width / 2, height / 2)
        corrected_boxes = []
        for box in paddle_boxes:
            poly = box.get('poly', [])
            if len(poly) < 4:
                corrected_boxes.append(box)
                continue
            # 🎯 Rotate the four corner points of the polygon
            rotated_poly = [
                self._rotate_point(point, -rotation_angle, center)
                for point in poly
            ]
            # Recompute the bbox
            x_coords = [p[0] for p in rotated_poly]
            y_coords = [p[1] for p in rotated_poly]
            corrected_bbox = [
                min(x_coords),
                min(y_coords),
                max(x_coords),
                max(y_coords)
            ]
            # Build the corrected box
            corrected_box = box.copy()
            corrected_box['bbox'] = corrected_bbox
            corrected_box['poly'] = rotated_poly
            corrected_box['original_bbox'] = box['bbox']  # keep the original coordinates
            corrected_boxes.append(corrected_box)
        return corrected_boxes

    def _match_html_rows_to_paddle_groups(self, html_rows: List,
                                          grouped_boxes: List[Dict]) -> Dict[int, List[int]]:
        """
        Match HTML rows to paddle groups (revised: strictly ordered matching).

        Strategy:
        1. Equal counts: 1:1 mapping.
        2. Unequal counts: match by content while preserving y-coordinate order.
        """
        if not html_rows or not grouped_boxes:
            return {}
        mapping = {}

        # 🎯 Strategy 1: equal counts, simple 1:1 mapping
        if len(html_rows) == len(grouped_boxes):
            for i in range(len(html_rows)):
                mapping[i] = [i]
            return mapping

        # 🎯 Strategy 2: content-based matching (revised: strictly monotonic)
        from fuzzywuzzy import fuzz
        used_groups = set()
        next_group_to_check = 0  # 🔑 key improvement: maintain a global group index
        for row_idx, row in enumerate(html_rows):
            row_texts = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
            row_texts = [t for t in row_texts if t]
            if not row_texts:
                mapping[row_idx] = []
                continue
            row_text_normalized = [self.text_matcher.normalize_text(t) for t in row_texts]
            row_combined_text = ''.join(row_text_normalized)
            best_groups = []
            best_score = 0

            # 🔑 Search windows of consecutive groups starting at next_group_to_check
            max_window = 5
            for group_count in range(1, max_window + 1):
                # 🔑 Start at the current position, not at the first unused group
                start_group = next_group_to_check
                end_group = start_group + group_count
                if end_group > len(grouped_boxes):
                    break
                combined_group_indices = list(range(start_group, end_group))
                # 🔑 Skip windows that touch already-used groups (without moving start_group)
                if any(idx in used_groups for idx in combined_group_indices):
                    continue

                # Collect all texts in the window
                combined_texts = []
                for g_idx in combined_group_indices:
                    group_boxes = grouped_boxes[g_idx].get('boxes', [])
                    for box in group_boxes:
                        if box.get('used'):
                            continue
                        normalized_text = self.text_matcher.normalize_text(box.get('text', ''))
                        if normalized_text:
                            combined_texts.append(normalized_text)
                if not combined_texts:
                    continue
                paddle_combined_text = ''.join(combined_texts)

                # Per-cell matching
                match_count = 0
                for rt in row_text_normalized:
                    if len(rt) < 2:
                        continue
                    # Exact match
                    if any(rt == ct for ct in combined_texts):
                        match_count += 1
                        continue
                    # Substring match
                    if any(rt in ct or ct in rt for ct in combined_texts):
                        match_count += 1
                        continue
                    # Search in the concatenated group text
                    if rt in paddle_combined_text:
                        match_count += 1
                        continue
                    # Fuzzy match
                    for ct in combined_texts:
                        similarity = fuzz.partial_ratio(rt, ct)
                        if similarity >= 75:
                            match_count += 1
                            break

                # Whole-row similarity
                row_similarity = fuzz.partial_ratio(row_combined_text, paddle_combined_text)
                coverage = match_count / len(row_texts) if row_texts else 0
                combined_coverage = row_similarity / 100.0
                final_score = max(coverage, combined_coverage)
                if final_score > best_score:
                    best_score = final_score
                    best_groups = combined_group_indices
                    print(f"   Row {row_idx} candidate: groups {combined_group_indices}, "
                          f"cell matches: {match_count}/{len(row_texts)}, "
                          f"row similarity: {row_similarity}%, "
                          f"final score: {final_score:.2f}")
                if final_score >= 0.9:
                    break

            # 🔑 Lowered acceptance threshold
            if best_groups and best_score >= 0.3:
                mapping[row_idx] = best_groups
                used_groups.update(best_groups)
                # 🔑 Advance the pointer past the matched groups
                next_group_to_check = max(best_groups) + 1
                print(f"   ✓ Row {row_idx}: matched groups {best_groups} (score: {best_score:.2f}), "
                      f"next search starts at group {next_group_to_check}")
            else:
                mapping[row_idx] = []
                # 🔑 Even without a match, advance the pointer (assume one group is skipped)
                if next_group_to_check < len(grouped_boxes):
                    next_group_to_check += 1
                print(f"   ✗ Row {row_idx}: no match (best score: {best_score:.2f}), "
                      f"advancing to group {next_group_to_check}")

        # 🎯 Strategy 3: second pass - handle groups that were never matched
        unused_groups = [i for i in range(len(grouped_boxes)) if i not in used_groups]
        if unused_groups:
            print(f"   ℹ️ Found {len(unused_groups)} unmatched paddle groups: {unused_groups}")
            # 🔑 Merge each unused group into the nearest matched neighbour
            for unused_idx in unused_groups:
                # 🎯 Boundary distances to the neighbouring matched groups
                unused_group = grouped_boxes[unused_idx]
                unused_y_min = min(b['bbox'][1] for b in unused_group['boxes'])
                unused_y_max = max(b['bbox'][3] for b in unused_group['boxes'])
                # 🔑 Look for the nearest used group above and below
                above_idx = None
                below_idx = None
                above_distance = float('inf')
                below_distance = float('inf')
                # Search upwards
                for i in range(unused_idx - 1, -1, -1):
                    if i in used_groups:
                        above_idx = i
                        # 🎯 Boundary distance: unused group's min y vs. the lowest box above
                        above_group = grouped_boxes[i]
                        max_y_box = max(
                            above_group['boxes'],
                            key=lambda b: b['bbox'][3]
                        )
                        above_y_center = (max_y_box['bbox'][1] + max_y_box['bbox'][3]) / 2
                        above_distance = abs(unused_y_min - above_y_center)
                        print(f"      • group {unused_idx} to group {i} above: {above_distance:.1f}px")
                        break
                # Search downwards
                for i in range(unused_idx + 1, len(grouped_boxes)):
                    if i in used_groups:
                        below_idx = i
                        # 🎯 Boundary distance: the highest box below vs. unused group's max y
                        below_group = grouped_boxes[i]
                        min_y_box = min(
                            below_group['boxes'],
                            key=lambda b: b['bbox'][1]
                        )
                        below_y_center = (min_y_box['bbox'][1] + min_y_box['bbox'][3]) / 2
                        below_distance = abs(below_y_center - unused_y_max)
                        print(f"      • group {unused_idx} to group {i} below: {below_distance:.1f}px")
                        break
                # 🎯 Pick the closer side
                if above_idx is not None and below_idx is not None:
                    if above_distance < below_distance:
                        closest_used_idx = above_idx
                        merge_direction = "above"
                    else:
                        closest_used_idx = below_idx
                        merge_direction = "below"
                    print(f"      ✓ group {unused_idx} merges into group {closest_used_idx} ({merge_direction})")
                elif above_idx is not None:
                    closest_used_idx = above_idx
                    merge_direction = "above"
                elif below_idx is not None:
                    closest_used_idx = below_idx
                    merge_direction = "below"
                else:
                    print(f"      ⚠️ group {unused_idx} has no used neighbour, skipping")
                    continue
                # 🔑 Find the HTML row mapped to the chosen group
                target_html_row = None
                for html_row_idx, group_indices in mapping.items():
                    if closest_used_idx in group_indices:
                        target_html_row = html_row_idx
                        break
                if target_html_row is not None:
                    # 🎯 Attach the unused group to that HTML row
                    if unused_idx not in mapping[target_html_row]:
                        mapping[target_html_row].append(unused_idx)
                        print(f"      • group {unused_idx} merged into HTML row {target_html_row} ({merge_direction} row)")
                    used_groups.add(unused_idx)

        # 🔑 Strategy 4: third pass - sort each row's group indices by y coordinate
        for row_idx in mapping:
            if mapping[row_idx]:
                mapping[row_idx].sort(key=lambda idx: grouped_boxes[idx]['y_center'])
        return mapping

    def _preprocess_close_groups(self, grouped_boxes: List[Dict],
                                 y_gap_threshold: int = 10) -> List[List[int]]:
        """
        🆕 Preprocessing: pre-merge groups whose y gap is very small.

        Args:
            grouped_boxes: original groups
            y_gap_threshold: y-gap threshold (smaller gaps are treated as the same row)

        Returns:
            Preprocessed list of group indices, e.g. [[0, 1], [2], [3, 4, 5], ...]
        """
        if not grouped_boxes:
            return []
        preprocessed = []
        current_group = [0]
        for i in range(1, len(grouped_boxes)):
            prev_group = grouped_boxes[i - 1]
            curr_group = grouped_boxes[i]
            # Gap between consecutive groups
            prev_y_max = max(b['bbox'][3] for b in prev_group['boxes'])
            curr_y_min = min(b['bbox'][1] for b in curr_group['boxes'])
            gap = abs(curr_y_min - prev_y_max)
            if gap <= y_gap_threshold:
                # Gap is small, merge
                current_group.append(i)
                print(f"   Pre-merging groups {i - 1} and {i} (gap: {gap}px)")
            else:
                # Gap is large, start a new group
                preprocessed.append(current_group)
                current_group = [i]
        # Append the last group
        if current_group:
            preprocessed.append(current_group)
        return preprocessed

    def _match_cell_sequential(self, cell_text: str,
                               boxes: List[Dict],
                               col_boundaries: List[Tuple[int, int]],
                               start_idx: int) -> Optional[Dict]:
        """
        🎯 Sequential cell matching: starting from the given position, merge boxes until they match.

        Strategy:
        1. Find the first unused box.
        2. Try an exact match against a single box.
        3. If that fails, try merging several boxes.

        Args:
            cell_text: HTML cell text
            boxes: candidate boxes (already sorted by x coordinate)
            col_boundaries: column boundary list
            start_idx: start index

        Returns:
            {'bbox': [x1, y1, x2, y2], 'text': str, 'score': float,
             'paddle_indices': [idx1, idx2], 'used_boxes': [box1, box2],
             'last_used_index': int}
        """
        from fuzzywuzzy import fuzz
        cell_text_normalized = self.text_matcher.normalize_text(cell_text)
        if len(cell_text_normalized) < 2:
            return None

        # 🔑 Find the first unused box
        first_unused_idx = start_idx
        while first_unused_idx < len(boxes) and boxes[first_unused_idx].get('used'):
            first_unused_idx += 1
        if first_unused_idx >= len(boxes):
            return None

        # 🔑 Strategy 1: exact match against a single box
        for box in boxes[first_unused_idx:]:
            if box.get('used'):
                continue
            box_text = self.text_matcher.normalize_text(box['text'])
            if cell_text_normalized == box_text:
                return self._build_match_result([box], box['text'], 100.0, boxes.index(box))

        # 🔑 Strategy 2: merge several boxes and match
        unused_boxes = [b for b in boxes if not b.get('used')]
        # Merge the boxes that share a column
        merged_bboxes = []
        for col_idx in range(len(col_boundaries)):
            combo_boxes = self._get_boxes_in_column(unused_boxes, col_boundaries, col_idx)
            if len(combo_boxes) > 0:
                sorted_combo = sorted(combo_boxes, key=lambda b: (b['bbox'][1], b['bbox'][0]))
                merged_text = ''.join([b['text'] for b in sorted_combo])
                merged_bboxes.append({
                    'text': merged_text,
                    'sorted_combo': sorted_combo
                })
        for box in merged_bboxes:
            # 1. Exact match
            merged_text_normalized = self.text_matcher.normalize_text(box['text'])
            if cell_text_normalized == merged_text_normalized:
                last_sort_idx = boxes.index(box['sorted_combo'][-1])
                return self._build_match_result(box['sorted_combo'], box['text'], 100.0, last_sort_idx)
            # 2. Substring match
            is_substring = (cell_text_normalized in merged_text_normalized or
                            merged_text_normalized in cell_text_normalized)
            # 3. Fuzzy match
            similarity = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
            # 🎯 Substring matches get a bonus
            if is_substring:
                similarity = min(100, similarity + 10)
            if similarity >= self.text_matcher.similarity_threshold:
                print(f"      ✓ Match: '{cell_text[:15]}' vs '{box['text'][:15]}' (similarity: {similarity})")
                return self._build_match_result(box['sorted_combo'], box['text'], similarity, start_idx)
        print(f"      ✗ No match: '{cell_text[:15]}'")
        return None

    def _build_match_result(self, boxes: List[Dict], text: str,
                            score: float, last_index: int) -> Dict:
        """Build a match result (using original coordinates)."""
        # 🔑 Key fix: use original_bbox when present (set by the skew correction step)
        def get_original_bbox(box: Dict) -> List[int]:
            return box.get('original_bbox', box['bbox'])

        original_bboxes = [get_original_bbox(b) for b in boxes]
        merged_bbox = [
            min(b[0] for b in original_bboxes),
            min(b[1] for b in original_bboxes),
            max(b[2] for b in original_bboxes),
            max(b[3] for b in original_bboxes)
        ]
        return {
            'bbox': merged_bbox,  # ✅ original (uncorrected) coordinates
            'text': text,
            'score': score,
            'paddle_indices': [b['paddle_bbox_index'] for b in boxes],
            'used_boxes': boxes,
            'last_used_index': last_index
        }
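

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the matcher itself).
# It assumes a TextMatcher-like object exposing `normalize_text()` and
# `similarity_threshold`, which is all this module relies on; the HTML table
# and OCR boxes below are made-up sample data in the shape described at the
# top of the file.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class _SimpleTextMatcher:
        """Stand-in for TextMatcher with the two members used in this module."""
        similarity_threshold = 80

        def normalize_text(self, text: str) -> str:
            # Strip whitespace and lowercase; the real TextMatcher may do more.
            return ''.join(text.split()).lower()

    sample_html = (
        "<table>"
        "<tr><th>Item</th><th>Qty</th></tr>"
        "<tr><td>Apple</td><td>3</td></tr>"
        "</table>"
    )
    sample_boxes = [
        {'text': 'Item',  'bbox': [10, 10, 60, 30],  'poly': [(10, 10), (60, 10), (60, 30), (10, 30)],  'paddle_bbox_index': 0},
        {'text': 'Qty',   'bbox': [80, 10, 120, 30], 'poly': [(80, 10), (120, 10), (120, 30), (80, 30)], 'paddle_bbox_index': 1},
        {'text': 'Apple', 'bbox': [10, 40, 60, 60],  'poly': [(10, 40), (60, 40), (60, 60), (10, 60)],  'paddle_bbox_index': 2},
        {'text': '3',     'bbox': [80, 40, 120, 60], 'poly': [(80, 40), (120, 40), (120, 60), (80, 60)], 'paddle_bbox_index': 3},
    ]

    matcher = TableCellMatcher(_SimpleTextMatcher())
    enhanced_html, matched_cells, pointer = matcher.enhance_table_html_with_bbox(
        sample_html, sample_boxes, start_pointer=0, table_bbox=[0, 0, 130, 70]
    )
    print(enhanced_html)
    print(f"matched {len(matched_cells)} cells, new pointer = {pointer}")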