bbox_utils.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. """
  2. bbox 提取和坐标转换工具模块
  3. 提供通用的 bbox 处理功能:
  4. - 从 PaddleOCR 结果中提取文字框信息
  5. - 坐标旋转和反向旋转(与图像旋转保持一致)
  6. - 倾斜角度计算和校正
  7. - bbox 和多边形之间的转换
  8. - 表格单元格 bbox 提取
  9. 此模块从 merger 中提取,供多个模块共享使用。
  10. """
  11. from typing import List, Dict, Tuple
  12. import numpy as np
  13. from pathlib import Path
  14. class BBoxExtractor:
  15. """bbox 提取和坐标转换工具类"""
  16. @staticmethod
  17. def extract_paddle_text_boxes(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
  18. """
  19. 提取 PaddleOCR 的文字框信息
  20. Args:
  21. paddle_data: PaddleOCR 输出的数据
  22. Returns:
  23. 文字框列表(保持旋转后的angle角度)和旋转角度
  24. """
  25. text_boxes = []
  26. rotation_angle = 0.0
  27. orig_image_size = (0,0)
  28. if 'overall_ocr_res' not in paddle_data:
  29. return text_boxes, rotation_angle, orig_image_size
  30. ocr_res = paddle_data['overall_ocr_res']
  31. rec_texts = ocr_res.get('rec_texts', [])
  32. rec_polys = ocr_res.get('rec_polys', [])
  33. rec_scores = ocr_res.get('rec_scores', [])
  34. # 🎯 获取旋转角度
  35. rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
  36. if rotation_angle != 0:
  37. orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
  38. print(f"🔄 检测到旋转角度: {rotation_angle}°")
  39. print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
  40. for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
  41. if text and text.strip():
  42. # 计算 bbox (x_min, y_min, x_max, y_max)
  43. bbox = BBoxExtractor._poly_to_bbox(poly)
  44. text_boxes.append({
  45. 'text': text,
  46. 'bbox': bbox,
  47. 'poly': poly,
  48. 'score': score,
  49. 'paddle_bbox_index': i,
  50. 'used': False
  51. })
  52. return text_boxes, rotation_angle, orig_image_size
  53. @staticmethod
  54. def extract_paddle_text_boxes_inverse_rotate(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
  55. """
  56. 提取 PaddleOCR 的文字框信息
  57. Args:
  58. paddle_data: PaddleOCR 输出的数据
  59. Returns:
  60. 文字框列表(坐标已转换为 angle=0 时的坐标)
  61. """
  62. text_boxes = []
  63. rotation_angle = 0.0
  64. orig_image_size = (0,0)
  65. if 'overall_ocr_res' not in paddle_data:
  66. return text_boxes, rotation_angle, orig_image_size
  67. ocr_res = paddle_data['overall_ocr_res']
  68. rec_texts = ocr_res.get('rec_texts', [])
  69. rec_polys = ocr_res.get('rec_polys', [])
  70. rec_scores = ocr_res.get('rec_scores', [])
  71. # 🎯 获取旋转角度
  72. rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
  73. if rotation_angle != 0:
  74. orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
  75. print(f"🔄 检测到旋转角度: {rotation_angle}°")
  76. print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
  77. for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
  78. if text and text.strip():
  79. # 🎯 如果有旋转角度,转换坐标
  80. if rotation_angle != 0 and orig_image_size:
  81. poly = BBoxExtractor.inverse_rotate_coordinates(
  82. poly, rotation_angle, orig_image_size
  83. )
  84. # 计算 bbox (x_min, y_min, x_max, y_max)
  85. bbox = BBoxExtractor._poly_to_bbox(poly)
  86. text_boxes.append({
  87. 'text': text,
  88. 'bbox': bbox,
  89. 'poly': poly,
  90. 'score': score,
  91. 'paddle_bbox_index': i,
  92. 'used': False
  93. })
  94. return text_boxes, rotation_angle, orig_image_size
  95. @staticmethod
  96. def _get_rotation_angle(paddle_data: Dict) -> float:
  97. """获取旋转角度"""
  98. if 'doc_preprocessor_res' not in paddle_data:
  99. return 0.0
  100. doc_res = paddle_data['doc_preprocessor_res']
  101. if isinstance(doc_res, dict) and 'angle' in doc_res:
  102. return float(doc_res['angle'])
  103. return 0.0
  104. @staticmethod
  105. def _get_original_image_size(paddle_data: Dict) -> tuple:
  106. """
  107. 获取原始图像尺寸(从图片文件读取)
  108. Args:
  109. paddle_data: PaddleOCR 数据
  110. Returns:
  111. (width, height) 元组
  112. """
  113. from PIL import Image
  114. # 🎯 从 input_path 读取图像
  115. input_path = paddle_data.get('input_path')
  116. if input_path and Path(input_path).exists():
  117. try:
  118. with Image.open(input_path) as img:
  119. # 返回原始图像尺寸
  120. return img.size # (width, height)
  121. except Exception as e:
  122. print(f"⚠️ 无法读取图像文件 {input_path}: {e}")
  123. # 🎯 降级方案:从 layout_det_res 推断
  124. if 'layout_det_res' in paddle_data:
  125. layout_res = paddle_data['layout_det_res']
  126. if 'boxes' in layout_res and layout_res['boxes']:
  127. max_x = 0
  128. max_y = 0
  129. for box in layout_res['boxes']:
  130. coord = box.get('coordinate', [])
  131. if len(coord) >= 4:
  132. max_x = max(max_x, coord[2])
  133. max_y = max(max_y, coord[3])
  134. if max_x > 0 and max_y > 0:
  135. return (int(max_x) + 50, int(max_y) + 50)
  136. # 🎯 最后降级:从 overall_ocr_res 推断
  137. if 'overall_ocr_res' in paddle_data:
  138. ocr_res = paddle_data['overall_ocr_res']
  139. rec_polys = ocr_res.get('rec_polys', [])
  140. if rec_polys:
  141. max_x = 0
  142. max_y = 0
  143. for poly in rec_polys:
  144. for point in poly:
  145. max_x = max(max_x, point[0])
  146. max_y = max(max_y, point[1])
  147. if max_x > 0 and max_y > 0:
  148. return (int(max_x) + 50, int(max_y) + 50)
  149. # 🎯 默认 A4 尺寸
  150. print("⚠️ 无法确定原始图像尺寸,使用默认值")
  151. return (2480, 3508)
  152. @staticmethod
  153. def rotate_box_coordinates(bbox: List[float],
  154. angle: float,
  155. orig_image_size: tuple) -> List[float]:
  156. """
  157. 旋转 bbox 坐标(与图像旋转保持一致)
  158. 参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
  159. 旋转逻辑:
  160. - 0°: 不旋转
  161. - 90°: 逆时针旋转 90°
  162. - 180°: 旋转 180°
  163. - 270°: 顺时针旋转 90°(或逆时针 270°)
  164. Args:
  165. bbox: 原图像上的边界框 [x_min, y_min, x_max, y_max]
  166. angle: 旋转角度(0, 90, 180, 270)
  167. orig_image_size: 原始图像尺寸 (width, height)
  168. """
  169. poly = BBoxExtractor._bbox_to_poly(bbox)
  170. rotated_poly = BBoxExtractor.rotate_coordinates(poly, angle, orig_image_size)
  171. rotated_bbox = BBoxExtractor._poly_to_bbox(rotated_poly)
  172. return rotated_bbox
  173. @staticmethod
  174. def inverse_rotate_box_coordinates(bbox: List[float],
  175. angle: float,
  176. orig_image_size: tuple) -> List[float]:
  177. """
  178. 反向旋转 bbox 坐标
  179. 参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
  180. PaddleOCR 在旋转后的图像上识别,坐标是旋转后的
  181. 我们需要将坐标转换回原始图像(未旋转)
  182. Args:
  183. bbox: 旋转后图像上的边界框 [x_min, y_min, x_max, y_max]
  184. angle: 旋转角度(度数,PaddleX 使用的角度)
  185. orig_image_size: 原始图像尺寸 (width, height)
  186. """
  187. poly = BBoxExtractor._bbox_to_poly(bbox)
  188. inverse_poly = BBoxExtractor.inverse_rotate_coordinates(poly, angle, orig_image_size)
  189. inverse_bbox = BBoxExtractor._poly_to_bbox(inverse_poly)
  190. return inverse_bbox
  191. @staticmethod
  192. def inverse_rotate_coordinates(poly: List[List[float]],
  193. angle: float,
  194. orig_image_size: tuple) -> List[List[float]]:
  195. """
  196. 反向旋转坐标
  197. 参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
  198. PaddleOCR 在旋转后的图像上识别,坐标是旋转后的
  199. 我们需要将坐标转换回原始图像(未旋转)
  200. Args:
  201. poly: 旋转后图像上的多边形坐标 [[x',y'], ...]
  202. angle: 旋转角度(度数,PaddleX 使用的角度)
  203. orig_image_size: 原始图像尺寸 (width, height)
  204. Returns:
  205. 原始图像上的多边形坐标 [[x,y], ...]
  206. """
  207. orig_width, orig_height = orig_image_size
  208. # 🎯 根据旋转角度计算旋转后的图像尺寸
  209. if angle == 90:
  210. rotated_width, rotated_height = orig_height, orig_width
  211. elif angle == 270:
  212. rotated_width, rotated_height = orig_height, orig_width
  213. else:
  214. rotated_width, rotated_height = orig_width, orig_height
  215. inverse_poly = []
  216. for point in poly:
  217. x_rot, y_rot = point[0], point[1] # 旋转后的坐标
  218. # 🎯 反向旋转(参考 rotate_image_and_coordinates 的逆操作)
  219. if angle == 90:
  220. # 正向: rotated = image.rotate(90, expand=True)
  221. # x_rot = y_orig
  222. # y_rot = rotated_width - x_orig = orig_height - x_orig
  223. # 反向: x_orig = rotated_width - y_rot = orig_height - y_rot
  224. # y_orig = x_rot
  225. x_orig = rotated_width - y_rot
  226. y_orig = x_rot
  227. elif angle == 270:
  228. # 正向: rotated = image.rotate(-90, expand=True)
  229. # x_rot = rotated_width - y_orig = orig_height - y_orig
  230. # y_rot = x_orig
  231. # 反向: y_orig = rotated_width - x_rot = orig_height - x_rot
  232. # x_orig = y_rot
  233. x_orig = y_rot
  234. y_orig = rotated_width - x_rot
  235. elif angle == 180:
  236. # 正向: rotated = image.rotate(180)
  237. # x_rot = orig_width - x_orig
  238. # y_rot = orig_height - y_orig
  239. # 反向: x_orig = orig_width - x_rot
  240. # y_orig = orig_height - y_rot
  241. x_orig = orig_width - x_rot
  242. y_orig = orig_height - y_rot
  243. else:
  244. # 其他角度或0度,不转换
  245. x_orig = x_rot
  246. y_orig = y_rot
  247. inverse_poly.append([x_orig, y_orig])
  248. return inverse_poly
  249. @staticmethod
  250. def rotate_coordinates(poly: List[List[float]],
  251. angle: float,
  252. orig_image_size: tuple) -> List[List[float]]:
  253. """
  254. 旋转多边形坐标(与图像旋转保持一致)
  255. 参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
  256. 旋转逻辑:
  257. - 0°: 不旋转
  258. - 90°: 逆时针旋转 90°
  259. - 180°: 旋转 180°
  260. - 270°: 顺时针旋转 90°(或逆时针 270°)
  261. Args:
  262. poly: 原图像上的多边形坐标 [[x', y'], ...]
  263. angle: 旋转角度(0, 90, 180, 270)
  264. orig_image_size: 原始图像尺寸 (width, height)
  265. Returns:
  266. 旋转后的多边形坐标 [[x, y], ...]
  267. Example:
  268. >>> poly = [[100, 200], [150, 200], [150, 250], [100, 250]]
  269. >>> rotated = rotate_coordinates(poly, 90, (1000, 800))
  270. >>> print(rotated)
  271. [[200, 900], [200, 850], [250, 850], [250, 900]]
  272. """
  273. if not poly or angle == 0:
  274. return poly
  275. orig_width, orig_height = orig_image_size
  276. rotated_poly = []
  277. for point in poly:
  278. x, y = point[0], point[1]
  279. if angle == 90:
  280. # 逆时针旋转 90°
  281. # 新坐标系: 宽度=原高度, 高度=原宽度
  282. # x_new = y_old
  283. # y_new = 原宽度 - x_old
  284. new_x = y
  285. new_y = orig_width - x
  286. elif angle == 180:
  287. # 旋转 180°
  288. # 新坐标系: 宽度=原宽度, 高度=原高度
  289. # x_new = 原宽度 - x_old
  290. # y_new = 原高度 - y_old
  291. new_x = orig_width - x
  292. new_y = orig_height - y
  293. elif angle == 270:
  294. # 顺时针旋转 90°(或逆时针 270°)
  295. # 新坐标系: 宽度=原高度, 高度=原宽度
  296. # x_new = 原高度 - y_old
  297. # y_new = x_old
  298. new_x = orig_height - y
  299. new_y = x
  300. else:
  301. # 不支持的角度,保持原坐标
  302. new_x, new_y = x, y
  303. rotated_poly.append([new_x, new_y])
  304. return rotated_poly
  305. @staticmethod
  306. def _bbox_to_poly(bbox: List[float]) -> List[List[float]]:
  307. """
  308. 将 bbox 转换为多边形(4个角点,逆时针顺序)
  309. Args:
  310. bbox: 边界框 [x_min, y_min, x_max, y_max]
  311. Returns:
  312. 多边形坐标 [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
  313. 顺序:左上 -> 右上 -> 右下 -> 左下(逆时针)
  314. Example:
  315. >>> bbox = [100, 200, 150, 250]
  316. >>> poly = BBoxExtractor._bbox_to_poly(bbox)
  317. >>> print(poly)
  318. [[100, 200], [150, 200], [150, 250], [100, 250]]
  319. """
  320. if not bbox or len(bbox) < 4:
  321. return []
  322. x_min, y_min, x_max, y_max = bbox[:4]
  323. # 🎯 4个角点(逆时针顺序)
  324. poly = [
  325. [x_min, y_min], # 左上角
  326. [x_max, y_min], # 右上角
  327. [x_max, y_max], # 右下角
  328. [x_min, y_max] # 左下角
  329. ]
  330. return poly
  331. @staticmethod
  332. def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
  333. """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
  334. xs = [p[0] for p in poly]
  335. ys = [p[1] for p in poly]
  336. return [min(xs), min(ys), max(xs), max(ys)]
  337. @staticmethod
  338. def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
  339. """
  340. 提取所有表格单元格及其 bbox 信息
  341. Args:
  342. merged_data: 合并后的数据
  343. Returns:
  344. 单元格列表
  345. """
  346. import json
  347. from bs4 import BeautifulSoup
  348. cells = []
  349. for item in merged_data:
  350. if item['type'] != 'table':
  351. continue
  352. html = item.get('table_body_with_bbox', item.get('table_body', ''))
  353. soup = BeautifulSoup(html, 'html.parser')
  354. for row_idx, row in enumerate(soup.find_all('tr')):
  355. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  356. cell_text = cell.get_text(strip=True)
  357. bbox_str = cell.get('data-bbox', '')
  358. if bbox_str:
  359. try:
  360. bbox = json.loads(bbox_str)
  361. cells.append({
  362. 'text': cell_text,
  363. 'bbox': bbox,
  364. 'row': row_idx,
  365. 'col': col_idx,
  366. 'score': float(cell.get('data-score', 0)),
  367. 'paddle_index': int(cell.get('data-paddle-index', -1))
  368. })
  369. except (json.JSONDecodeError, ValueError):
  370. pass
  371. return cells
  372. @staticmethod
  373. def calculate_skew_angle(paddle_boxes: List[Dict],
  374. sample_ratio: float = 0.5,
  375. outlier_threshold: float = 0.3) -> float:
  376. """
  377. 计算文档倾斜角度(基于文本行分析)
  378. Args:
  379. paddle_boxes: Paddle OCR 结果(包含 poly)
  380. sample_ratio: 采样比例(使用中间区域)
  381. outlier_threshold: 异常值阈值(弧度)
  382. Returns:
  383. 倾斜角度(度数,正值=逆时针,负值=顺时针)
  384. """
  385. if not paddle_boxes:
  386. return 0.0
  387. # 收集文本行的倾斜角度
  388. line_angles = []
  389. for box in paddle_boxes:
  390. poly = box.get('poly', [])
  391. if len(poly) < 4:
  392. continue
  393. x1, y1 = poly[0]
  394. x2, y2 = poly[1]
  395. width = abs(x2 - x1)
  396. height = abs(poly[2][1] - y1)
  397. # 过滤条件
  398. if width < 50 or width < height * 0.5:
  399. continue
  400. dx = x2 - x1
  401. dy = y2 - y1
  402. if abs(dx) > 10:
  403. angle_rad = -np.arctan2(dy, dx)
  404. if abs(angle_rad) < np.radians(15):
  405. line_angles.append({
  406. 'angle': angle_rad,
  407. 'weight': width,
  408. 'y_center': (y1 + poly[2][1]) / 2
  409. })
  410. if len(line_angles) < 5:
  411. return 0.0
  412. # 中间区域采样
  413. line_angles.sort(key=lambda x: x['y_center'])
  414. start_idx = int(len(line_angles) * (1 - sample_ratio) / 2)
  415. end_idx = int(len(line_angles) * (1 + sample_ratio) / 2)
  416. sampled_angles = line_angles[start_idx:end_idx]
  417. # 计算中位数
  418. raw_angles = [item['angle'] for item in sampled_angles]
  419. median_angle = np.median(raw_angles)
  420. # 过滤异常值
  421. filtered_angles = [
  422. item for item in sampled_angles
  423. if abs(item['angle'] - median_angle) < outlier_threshold
  424. ]
  425. if len(filtered_angles) < 3:
  426. return np.degrees(median_angle)
  427. # 加权平均
  428. total_weight = sum(item['weight'] for item in filtered_angles)
  429. weighted_angle = sum(
  430. item['angle'] * item['weight'] for item in filtered_angles
  431. ) / total_weight
  432. return np.degrees(weighted_angle)
  433. @staticmethod
  434. def rotate_point(point: Tuple[float, float],
  435. angle_deg: float,
  436. center: Tuple[float, float] = (0, 0)) -> Tuple[float, float]:
  437. """
  438. 旋转点坐标 (图像坐标系:Y轴向下)
  439. Args:
  440. point: 原始点 (x, y)
  441. angle_deg: 旋转角度(度数,正值=逆时针)
  442. center: 旋转中心
  443. Returns:
  444. 旋转后的点 (x', y')
  445. """
  446. x, y = point
  447. cx, cy = center
  448. angle_rad = np.radians(angle_deg)
  449. x -= cx
  450. y -= cy
  451. # 图像坐标系(Y轴向下)下的逆时针旋转公式
  452. # x' = x cosθ + y sinθ
  453. # y' = -x sinθ + y cosθ
  454. x_new = x * np.cos(angle_rad) + y * np.sin(angle_rad)
  455. y_new = -x * np.sin(angle_rad) + y * np.cos(angle_rad)
  456. x_new += cx
  457. y_new += cy
  458. return (x_new, y_new)
  459. @staticmethod
  460. def correct_boxes_skew(paddle_boxes: List[Dict],
  461. correction_angle: float,
  462. image_size: Tuple[int, int]) -> List[Dict]:
  463. """
  464. 校正文本框的倾斜
  465. Args:
  466. paddle_boxes: Paddle OCR 结果
  467. correction_angle: 校正旋转角度(度数,正值=逆时针,负值=顺时针)
  468. 注意:这里直接传入需要旋转的角度,不再自动取反
  469. image_size: 图像尺寸 (width, height)
  470. Returns:
  471. 校正后的文本框列表
  472. """
  473. if abs(correction_angle) < 0.01:
  474. return paddle_boxes
  475. width, height = image_size
  476. center = (width / 2, height / 2)
  477. corrected_boxes = []
  478. for box in paddle_boxes:
  479. poly = box.get('poly', [])
  480. # 🆕 修复:如果没有 poly,尝试从 bbox 生成
  481. # 这是为了兼容 MinerU 或其他没有 poly 的数据源
  482. if not poly or len(poly) < 4:
  483. if 'bbox' in box and len(box['bbox']) == 4:
  484. poly = BBoxExtractor._bbox_to_poly(box['bbox'])
  485. else:
  486. corrected_boxes.append(box)
  487. continue
  488. # 旋转多边形
  489. rotated_poly = []
  490. for point in poly:
  491. # 确保点是 tuple 或 list,并只有 2 个坐标
  492. p = (point[0], point[1]) if isinstance(point, (list, tuple)) and len(point) >= 2 else (0.0, 0.0)
  493. # 直接使用 correction_angle 进行旋转
  494. rotated_point = BBoxExtractor.rotate_point(p, correction_angle, center)
  495. rotated_poly.append([rotated_point[0], rotated_point[1]]) # 转换回 list 以匹配 _poly_to_bbox 类型
  496. # 重新计算 bbox
  497. corrected_bbox = BBoxExtractor._poly_to_bbox(rotated_poly)
  498. corrected_box = box.copy()
  499. corrected_box['bbox'] = corrected_bbox
  500. corrected_box['poly'] = rotated_poly
  501. corrected_box['original_bbox'] = box['bbox']
  502. corrected_boxes.append(corrected_box)
  503. return corrected_boxes