bbox_extractor.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
"""
Bounding-box extraction module.

Extracts text-box information (text, polygon, bbox, score) from PaddleOCR results.
"""
  5. from typing import List, Dict, Tuple
  6. import numpy as np
  7. from pathlib import Path
  8. class BBoxExtractor:
  9. """bbox 提取器"""
  10. @staticmethod
  11. def extract_paddle_text_boxes(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
  12. """
  13. 提取 PaddleOCR 的文字框信息
  14. Args:
  15. paddle_data: PaddleOCR 输出的数据
  16. Returns:
  17. 文字框列表(保持旋转后的angle角度)和旋转角度
  18. """
  19. text_boxes = []
  20. rotation_angle = 0.0
  21. orig_image_size = (0,0)
  22. if 'overall_ocr_res' not in paddle_data:
  23. return text_boxes, rotation_angle, orig_image_size
  24. ocr_res = paddle_data['overall_ocr_res']
  25. rec_texts = ocr_res.get('rec_texts', [])
  26. rec_polys = ocr_res.get('rec_polys', [])
  27. rec_scores = ocr_res.get('rec_scores', [])
  28. # 🎯 获取旋转角度
  29. rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
  30. if rotation_angle != 0:
  31. orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
  32. print(f"🔄 检测到旋转角度: {rotation_angle}°")
  33. print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
  34. for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
  35. if text and text.strip():
  36. # 计算 bbox (x_min, y_min, x_max, y_max)
  37. bbox = BBoxExtractor._poly_to_bbox(poly)
  38. text_boxes.append({
  39. 'text': text,
  40. 'bbox': bbox,
  41. 'poly': poly,
  42. 'score': score,
  43. 'paddle_bbox_index': i,
  44. 'used': False
  45. })
  46. return text_boxes, rotation_angle, orig_image_size
  47. @staticmethod
  48. def extract_paddle_text_boxes_inverse_rotate(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
  49. """
  50. 提取 PaddleOCR 的文字框信息
  51. Args:
  52. paddle_data: PaddleOCR 输出的数据
  53. Returns:
  54. 文字框列表(坐标已转换为 angle=0 时的坐标)
  55. """
  56. text_boxes = []
  57. rotation_angle = 0.0
  58. orig_image_size = (0,0)
  59. if 'overall_ocr_res' not in paddle_data:
  60. return text_boxes, rotation_angle, orig_image_size
  61. ocr_res = paddle_data['overall_ocr_res']
  62. rec_texts = ocr_res.get('rec_texts', [])
  63. rec_polys = ocr_res.get('rec_polys', [])
  64. rec_scores = ocr_res.get('rec_scores', [])
  65. # 🎯 获取旋转角度
  66. rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
  67. if rotation_angle != 0:
  68. orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
  69. print(f"🔄 检测到旋转角度: {rotation_angle}°")
  70. print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
  71. for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
  72. if text and text.strip():
  73. # 🎯 如果有旋转角度,转换坐标
  74. if rotation_angle != 0 and orig_image_size:
  75. poly = BBoxExtractor._inverse_rotate_coordinates(
  76. poly, rotation_angle, orig_image_size
  77. )
  78. # 计算 bbox (x_min, y_min, x_max, y_max)
  79. bbox = BBoxExtractor._poly_to_bbox(poly)
  80. text_boxes.append({
  81. 'text': text,
  82. 'bbox': bbox,
  83. 'poly': poly,
  84. 'score': score,
  85. 'paddle_bbox_index': i,
  86. 'used': False
  87. })
  88. return text_boxes, rotation_angle, orig_image_size
  89. @staticmethod
  90. def _get_rotation_angle(paddle_data: Dict) -> float:
  91. """获取旋转角度"""
  92. if 'doc_preprocessor_res' not in paddle_data:
  93. return 0.0
  94. doc_res = paddle_data['doc_preprocessor_res']
  95. if isinstance(doc_res, dict) and 'angle' in doc_res:
  96. return float(doc_res['angle'])
  97. return 0.0
  98. @staticmethod
  99. def _get_original_image_size(paddle_data: Dict) -> tuple:
  100. """
  101. 获取原始图像尺寸(从图片文件读取)
  102. Args:
  103. paddle_data: PaddleOCR 数据
  104. Returns:
  105. (width, height) 元组
  106. """
  107. from PIL import Image
  108. # 🎯 从 input_path 读取图像
  109. input_path = paddle_data.get('input_path')
  110. if input_path and Path(input_path).exists():
  111. try:
  112. with Image.open(input_path) as img:
  113. # 返回原始图像尺寸
  114. return img.size # (width, height)
  115. except Exception as e:
  116. print(f"⚠️ 无法读取图像文件 {input_path}: {e}")
  117. # 🎯 降级方案:从 layout_det_res 推断
  118. if 'layout_det_res' in paddle_data:
  119. layout_res = paddle_data['layout_det_res']
  120. if 'boxes' in layout_res and layout_res['boxes']:
  121. max_x = 0
  122. max_y = 0
  123. for box in layout_res['boxes']:
  124. coord = box.get('coordinate', [])
  125. if len(coord) >= 4:
  126. max_x = max(max_x, coord[2])
  127. max_y = max(max_y, coord[3])
  128. if max_x > 0 and max_y > 0:
  129. return (int(max_x) + 50, int(max_y) + 50)
  130. # 🎯 最后降级:从 overall_ocr_res 推断
  131. if 'overall_ocr_res' in paddle_data:
  132. ocr_res = paddle_data['overall_ocr_res']
  133. rec_polys = ocr_res.get('rec_polys', [])
  134. if rec_polys:
  135. max_x = 0
  136. max_y = 0
  137. for poly in rec_polys:
  138. for point in poly:
  139. max_x = max(max_x, point[0])
  140. max_y = max(max_y, point[1])
  141. if max_x > 0 and max_y > 0:
  142. return (int(max_x) + 50, int(max_y) + 50)
  143. # 🎯 默认 A4 尺寸
  144. print("⚠️ 无法确定原始图像尺寸,使用默认值")
  145. return (2480, 3508)
  146. @staticmethod
  147. def rotate_box_coordinates(bbox: List[float],
  148. angle: float,
  149. orig_image_size: tuple) -> List[float]:
  150. """
  151. 旋转 bbox 坐标(与图像旋转保持一致)
  152. 参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
  153. 旋转逻辑:
  154. - 0°: 不旋转
  155. - 90°: 逆时针旋转 90°
  156. - 180°: 旋转 180°
  157. - 270°: 顺时针旋转 90°(或逆时针 270°)
  158. Args:
  159. bbox: 原图像上的边界框 [x_min, y_min, x_max, y_max]
  160. angle: 旋转角度(0, 90, 180, 270)
  161. orig_image_size: 原始图像尺寸 (width, height)
  162. """
  163. poly = BBoxExtractor._bbox_to_poly(bbox)
  164. rotated_poly = BBoxExtractor._rotate_coordinates(poly, angle, orig_image_size)
  165. rotated_bbox = BBoxExtractor._poly_to_bbox(rotated_poly)
  166. return rotated_bbox
  167. @staticmethod
  168. def inverse_rotate_box_coordinates(bbox: List[float],
  169. angle: float,
  170. orig_image_size: tuple) -> List[float]:
  171. """
  172. 反向旋转 bbox 坐标
  173. 参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
  174. PaddleOCR 在旋转后的图像上识别,坐标是旋转后的
  175. 我们需要将坐标转换回原始图像(未旋转)
  176. Args:
  177. bbox: 旋转后图像上的边界框 [x_min, y_min, x_max, y_max]
  178. angle: 旋转角度(度数,PaddleX 使用的角度)
  179. orig_image_size: 原始图像尺寸 (width, height)
  180. """
  181. poly = BBoxExtractor._bbox_to_poly(bbox)
  182. inverse_poly = BBoxExtractor._inverse_rotate_coordinates(poly, angle, orig_image_size)
  183. inverse_bbox = BBoxExtractor._poly_to_bbox(inverse_poly)
  184. return inverse_bbox
  185. @staticmethod
  186. def _inverse_rotate_coordinates(poly: List[List[float]],
  187. angle: float,
  188. orig_image_size: tuple) -> List[List[float]]:
  189. """
  190. 反向旋转坐标
  191. 参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
  192. PaddleOCR 在旋转后的图像上识别,坐标是旋转后的
  193. 我们需要将坐标转换回原始图像(未旋转)
  194. Args:
  195. poly: 旋转后图像上的多边形坐标 [[x',y'], ...]
  196. angle: 旋转角度(度数,PaddleX 使用的角度)
  197. orig_image_size: 原始图像尺寸 (width, height)
  198. Returns:
  199. 原始图像上的多边形坐标 [[x,y], ...]
  200. """
  201. orig_width, orig_height = orig_image_size
  202. # 🎯 根据旋转角度计算旋转后的图像尺寸
  203. if angle == 90:
  204. rotated_width, rotated_height = orig_height, orig_width
  205. elif angle == 270:
  206. rotated_width, rotated_height = orig_height, orig_width
  207. else:
  208. rotated_width, rotated_height = orig_width, orig_height
  209. inverse_poly = []
  210. for point in poly:
  211. x_rot, y_rot = point[0], point[1] # 旋转后的坐标
  212. # 🎯 反向旋转(参考 rotate_image_and_coordinates 的逆操作)
  213. if angle == 90:
  214. # 正向: rotated = image.rotate(90, expand=True)
  215. # x_rot = y_orig
  216. # y_rot = rotated_width - x_orig = orig_height - x_orig
  217. # 反向: x_orig = rotated_width - y_rot = orig_height - y_rot
  218. # y_orig = x_rot
  219. x_orig = rotated_width - y_rot
  220. y_orig = x_rot
  221. elif angle == 270:
  222. # 正向: rotated = image.rotate(-90, expand=True)
  223. # x_rot = rotated_width - y_orig = orig_height - y_orig
  224. # y_rot = x_orig
  225. # 反向: y_orig = rotated_width - x_rot = orig_height - x_rot
  226. # x_orig = y_rot
  227. x_orig = y_rot
  228. y_orig = rotated_width - x_rot
  229. elif angle == 180:
  230. # 正向: rotated = image.rotate(180)
  231. # x_rot = orig_width - x_orig
  232. # y_rot = orig_height - y_orig
  233. # 反向: x_orig = orig_width - x_rot
  234. # y_orig = orig_height - y_rot
  235. x_orig = orig_width - x_rot
  236. y_orig = orig_height - y_rot
  237. else:
  238. # 其他角度或0度,不转换
  239. x_orig = x_rot
  240. y_orig = y_rot
  241. inverse_poly.append([x_orig, y_orig])
  242. return inverse_poly
  243. @staticmethod
  244. def _rotate_coordinates(poly: List[List[float]],
  245. angle: float,
  246. orig_image_size: tuple) -> List[List[float]]:
  247. """
  248. 旋转多边形坐标(与图像旋转保持一致)
  249. 参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
  250. 旋转逻辑:
  251. - 0°: 不旋转
  252. - 90°: 逆时针旋转 90°
  253. - 180°: 旋转 180°
  254. - 270°: 顺时针旋转 90°(或逆时针 270°)
  255. Args:
  256. poly: 原图像上的多边形坐标 [[x', y'], ...]
  257. angle: 旋转角度(0, 90, 180, 270)
  258. orig_image_size: 原始图像尺寸 (width, height)
  259. Returns:
  260. 旋转后的多边形坐标 [[x, y], ...]
  261. Example:
  262. >>> poly = [[100, 200], [150, 200], [150, 250], [100, 250]]
  263. >>> rotated = rotate_coordinates(poly, 90, (1000, 800))
  264. >>> print(rotated)
  265. [[200, 900], [200, 850], [250, 850], [250, 900]]
  266. """
  267. if not poly or angle == 0:
  268. return poly
  269. orig_width, orig_height = orig_image_size
  270. rotated_poly = []
  271. for point in poly:
  272. x, y = point[0], point[1]
  273. if angle == 90:
  274. # 逆时针旋转 90°
  275. # 新坐标系: 宽度=原高度, 高度=原宽度
  276. # x_new = y_old
  277. # y_new = 原宽度 - x_old
  278. new_x = y
  279. new_y = orig_width - x
  280. elif angle == 180:
  281. # 旋转 180°
  282. # 新坐标系: 宽度=原宽度, 高度=原高度
  283. # x_new = 原宽度 - x_old
  284. # y_new = 原高度 - y_old
  285. new_x = orig_width - x
  286. new_y = orig_height - y
  287. elif angle == 270:
  288. # 顺时针旋转 90°(或逆时针 270°)
  289. # 新坐标系: 宽度=原高度, 高度=原宽度
  290. # x_new = 原高度 - y_old
  291. # y_new = x_old
  292. new_x = orig_height - y
  293. new_y = x
  294. else:
  295. # 不支持的角度,保持原坐标
  296. new_x, new_y = x, y
  297. rotated_poly.append([new_x, new_y])
  298. return rotated_poly
  299. @staticmethod
  300. def _bbox_to_poly(bbox: List[float]) -> List[List[float]]:
  301. """
  302. 将 bbox 转换为多边形(4个角点,逆时针顺序)
  303. Args:
  304. bbox: 边界框 [x_min, y_min, x_max, y_max]
  305. Returns:
  306. 多边形坐标 [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
  307. 顺序:左上 -> 右上 -> 右下 -> 左下(逆时针)
  308. Example:
  309. >>> bbox = [100, 200, 150, 250]
  310. >>> poly = BBoxExtractor._bbox_to_poly(bbox)
  311. >>> print(poly)
  312. [[100, 200], [150, 200], [150, 250], [100, 250]]
  313. """
  314. if not bbox or len(bbox) < 4:
  315. return []
  316. x_min, y_min, x_max, y_max = bbox[:4]
  317. # 🎯 4个角点(逆时针顺序)
  318. poly = [
  319. [x_min, y_min], # 左上角
  320. [x_max, y_min], # 右上角
  321. [x_max, y_max], # 右下角
  322. [x_min, y_max] # 左下角
  323. ]
  324. return poly
  325. @staticmethod
  326. def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
  327. """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
  328. xs = [p[0] for p in poly]
  329. ys = [p[1] for p in poly]
  330. return [min(xs), min(ys), max(xs), max(ys)]
  331. @staticmethod
  332. def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
  333. """
  334. 提取所有表格单元格及其 bbox 信息
  335. Args:
  336. merged_data: 合并后的数据
  337. Returns:
  338. 单元格列表
  339. """
  340. import json
  341. from bs4 import BeautifulSoup
  342. cells = []
  343. for item in merged_data:
  344. if item['type'] != 'table':
  345. continue
  346. html = item.get('table_body_with_bbox', item.get('table_body', ''))
  347. soup = BeautifulSoup(html, 'html.parser')
  348. for row_idx, row in enumerate(soup.find_all('tr')):
  349. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  350. cell_text = cell.get_text(strip=True)
  351. bbox_str = cell.get('data-bbox', '')
  352. if bbox_str:
  353. try:
  354. bbox = json.loads(bbox_str)
  355. cells.append({
  356. 'text': cell_text,
  357. 'bbox': bbox,
  358. 'row': row_idx,
  359. 'col': col_idx,
  360. 'score': float(cell.get('data-score', 0)),
  361. 'paddle_index': int(cell.get('data-paddle-index', -1))
  362. })
  363. except (json.JSONDecodeError, ValueError):
  364. pass
  365. return cells