bbox_extractor.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
"""
bbox extraction module.

Extracts text-box information from PaddleOCR results.
"""
  5. from typing import List, Dict
  6. class BBoxExtractor:
  7. """bbox 提取器"""
  8. @staticmethod
  9. def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
  10. """
  11. 提取 PaddleOCR 的文字框信息
  12. Args:
  13. paddle_data: PaddleOCR 输出的数据
  14. Returns:
  15. 文字框列表
  16. """
  17. text_boxes = []
  18. if 'overall_ocr_res' not in paddle_data:
  19. return text_boxes
  20. ocr_res = paddle_data['overall_ocr_res']
  21. rec_texts = ocr_res.get('rec_texts', [])
  22. rec_polys = ocr_res.get('rec_polys', [])
  23. rec_scores = ocr_res.get('rec_scores', [])
  24. for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
  25. if text and text.strip():
  26. # 计算 bbox (x_min, y_min, x_max, y_max)
  27. bbox = BBoxExtractor._poly_to_bbox(poly)
  28. text_boxes.append({
  29. 'text': text,
  30. 'bbox': bbox,
  31. 'poly': poly,
  32. 'score': score,
  33. 'paddle_bbox_index': i,
  34. 'used': False
  35. })
  36. return text_boxes
  37. @staticmethod
  38. def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
  39. """将多边形转换为 bbox"""
  40. xs = [p[0] for p in poly]
  41. ys = [p[1] for p in poly]
  42. return [min(xs), min(ys), max(xs), max(ys)]
  43. @staticmethod
  44. def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
  45. """
  46. 提取所有表格单元格及其 bbox 信息
  47. Args:
  48. merged_data: 合并后的数据
  49. Returns:
  50. 单元格列表
  51. """
  52. import json
  53. from bs4 import BeautifulSoup
  54. cells = []
  55. for item in merged_data:
  56. if item['type'] != 'table':
  57. continue
  58. html = item.get('table_body_with_bbox', item.get('table_body', ''))
  59. soup = BeautifulSoup(html, 'html.parser')
  60. for row_idx, row in enumerate(soup.find_all('tr')):
  61. for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
  62. cell_text = cell.get_text(strip=True)
  63. bbox_str = cell.get('data-bbox', '')
  64. if bbox_str:
  65. try:
  66. bbox = json.loads(bbox_str)
  67. cells.append({
  68. 'text': cell_text,
  69. 'bbox': bbox,
  70. 'row': row_idx,
  71. 'col': col_idx,
  72. 'score': float(cell.get('data-score', 0)),
  73. 'paddle_index': int(cell.get('data-paddle-index', -1))
  74. })
  75. except (json.JSONDecodeError, ValueError):
  76. pass
  77. return cells