| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- """
- bbox 提取模块
- 负责从 PaddleOCR 结果中提取文字框信息
- """
- from typing import List, Dict
class BBoxExtractor:
    """Helpers that pull text-box / table-cell geometry out of OCR results."""

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """
        Extract text boxes from a PaddleOCR result dictionary.

        Reads the parallel lists ``rec_texts``, ``rec_polys`` and
        ``rec_scores`` under ``paddle_data['overall_ocr_res']`` and returns
        one entry per non-blank text.

        Args:
            paddle_data: PaddleOCR output; expected to hold an
                ``'overall_ocr_res'`` mapping.

        Returns:
            A list of dicts with the keys ``text``, ``bbox``
            (``[x_min, y_min, x_max, y_max]``), ``poly``, ``score``,
            ``paddle_bbox_index`` (position in ``rec_texts``) and ``used``
            (always ``False`` on creation). Empty when the result mapping is
            missing or empty.
        """
        ocr_res = paddle_data.get('overall_ocr_res')
        if not ocr_res:
            return []

        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])
        score_count = len(rec_scores)

        text_boxes = []
        # Only texts and polygons are zipped: a box needs both, but a missing
        # score must not discard an otherwise valid box, so it defaults to 0.0.
        for i, (text, poly) in enumerate(zip(rec_texts, rec_polys)):
            if not (text and text.strip()):
                continue

            text_boxes.append({
                'text': text,
                'bbox': BBoxExtractor._poly_to_bbox(poly),
                'poly': poly,
                'score': rec_scores[i] if i < score_count else 0.0,
                'paddle_bbox_index': i,
                'used': False,
            })
        return text_boxes

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """
        Convert a polygon into its axis-aligned bounding box.

        Args:
            poly: Sequence of points; index 0 of each point is x, index 1 is y.

        Returns:
            ``[x_min, y_min, x_max, y_max]``.

        Raises:
            ValueError: If ``poly`` contains no points.
        """
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def _coerce(raw, cast, default):
        """Return ``cast(raw)``, or ``default`` when the value cannot be converted."""
        try:
            return cast(raw)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """
        Extract every table cell that carries a ``data-bbox`` attribute.

        Only items whose ``'type'`` is ``'table'`` are inspected. Their HTML is
        taken from ``'table_body_with_bbox'`` (falling back to
        ``'table_body'``) and every ``<td>``/``<th>`` with a JSON-decodable
        ``data-bbox`` attribute becomes one result entry.

        Args:
            merged_data: Merged layout items; each is a dict.

        Returns:
            A list of dicts with the keys ``text``, ``bbox``, ``row``, ``col``
            (indices of the ``<tr>`` within the table and of the cell within
            its row), ``score`` (``data-score``, default 0.0) and
            ``paddle_index`` (``data-paddle-index``, default -1). Malformed
            score/index attributes fall back to those defaults; cells whose
            ``data-bbox`` is absent or not valid JSON are skipped.
        """
        # Items without a 'type' key are simply not tables.
        tables = [item for item in merged_data if item.get('type') == 'table']
        if not tables:
            return []

        # Imported lazily so bs4 is only required when there is HTML to parse.
        import json
        from bs4 import BeautifulSoup

        cells = []
        for item in tables:
            html = item.get('table_body_with_bbox') or item.get('table_body') or ''
            soup = BeautifulSoup(html, 'html.parser')

            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    bbox_str = cell.get('data-bbox', '')
                    if not bbox_str:
                        continue

                    try:
                        bbox = json.loads(bbox_str)
                    except json.JSONDecodeError:
                        # Without a usable bbox the cell cannot be located.
                        continue

                    cells.append({
                        'text': cell.get_text(strip=True),
                        'bbox': bbox,
                        'row': row_idx,
                        'col': col_idx,
                        'score': BBoxExtractor._coerce(
                            cell.get('data-score', 0), float, 0.0),
                        'paddle_index': BBoxExtractor._coerce(
                            cell.get('data-paddle-index', -1), int, -1),
                    })

        return cells
|