""" bbox 提取模块 负责从 PaddleOCR 结果中提取文字框信息 """ from typing import List, Dict class BBoxExtractor: """bbox 提取器""" @staticmethod def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]: """ 提取 PaddleOCR 的文字框信息 Args: paddle_data: PaddleOCR 输出的数据 Returns: 文字框列表 """ text_boxes = [] if 'overall_ocr_res' not in paddle_data: return text_boxes ocr_res = paddle_data['overall_ocr_res'] rec_texts = ocr_res.get('rec_texts', []) rec_polys = ocr_res.get('rec_polys', []) rec_scores = ocr_res.get('rec_scores', []) for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)): if text and text.strip(): # 计算 bbox (x_min, y_min, x_max, y_max) bbox = BBoxExtractor._poly_to_bbox(poly) text_boxes.append({ 'text': text, 'bbox': bbox, 'poly': poly, 'score': score, 'paddle_bbox_index': i, 'used': False }) return text_boxes @staticmethod def _poly_to_bbox(poly: List[List[float]]) -> List[float]: """将多边形转换为 bbox""" xs = [p[0] for p in poly] ys = [p[1] for p in poly] return [min(xs), min(ys), max(xs), max(ys)] @staticmethod def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]: """ 提取所有表格单元格及其 bbox 信息 Args: merged_data: 合并后的数据 Returns: 单元格列表 """ import json from bs4 import BeautifulSoup cells = [] for item in merged_data: if item['type'] != 'table': continue html = item.get('table_body_with_bbox', item.get('table_body', '')) soup = BeautifulSoup(html, 'html.parser') for row_idx, row in enumerate(soup.find_all('tr')): for col_idx, cell in enumerate(row.find_all(['td', 'th'])): cell_text = cell.get_text(strip=True) bbox_str = cell.get('data-bbox', '') if bbox_str: try: bbox = json.loads(bbox_str) cells.append({ 'text': cell_text, 'bbox': bbox, 'row': row_idx, 'col': col_idx, 'score': float(cell.get('data-score', 0)), 'paddle_index': int(cell.get('data-paddle-index', -1)) }) except (json.JSONDecodeError, ValueError): pass return cells