"""
Bounding-box extraction module.

Extracts text-box information from PaddleOCR results.
"""
from typing import List, Dict
import numpy as np
from pathlib import Path


class BBoxExtractor:
    """Bounding-box extractor for PaddleOCR output and merged table data."""

    # Padding (in pixels) added to the largest coordinate seen when the image
    # size has to be inferred from detected boxes instead of the image file.
    _SIZE_MARGIN = 50

    # Last-resort size used when nothing else is available (labelled "default
    # A4 size" in the original source).
    _DEFAULT_IMAGE_SIZE = (2480, 3508)

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """
        Extract the text boxes recognised by PaddleOCR.

        Args:
            paddle_data: PaddleOCR output. Reads ``overall_ocr_res``
                (``rec_texts``, ``rec_polys``, ``rec_scores``) and, through
                the helpers, ``doc_preprocessor_res``, ``input_path`` and
                ``layout_det_res``.

        Returns:
            One dict per non-blank text with keys ``text``, ``bbox``
            (``[x_min, y_min, x_max, y_max]``), ``poly``, ``score``,
            ``paddle_bbox_index`` (position in the PaddleOCR lists) and
            ``used`` (always ``False`` here). When a rotation angle is
            reported, coordinates are mapped back to the un-rotated image.
        """
        text_boxes = []

        if 'overall_ocr_res' not in paddle_data:
            return text_boxes

        ocr_res = paddle_data['overall_ocr_res']
        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])

        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)

        # The original image size is only needed to undo a rotation.
        orig_image_size = None
        if rotation_angle != 0:
            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
            print(f"🔄 检测到旋转角度: {rotation_angle}°")
            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")

        for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
            # Skip empty / whitespace-only recognitions; the index ``i`` still
            # refers to the position in the original PaddleOCR lists.
            if not (text and text.strip()):
                continue

            if rotation_angle != 0 and orig_image_size:
                poly = BBoxExtractor._inverse_rotate_coordinates(
                    poly, rotation_angle, orig_image_size
                )

            text_boxes.append({
                'text': text,
                'bbox': BBoxExtractor._poly_to_bbox(poly),
                'poly': poly,
                'score': score,
                'paddle_bbox_index': i,
                'used': False
            })

        return text_boxes

    @staticmethod
    def _get_rotation_angle(paddle_data: Dict) -> float:
        """
        Return the rotation angle stored in ``doc_preprocessor_res['angle']``.

        Returns ``0.0`` when the key is missing, when the preprocessor result
        is not a dict, or when it has no ``angle`` entry.
        """
        doc_res = paddle_data.get('doc_preprocessor_res')
        if isinstance(doc_res, dict) and 'angle' in doc_res:
            return float(doc_res['angle'])
        return 0.0

    @staticmethod
    def _get_original_image_size(paddle_data: Dict) -> tuple:
        """
        Determine the size of the original image.

        Sources are tried in order:
          1. the image file at ``paddle_data['input_path']``;
          2. the largest ``coordinate`` corner in ``layout_det_res['boxes']``
             plus a margin;
          3. the largest point in ``overall_ocr_res['rec_polys']`` plus a
             margin;
          4. a fixed default size.

        NOTE(review): fallbacks 2 and 3 are derived from detection
        coordinates; if those are expressed in the rotated image, width and
        height would be swapped for 90/270 degree rotations — confirm
        against the producer of ``paddle_data``.

        Args:
            paddle_data: PaddleOCR output data.

        Returns:
            ``(width, height)`` tuple.
        """
        margin = BBoxExtractor._SIZE_MARGIN

        # 1) Read the size from the image file itself.
        input_path = paddle_data.get('input_path')
        if input_path and Path(input_path).exists():
            try:
                # Imported lazily and inside the try so that a missing or
                # broken Pillow install degrades to the fallbacks below
                # instead of raising.
                from PIL import Image

                with Image.open(input_path) as img:
                    return img.size  # (width, height)
            except Exception as e:
                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")

        # 2) Fallback: infer from the layout-detection boxes.
        if 'layout_det_res' in paddle_data:
            layout_res = paddle_data['layout_det_res']
            if 'boxes' in layout_res and layout_res['boxes']:
                max_x = 0
                max_y = 0
                for box in layout_res['boxes']:
                    coord = box.get('coordinate', [])
                    if len(coord) >= 4:
                        max_x = max(max_x, coord[2])
                        max_y = max(max_y, coord[3])
                if max_x > 0 and max_y > 0:
                    return (int(max_x) + margin, int(max_y) + margin)

        # 3) Last fallback: infer from the OCR polygons.
        if 'overall_ocr_res' in paddle_data:
            rec_polys = paddle_data['overall_ocr_res'].get('rec_polys', [])
            if rec_polys:
                max_x = 0
                max_y = 0
                for poly in rec_polys:
                    for point in poly:
                        max_x = max(max_x, point[0])
                        max_y = max(max_y, point[1])
                if max_x > 0 and max_y > 0:
                    return (int(max_x) + margin, int(max_y) + margin)

        # 4) Nothing usable: fall back to the default size.
        print("⚠️ 无法确定原始图像尺寸,使用默认值")
        return BBoxExtractor._DEFAULT_IMAGE_SIZE

    @staticmethod
    def _inverse_rotate_coordinates(poly: List[List[float]],
                                    angle: float,
                                    orig_image_size: tuple) -> List[List[float]]:
        """
        Map polygon points from the rotated image back to the original image.

        The forward transforms being inverted (W, H = original width/height):
          * 90:  ``image.rotate(90, expand=True)``,  (x, y) -> (y, W - x)
          * 270: ``image.rotate(-90, expand=True)``, (x, y) -> (H - y, x)
          * 180: ``image.rotate(180)``,              (x, y) -> (W - x, H - y)

        NOTE(review): the pairing of angle values with rotation directions
        follows the comments of the original implementation (which cites
        ``ocr_validator_utils.rotate_image_and_coordinates``); confirm it
        matches the angle convention of the OCR pipeline in use.

        Args:
            poly: Polygon on the rotated image, ``[[x', y'], ...]``.
            angle: Rotation angle in degrees. Only 90, 180 and 270 are
                transformed; any other value copies the points unchanged.
            orig_image_size: Original image size ``(width, height)``.

        Returns:
            Polygon on the original image, ``[[x, y], ...]``.
        """
        orig_width, orig_height = orig_image_size

        inverse_poly = []
        for point in poly:
            x_rot, y_rot = point[0], point[1]

            if angle == 90:
                # Rotated image is H wide and W tall, so y_rot spans [0, W].
                # The previous code subtracted from the rotated *width* (H),
                # which is only correct for square images.
                x_orig = orig_width - y_rot
                y_orig = x_rot
            elif angle == 270:
                # x_rot spans [0, H] (the rotated width).
                x_orig = y_rot
                y_orig = orig_height - x_rot
            elif angle == 180:
                x_orig = orig_width - x_rot
                y_orig = orig_height - y_rot
            else:
                # 0 or any unsupported angle: leave the point as is.
                x_orig = x_rot
                y_orig = y_rot

            inverse_poly.append([x_orig, y_orig])

        return inverse_poly

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """
        Convert a polygon to its axis-aligned bbox ``[x_min, y_min, x_max, y_max]``.

        Raises:
            ValueError: If ``poly`` is empty (``min``/``max`` of an empty list).
        """
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """
        Collect every table cell that carries a ``data-bbox`` attribute.

        For each item whose ``type`` is ``'table'`` the HTML in
        ``table_body_with_bbox`` (or ``table_body`` when that key is absent)
        is parsed and its ``<td>``/``<th>`` cells are scanned.

        Args:
            merged_data: Merged data items.

        Returns:
            List of dicts with keys ``text``, ``bbox`` (parsed from the JSON
            in ``data-bbox``), ``row``, ``col`` (indices within the table),
            ``score`` (``data-score``, default 0) and ``paddle_index``
            (``data-paddle-index``, default -1). Cells without ``data-bbox``
            or with unparsable attributes are skipped.
        """
        import json
        from bs4 import BeautifulSoup

        cells = []

        for item in merged_data:
            # .get(): items lacking a 'type' key are simply not tables.
            if item.get('type') != 'table':
                continue

            # ``or ''`` guards against an explicit None table body.
            html = item.get('table_body_with_bbox', item.get('table_body', '')) or ''
            soup = BeautifulSoup(html, 'html.parser')

            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    bbox_str = cell.get('data-bbox', '')
                    if not bbox_str:
                        continue

                    cell_text = cell.get_text(strip=True)
                    try:
                        bbox = json.loads(bbox_str)
                        cells.append({
                            'text': cell_text,
                            'bbox': bbox,
                            'row': row_idx,
                            'col': col_idx,
                            'score': float(cell.get('data-score', 0)),
                            'paddle_index': int(cell.get('data-paddle-index', -1))
                        })
                    except (json.JSONDecodeError, ValueError):
                        # Best effort: a cell with malformed bbox/score/index
                        # attributes is skipped rather than aborting the run.
                        continue

        return cells