# Source: zhengchun/ocr_verify
"""
Bounding-box extraction module.

Extracts text-box information from PaddleOCR results.
"""
from typing import List, Dict
import numpy as np
from pathlib import Path


class BBoxExtractor:
    """Extract text boxes and table-cell boxes from PaddleOCR output.

    The class is a pure namespace: every method is static and no instance
    state is kept.
    """

    # Rotation angles for which an inverse coordinate transform is defined.
    _SUPPORTED_ROTATIONS = (90, 180, 270)
    # Angles that swap the width and height of the image.
    _QUARTER_TURNS = (90, 270)
    # Padding added to the largest seen coordinate when the image size has
    # to be guessed from detection results.
    _SIZE_MARGIN = 50
    # Last-resort size (width, height); labelled "default A4" in the
    # original code.
    _DEFAULT_IMAGE_SIZE = (2480, 3508)

    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
        """
        Extract the recognised text boxes from a PaddleOCR result.

        Entries whose text is empty or whitespace-only are skipped. When the
        document preprocessor reports a rotation of 90, 180 or 270 degrees,
        polygon coordinates are mapped back to the un-rotated image.

        Args:
            paddle_data: PaddleOCR output dictionary. Reads
                ``overall_ocr_res`` (``rec_texts``, ``rec_polys``,
                ``rec_scores``) and, for rotation handling,
                ``doc_preprocessor_res`` and ``input_path``.

        Returns:
            A list of dicts with keys ``text``, ``bbox``
            (``[x_min, y_min, x_max, y_max]``), ``poly``, ``score``,
            ``paddle_bbox_index`` (position in the OCR result) and ``used``
            (always ``False`` here).
        """
        text_boxes = []

        ocr_res = paddle_data.get('overall_ocr_res')
        if not ocr_res:
            return text_boxes

        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])

        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)

        # Only look up the original image size (which may open the image
        # file) when the angle is one we can actually invert. Any other
        # value leaves coordinates untouched, so the lookup would be wasted
        # work and the "rotation detected" log would be misleading.
        orig_image_size = None
        if rotation_angle in BBoxExtractor._SUPPORTED_ROTATIONS:
            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
            print(f"🔄 检测到旋转角度: {rotation_angle}°")
            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")

        for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
            if not (text and text.strip()):
                continue

            if orig_image_size:
                poly = BBoxExtractor._inverse_rotate_coordinates(
                    poly, rotation_angle, orig_image_size
                )

            text_boxes.append({
                'text': text,
                'bbox': BBoxExtractor._poly_to_bbox(poly),
                'poly': poly,
                'score': score,
                # Index into the unfiltered OCR result, not into text_boxes.
                'paddle_bbox_index': i,
                'used': False
            })

        return text_boxes

    @staticmethod
    def _get_rotation_angle(paddle_data: Dict) -> float:
        """
        Return the rotation angle reported by the document preprocessor.

        Args:
            paddle_data: PaddleOCR output dictionary.

        Returns:
            ``doc_preprocessor_res['angle']`` as a float, or ``0.0`` when the
            entry is missing, is not a dict, or holds a non-numeric value.
        """
        doc_res = paddle_data.get('doc_preprocessor_res')
        if not isinstance(doc_res, dict):
            return 0.0

        try:
            return float(doc_res.get('angle', 0.0))
        except (TypeError, ValueError):
            # e.g. angle is None or an unparsable string: treat as unrotated.
            return 0.0

    @staticmethod
    def _get_original_image_size(paddle_data: Dict) -> tuple:
        """
        Determine the size of the original (un-rotated) image.

        Resolution order:
            1. Read the image file named by ``input_path``.
            2. Estimate from ``layout_det_res`` box coordinates.
            3. Estimate from ``overall_ocr_res`` polygon coordinates.
            4. Fall back to a fixed default size.

        Estimates (2) and (3) are derived from detection coordinates, which
        this module treats as living in the rotated image. For 90/270 degree
        rotations the estimated width and height are therefore swapped to
        describe the original image.

        Args:
            paddle_data: PaddleOCR output dictionary.

        Returns:
            ``(width, height)`` tuple.
        """
        input_path = paddle_data.get('input_path')

        if input_path:
            try:
                if Path(input_path).exists():
                    # Imported lazily so the fallbacks below keep working
                    # when Pillow is unavailable.
                    from PIL import Image

                    with Image.open(input_path) as img:
                        return img.size  # (width, height)
            except Exception as e:
                # Deliberately broad: any failure to read the image just
                # moves on to the coordinate-based estimates.
                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")

        estimated = BBoxExtractor._estimate_processed_image_size(paddle_data)
        if estimated is not None:
            width, height = estimated
            if BBoxExtractor._get_rotation_angle(paddle_data) in BBoxExtractor._QUARTER_TURNS:
                # The estimate is in the rotated frame; a quarter turn swaps
                # the axes relative to the original image.
                width, height = height, width
            return (width, height)

        print("⚠️ 无法确定原始图像尺寸，使用默认值")
        return BBoxExtractor._DEFAULT_IMAGE_SIZE

    @staticmethod
    def _estimate_processed_image_size(paddle_data: Dict):
        """
        Guess the size of the image the detections were produced on.

        Tries layout-detection boxes first, then OCR polygons.

        Returns:
            ``(width, height)`` or ``None`` when no usable coordinates exist.
        """
        layout_res = paddle_data.get('layout_det_res')
        if isinstance(layout_res, dict):
            corners = []
            for box in layout_res.get('boxes') or []:
                coord = box.get('coordinate', [])
                if len(coord) >= 4:
                    # Index 2/3 are used as the box's max x / max y.
                    corners.append((coord[2], coord[3]))
            size = BBoxExtractor._size_from_points(corners)
            if size is not None:
                return size

        ocr_res = paddle_data.get('overall_ocr_res')
        if isinstance(ocr_res, dict):
            points = [
                (point[0], point[1])
                for poly in ocr_res.get('rec_polys', [])
                for point in poly
            ]
            size = BBoxExtractor._size_from_points(points)
            if size is not None:
                return size

        return None

    @staticmethod
    def _size_from_points(points):
        """
        Turn the largest x / y among ``points`` into a padded size.

        Returns:
            ``(int(max_x) + margin, int(max_y) + margin)``, or ``None`` when
            there are no points or either maximum is not positive.
        """
        max_x = max((p[0] for p in points), default=0)
        max_y = max((p[1] for p in points), default=0)
        if max_x > 0 and max_y > 0:
            margin = BBoxExtractor._SIZE_MARGIN
            return (int(max_x) + margin, int(max_y) + margin)
        return None

    @staticmethod
    def _inverse_rotate_coordinates(poly: List[List[float]],
                                    angle: float,
                                    orig_image_size: tuple) -> List[List[float]]:
        """
        Map polygon points from the rotated image back to the original image.

        The forward transforms being inverted (original size ``W x H``):
            * 90  (counter-clockwise): ``x_rot = y``,     ``y_rot = W - x``
            * 270 (clockwise):         ``x_rot = H - y``, ``y_rot = x``
            * 180:                     ``x_rot = W - x``, ``y_rot = H - y``

        Args:
            poly: Points ``[[x', y'], ...]`` in the rotated image.
            angle: Rotation angle in degrees. Only 90, 180 and 270 are
                transformed; any other value returns the points unchanged.
            orig_image_size: ``(width, height)`` of the original image.

        Returns:
            Points ``[[x, y], ...]`` in the original image.
        """
        orig_width, orig_height = orig_image_size

        inverse_poly = []
        for point in poly:
            x_rot, y_rot = point[0], point[1]

            if angle == 90:
                # y_rot spans [0, orig_width], so the offset must be the
                # original width (not the original height).
                x_orig = orig_width - y_rot
                y_orig = x_rot
            elif angle == 270:
                x_orig = y_rot
                y_orig = orig_height - x_rot
            elif angle == 180:
                x_orig = orig_width - x_rot
                y_orig = orig_height - y_rot
            else:
                # 0 degrees or an unsupported angle: leave as is.
                x_orig = x_rot
                y_orig = y_rot

            inverse_poly.append([x_orig, y_orig])

        return inverse_poly

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """
        Convert a polygon to its axis-aligned bounding box.

        Args:
            poly: Points ``[[x, y], ...]``.

        Returns:
            ``[x_min, y_min, x_max, y_max]``.

        Raises:
            ValueError: If ``poly`` is empty.
        """
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]

    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """
        Collect every table cell that carries a ``data-bbox`` attribute.

        Only items whose ``type`` is ``'table'`` are inspected. The HTML is
        taken from ``table_body_with_bbox`` and falls back to ``table_body``.
        Cells with a missing or malformed ``data-bbox``, ``data-score`` or
        ``data-paddle-index`` attribute are skipped.

        Args:
            merged_data: List of merged result items.

        Returns:
            A list of dicts with keys ``text``, ``bbox``, ``row``, ``col``,
            ``score`` (default 0.0) and ``paddle_index`` (default -1).
        """
        import json
        from bs4 import BeautifulSoup

        cells = []

        for item in merged_data:
            if item.get('type') != 'table':
                continue

            html = item.get('table_body_with_bbox') or item.get('table_body') or ''
            if not html:
                continue

            soup = BeautifulSoup(html, 'html.parser')

            for row_idx, row in enumerate(soup.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    bbox_str = cell.get('data-bbox', '')
                    if not bbox_str:
                        continue

                    try:
                        bbox = json.loads(bbox_str)
                        score = float(cell.get('data-score', 0))
                        paddle_index = int(cell.get('data-paddle-index', -1))
                    except (json.JSONDecodeError, ValueError):
                        # Best effort: a cell with unparsable attributes is
                        # dropped rather than aborting the whole extraction.
                        continue

                    cells.append({
                        'text': cell.get_text(strip=True),
                        'bbox': bbox,
                        'row': row_idx,
                        'col': col_idx,
                        'score': score,
                        'paddle_index': paddle_index
                    })

        return cells