# zhengchun/ocr_verify
"""
bbox 提取模块
负责从 PaddleOCR 结果中提取文字框信息
"""
from typing import List, Dict, Tuple
import numpy as np
from pathlib import Path


class BBoxExtractor:
    """bbox 提取器"""
    
    @staticmethod
    def extract_paddle_text_boxes(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
        """
        提取 PaddleOCR 的文字框信息
        
        Args:
            paddle_data: PaddleOCR 输出的数据
        
        Returns:
            文字框列表（保持旋转后的angle角度）和旋转角度
        """
        text_boxes = []
        rotation_angle = 0.0
        orig_image_size = (0,0)
        
        if 'overall_ocr_res' not in paddle_data:
            return text_boxes, rotation_angle, orig_image_size
        
        ocr_res = paddle_data['overall_ocr_res']
        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])
        
        # 🎯 获取旋转角度
        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
        if rotation_angle != 0:
            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
            print(f"🔄 检测到旋转角度: {rotation_angle}°")
            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
                
        for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
            if text and text.strip():
                # 计算 bbox (x_min, y_min, x_max, y_max)
                bbox = BBoxExtractor._poly_to_bbox(poly)
                
                text_boxes.append({
                    'text': text,
                    'bbox': bbox,
                    'poly': poly,
                    'score': score,
                    'paddle_bbox_index': i,
                    'used': False
                })
        
        return text_boxes, rotation_angle, orig_image_size
    
    @staticmethod
    def extract_paddle_text_boxes_inverse_rotate(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
        """
        提取 PaddleOCR 的文字框信息
        
        Args:
            paddle_data: PaddleOCR 输出的数据
        
        Returns:
            文字框列表（坐标已转换为 angle=0 时的坐标）
        """
        text_boxes = []
        rotation_angle = 0.0
        orig_image_size = (0,0)
        
        if 'overall_ocr_res' not in paddle_data:
            return text_boxes, rotation_angle, orig_image_size
        
        ocr_res = paddle_data['overall_ocr_res']
        rec_texts = ocr_res.get('rec_texts', [])
        rec_polys = ocr_res.get('rec_polys', [])
        rec_scores = ocr_res.get('rec_scores', [])
        
        # 🎯 获取旋转角度
        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
        
        if rotation_angle != 0:
            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
            print(f"🔄 检测到旋转角度: {rotation_angle}°")
            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
        
        for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
            if text and text.strip():
                # 🎯 如果有旋转角度，转换坐标
                if rotation_angle != 0 and orig_image_size:
                    poly = BBoxExtractor._inverse_rotate_coordinates(
                        poly, rotation_angle, orig_image_size
                    )
                
                # 计算 bbox (x_min, y_min, x_max, y_max)
                bbox = BBoxExtractor._poly_to_bbox(poly)
                
                text_boxes.append({
                    'text': text,
                    'bbox': bbox,
                    'poly': poly,
                    'score': score,
                    'paddle_bbox_index': i,
                    'used': False
                })
        
        return text_boxes, rotation_angle, orig_image_size
    
    @staticmethod
    def _get_rotation_angle(paddle_data: Dict) -> float:
        """获取旋转角度"""
        if 'doc_preprocessor_res' not in paddle_data:
            return 0.0
        
        doc_res = paddle_data['doc_preprocessor_res']
        if isinstance(doc_res, dict) and 'angle' in doc_res:
            return float(doc_res['angle'])
        
        return 0.0
    
    @staticmethod
    def _get_original_image_size(paddle_data: Dict) -> tuple:
        """
        获取原始图像尺寸（从图片文件读取）
        
        Args:
            paddle_data: PaddleOCR 数据
        
        Returns:
            (width, height) 元组
        """
        from PIL import Image
        
        # 🎯 从 input_path 读取图像
        input_path = paddle_data.get('input_path')
        
        if input_path and Path(input_path).exists():
            try:
                with Image.open(input_path) as img:
                    # 返回原始图像尺寸
                    return img.size  # (width, height)
            except Exception as e:
                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")
        
        # 🎯 降级方案：从 layout_det_res 推断
        if 'layout_det_res' in paddle_data:
            layout_res = paddle_data['layout_det_res']
            if 'boxes' in layout_res and layout_res['boxes']:
                max_x = 0
                max_y = 0
                for box in layout_res['boxes']:
                    coord = box.get('coordinate', [])
                    if len(coord) >= 4:
                        max_x = max(max_x, coord[2])
                        max_y = max(max_y, coord[3])
                
                if max_x > 0 and max_y > 0:
                    return (int(max_x) + 50, int(max_y) + 50)
        
        # 🎯 最后降级：从 overall_ocr_res 推断
        if 'overall_ocr_res' in paddle_data:
            ocr_res = paddle_data['overall_ocr_res']
            rec_polys = ocr_res.get('rec_polys', [])
            if rec_polys:
                max_x = 0
                max_y = 0
                for poly in rec_polys:
                    for point in poly:
                        max_x = max(max_x, point[0])
                        max_y = max(max_y, point[1])
                
                if max_x > 0 and max_y > 0:
                    return (int(max_x) + 50, int(max_y) + 50)
        
        # 🎯 默认 A4 尺寸
        print("⚠️ 无法确定原始图像尺寸，使用默认值")
        return (2480, 3508)
    
    @staticmethod
    def rotate_box_coordinates(bbox: List[float], 
                             angle: float,
                             orig_image_size: tuple) -> List[float]:
        """
        旋转 bbox 坐标（与图像旋转保持一致）
        
        参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
        
        旋转逻辑：
        - 0°: 不旋转
        - 90°: 逆时针旋转 90°
        - 180°: 旋转 180°
        - 270°: 顺时针旋转 90°（或逆时针 270°）
        
        Args:
            bbox: 原图像上的边界框 [x_min, y_min, x_max, y_max]
            angle: 旋转角度（0, 90, 180, 270）
            orig_image_size: 原始图像尺寸 (width, height)
        """
        poly = BBoxExtractor._bbox_to_poly(bbox)
        rotated_poly = BBoxExtractor._rotate_coordinates(poly, angle, orig_image_size)
        rotated_bbox = BBoxExtractor._poly_to_bbox(rotated_poly)
        return rotated_bbox

    @staticmethod
    def inverse_rotate_box_coordinates(bbox: List[float], 
                                    angle: float,
                                    orig_image_size: tuple) -> List[float]:
        """
        反向旋转 bbox 坐标
        
        参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
        
        PaddleOCR 在旋转后的图像上识别，坐标是旋转后的
        我们需要将坐标转换回原始图像（未旋转）
        
        Args:
            bbox: 旋转后图像上的边界框 [x_min, y_min, x_max, y_max]
            angle: 旋转角度（度数，PaddleX 使用的角度）
            orig_image_size: 原始图像尺寸 (width, height)
        """
        poly = BBoxExtractor._bbox_to_poly(bbox)
        inverse_poly = BBoxExtractor._inverse_rotate_coordinates(poly, angle, orig_image_size)
        inverse_bbox = BBoxExtractor._poly_to_bbox(inverse_poly)
        return inverse_bbox

    @staticmethod
    def _inverse_rotate_coordinates(poly: List[List[float]], 
                                    angle: float,
                                    orig_image_size: tuple) -> List[List[float]]:
        """
        反向旋转坐标
        
        参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
        
        PaddleOCR 在旋转后的图像上识别，坐标是旋转后的
        我们需要将坐标转换回原始图像（未旋转）
        
        Args:
            poly: 旋转后图像上的多边形坐标 [[x',y'], ...]
            angle: 旋转角度（度数，PaddleX 使用的角度）
            orig_image_size: 原始图像尺寸 (width, height)
        
        Returns:
            原始图像上的多边形坐标 [[x,y], ...]
        """
        orig_width, orig_height = orig_image_size
        
        # 🎯 根据旋转角度计算旋转后的图像尺寸
        if angle == 90:
            rotated_width, rotated_height = orig_height, orig_width
        elif angle == 270:
            rotated_width, rotated_height = orig_height, orig_width
        else:
            rotated_width, rotated_height = orig_width, orig_height
        
        inverse_poly = []
        
        for point in poly:
            x_rot, y_rot = point[0], point[1]  # 旋转后的坐标
            
            # 🎯 反向旋转（参考 rotate_image_and_coordinates 的逆操作）
            if angle == 90:
                # 正向: rotated = image.rotate(90, expand=True)
                #      x_rot = y_orig
                #      y_rot = rotated_width - x_orig = orig_height - x_orig
                # 反向: x_orig = rotated_width - y_rot = orig_height - y_rot
                #      y_orig = x_rot
                x_orig = rotated_width - y_rot
                y_orig = x_rot
                
            elif angle == 270:
                # 正向: rotated = image.rotate(-90, expand=True)
                #      x_rot = rotated_width - y_orig = orig_height - y_orig
                #      y_rot = x_orig
                # 反向: y_orig = rotated_width - x_rot = orig_height - x_rot
                #      x_orig = y_rot
                x_orig = y_rot
                y_orig = rotated_width - x_rot
                
            elif angle == 180:
                # 正向: rotated = image.rotate(180)
                #      x_rot = orig_width - x_orig
                #      y_rot = orig_height - y_orig
                # 反向: x_orig = orig_width - x_rot
                #      y_orig = orig_height - y_rot
                x_orig = orig_width - x_rot
                y_orig = orig_height - y_rot
                
            else:
                # 其他角度或0度，不转换
                x_orig = x_rot
                y_orig = y_rot
            
            inverse_poly.append([x_orig, y_orig])
        
        return inverse_poly
    
    @staticmethod
    def _rotate_coordinates(poly: List[List[float]], 
                        angle: float,
                        orig_image_size: tuple) -> List[List[float]]:
        """
        旋转多边形坐标（与图像旋转保持一致）
        
        参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
        
        旋转逻辑：
        - 0°: 不旋转
        - 90°: 逆时针旋转 90°
        - 180°: 旋转 180°
        - 270°: 顺时针旋转 90°（或逆时针 270°）
        
        Args:
            poly: 原图像上的多边形坐标 [[x', y'], ...]
            angle: 旋转角度（0, 90, 180, 270）
            orig_image_size: 原始图像尺寸 (width, height)
        
        Returns:
            旋转后的多边形坐标 [[x, y], ...]
        
        Example:
            >>> poly = [[100, 200], [150, 200], [150, 250], [100, 250]]
            >>> rotated = rotate_coordinates(poly, 90, (1000, 800))
            >>> print(rotated)
            [[200, 900], [200, 850], [250, 850], [250, 900]]
        """
        if not poly or angle == 0:
            return poly
        
        orig_width, orig_height = orig_image_size
        rotated_poly = []
        
        for point in poly:
            x, y = point[0], point[1]
            
            if angle == 90:
                # 逆时针旋转 90°
                # 新坐标系: 宽度=原高度, 高度=原宽度
                # x_new = y_old
                # y_new = 原宽度 - x_old
                new_x = y
                new_y = orig_width - x
                
            elif angle == 180:
                # 旋转 180°
                # 新坐标系: 宽度=原宽度, 高度=原高度
                # x_new = 原宽度 - x_old
                # y_new = 原高度 - y_old
                new_x = orig_width - x
                new_y = orig_height - y
                
            elif angle == 270:
                # 顺时针旋转 90°（或逆时针 270°）
                # 新坐标系: 宽度=原高度, 高度=原宽度
                # x_new = 原高度 - y_old
                # y_new = x_old
                new_x = orig_height - y
                new_y = x
                
            else:
                # 不支持的角度，保持原坐标
                new_x, new_y = x, y
            
            rotated_poly.append([new_x, new_y])
        
        return rotated_poly

    @staticmethod
    def _bbox_to_poly(bbox: List[float]) -> List[List[float]]:
        """
        将 bbox 转换为多边形（4个角点，逆时针顺序）
        
        Args:
            bbox: 边界框 [x_min, y_min, x_max, y_max]
        
        Returns:
            多边形坐标 [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
            顺序：左上 -> 右上 -> 右下 -> 左下（逆时针）
        
        Example:
            >>> bbox = [100, 200, 150, 250]
            >>> poly = BBoxExtractor._bbox_to_poly(bbox)
            >>> print(poly)
            [[100, 200], [150, 200], [150, 250], [100, 250]]
        """
        if not bbox or len(bbox) < 4:
            return []
        
        x_min, y_min, x_max, y_max = bbox[:4]
        
        # 🎯 4个角点（逆时针顺序）
        poly = [
            [x_min, y_min],  # 左上角
            [x_max, y_min],  # 右上角
            [x_max, y_max],  # 右下角
            [x_min, y_max]   # 左下角
        ]
        
        return poly

    @staticmethod
    def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
        """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
        xs = [p[0] for p in poly]
        ys = [p[1] for p in poly]
        return [min(xs), min(ys), max(xs), max(ys)]
    
    @staticmethod
    def extract_table_cells_with_bbox(merged_data: List[Dict]) -> List[Dict]:
        """
        Collect every table cell that carries a ``data-bbox`` attribute.

        Only items whose ``type`` is ``'table'`` are inspected. Their HTML is
        taken from ``table_body_with_bbox`` (falling back to ``table_body``)
        and parsed with BeautifulSoup.

        Args:
            merged_data: Merged result items; each item must have a ``type`` key.

        Returns:
            One dict per ``td`` / ``th`` cell with a usable ``data-bbox``,
            holding ``text``, ``bbox`` (the JSON-decoded attribute), ``row``,
            ``col``, ``score`` (``data-score``, default 0) and
            ``paddle_index`` (``data-paddle-index``, default -1). Cells whose
            attributes cannot be decoded or converted are skipped.
        """
        import json
        from bs4 import BeautifulSoup

        collected = []

        for item in merged_data:
            if item['type'] != 'table':
                continue

            fallback_html = item.get('table_body', '')
            markup = item.get('table_body_with_bbox', fallback_html)
            document = BeautifulSoup(markup, 'html.parser')

            for row_idx, row in enumerate(document.find_all('tr')):
                for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                    cell_text = cell.get_text(strip=True)
                    raw_bbox = cell.get('data-bbox', '')

                    # Cells without bbox information are not reported.
                    if not raw_bbox:
                        continue

                    try:
                        bbox = json.loads(raw_bbox)
                        score = float(cell.get('data-score', 0))
                        paddle_index = int(cell.get('data-paddle-index', -1))
                    except (json.JSONDecodeError, ValueError):
                        # Malformed attribute values: skip this cell.
                        continue

                    collected.append({
                        'text': cell_text,
                        'bbox': bbox,
                        'row': row_idx,
                        'col': col_idx,
                        'score': score,
                        'paddle_index': paddle_index
                    })

        return collected