il y a 1 semaine · 2ec53f5194
--- a/merger/bbox_extractor.py
+++ b/merger/bbox_extractor.py
@@ -3,6 +3,8 @@ bbox 提取模块
 
				 负责从 PaddleOCR 结果中提取文字框信息
			
 
				 """
			
 
				 from typing import List, Dict
			
 
				+import numpy as np
			
 
				+from pathlib import Path
			
 
				 
			
 
				 
			
 
				 class BBoxExtractor:
			
@@ -17,7 +19,7 @@ class BBoxExtractor:
 
				             paddle_data: PaddleOCR 输出的数据
			
 
				         
			
 
				         Returns:
			
 
				-            文字框列表
			
 
				+            文字框列表（坐标已转换为 angle=0 时的坐标）
			
 
				         """
			
 
				         text_boxes = []
			
 
				         
			
@@ -28,9 +30,26 @@ class BBoxExtractor:
 
				         rec_texts = ocr_res.get('rec_texts', [])
			
 
				         rec_polys = ocr_res.get('rec_polys', [])
			
 
				         rec_scores = ocr_res.get('rec_scores', [])
			
 
				-
			
 
				+        
			
 
				+        # 🎯 获取旋转角度
			
 
				+        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
			
 
				+        
			
 
				+        # 🎯 如果有旋转，需要获取原始图像尺寸
			
 
				+        orig_image_size = None
			
 
				+        
			
 
				+        if rotation_angle != 0:
			
 
				+            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
			
 
				+            print(f"🔄 检测到旋转角度: {rotation_angle}°")
			
 
				+            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
			
 
				+        
			
 
				         for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
			
 
				             if text and text.strip():
			
 
				+                # 🎯 如果有旋转角度，转换坐标
			
 
				+                if rotation_angle != 0 and orig_image_size:
			
 
				+                    poly = BBoxExtractor._inverse_rotate_coordinates(
			
 
				+                        poly, rotation_angle, orig_image_size
			
 
				+                    )
			
 
				+                
			
 
				                 # 计算 bbox (x_min, y_min, x_max, y_max)
			
 
				                 bbox = BBoxExtractor._poly_to_bbox(poly)
			
 
				                 
			
@@ -42,12 +61,154 @@ class BBoxExtractor:
 
				                     'paddle_bbox_index': i,
			
 
				                     'used': False
			
 
				                 })
			
 
				-
			
 
				+        
			
 
				         return text_boxes
			
 
				     
			
 
				     @staticmethod
			
 
				+    def _get_rotation_angle(paddle_data: Dict) -> float:
			
 
				+        """获取旋转角度"""
			
 
				+        if 'doc_preprocessor_res' not in paddle_data:
			
 
				+            return 0.0
			
 
				+        
			
 
				+        doc_res = paddle_data['doc_preprocessor_res']
			
 
				+        if isinstance(doc_res, dict) and 'angle' in doc_res:
			
 
				+            return float(doc_res['angle'])
			
 
				+        
			
 
				+        return 0.0
			
 
				+    
			
 
				+    @staticmethod
			
 
				+    def _get_original_image_size(paddle_data: Dict) -> tuple:
			
 
				+        """
			
 
				+        获取原始图像尺寸（从图片文件读取）
			
 
				+        
			
 
				+        Args:
			
 
				+            paddle_data: PaddleOCR 数据
			
 
				+        
			
 
				+        Returns:
			
 
				+            (width, height) 元组
			
 
				+        """
			
 
				+        from PIL import Image
			
 
				+        
			
 
				+        # 🎯 从 input_path 读取图像
			
 
				+        input_path = paddle_data.get('input_path')
			
 
				+        
			
 
				+        if input_path and Path(input_path).exists():
			
 
				+            try:
			
 
				+                with Image.open(input_path) as img:
			
 
				+                    # 返回原始图像尺寸
			
 
				+                    return img.size  # (width, height)
			
 
				+            except Exception as e:
			
 
				+                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")
			
 
				+        
			
 
				+        # 🎯 降级方案：从 layout_det_res 推断
			
 
				+        if 'layout_det_res' in paddle_data:
			
 
				+            layout_res = paddle_data['layout_det_res']
			
 
				+            if 'boxes' in layout_res and layout_res['boxes']:
			
 
				+                max_x = 0
			
 
				+                max_y = 0
			
 
				+                for box in layout_res['boxes']:
			
 
				+                    coord = box.get('coordinate', [])
			
 
				+                    if len(coord) >= 4:
			
 
				+                        max_x = max(max_x, coord[2])
			
 
				+                        max_y = max(max_y, coord[3])
			
 
				+                
			
 
				+                if max_x > 0 and max_y > 0:
			
 
				+                    return (int(max_x) + 50, int(max_y) + 50)
			
 
				+        
			
 
				+        # 🎯 最后降级：从 overall_ocr_res 推断
			
 
				+        if 'overall_ocr_res' in paddle_data:
			
 
				+            ocr_res = paddle_data['overall_ocr_res']
			
 
				+            rec_polys = ocr_res.get('rec_polys', [])
			
 
				+            if rec_polys:
			
 
				+                max_x = 0
			
 
				+                max_y = 0
			
 
				+                for poly in rec_polys:
			
 
				+                    for point in poly:
			
 
				+                        max_x = max(max_x, point[0])
			
 
				+                        max_y = max(max_y, point[1])
			
 
				+                
			
 
				+                if max_x > 0 and max_y > 0:
			
 
				+                    return (int(max_x) + 50, int(max_y) + 50)
			
 
				+        
			
 
				+        # 🎯 默认 A4 尺寸
			
 
				+        print("⚠️ 无法确定原始图像尺寸，使用默认值")
			
 
				+        return (2480, 3508)
			
 
				+    
			
 
				+    @staticmethod
			
 
				+    def _inverse_rotate_coordinates(poly: List[List[float]], 
			
 
				+                                    angle: float,
			
 
				+                                    orig_image_size: tuple) -> List[List[float]]:
			
 
				+        """
			
 
				+        反向旋转坐标
			
 
				+        
			
 
				+        参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
			
 
				+        
			
 
				+        PaddleOCR 在旋转后的图像上识别，坐标是旋转后的
			
 
				+        我们需要将坐标转换回原始图像（未旋转）
			
 
				+        
			
 
				+        Args:
			
 
				+            poly: 旋转后图像上的多边形坐标 [[x',y'], ...]
			
 
				+            angle: 旋转角度（度数，PaddleX 使用的角度）
			
 
				+            orig_image_size: 原始图像尺寸 (width, height)
			
 
				+        
			
 
				+        Returns:
			
 
				+            原始图像上的多边形坐标 [[x,y], ...]
			
 
				+        """
			
 
				+        orig_width, orig_height = orig_image_size
			
 
				+        
			
 
				+        # 🎯 根据旋转角度计算旋转后的图像尺寸
			
 
				+        if angle == 90:
			
 
				+            rotated_width, rotated_height = orig_height, orig_width
			
 
				+        elif angle == 270:
			
 
				+            rotated_width, rotated_height = orig_height, orig_width
			
 
				+        else:
			
 
				+            rotated_width, rotated_height = orig_width, orig_height
			
 
				+        
			
 
				+        inverse_poly = []
			
 
				+        
			
 
				+        for point in poly:
			
 
				+            x_rot, y_rot = point[0], point[1]  # 旋转后的坐标
			
 
				+            
			
 
				+            # 🎯 反向旋转（参考 rotate_image_and_coordinates 的逆操作）
			
 
				+            if angle == 90:
			
 
				+                # 正向: rotated = image.rotate(90, expand=True)
			
 
				+                #      x_rot = y_orig
			
 
				+                #      y_rot = rotated_width - x_orig = orig_height - x_orig
			
 
				+                # 反向: x_orig = rotated_width - y_rot = orig_height - y_rot
			
 
				+                #      y_orig = x_rot
			
 
				+                x_orig = rotated_width - y_rot
			
 
				+                y_orig = x_rot
			
 
				+                
			
 
				+            elif angle == 270:
			
 
				+                # 正向: rotated = image.rotate(-90, expand=True)
			
 
				+                #      x_rot = rotated_width - y_orig = orig_height - y_orig
			
 
				+                #      y_rot = x_orig
			
 
				+                # 反向: y_orig = rotated_width - x_rot = orig_height - x_rot
			
 
				+                #      x_orig = y_rot
			
 
				+                x_orig = y_rot
			
 
				+                y_orig = rotated_width - x_rot
			
 
				+                
			
 
				+            elif angle == 180:
			
 
				+                # 正向: rotated = image.rotate(180)
			
 
				+                #      x_rot = orig_width - x_orig
			
 
				+                #      y_rot = orig_height - y_orig
			
 
				+                # 反向: x_orig = orig_width - x_rot
			
 
				+                #      y_orig = orig_height - y_rot
			
 
				+                x_orig = orig_width - x_rot
			
 
				+                y_orig = orig_height - y_rot
			
 
				+                
			
 
				+            else:
			
 
				+                # 其他角度或0度，不转换
			
 
				+                x_orig = x_rot
			
 
				+                y_orig = y_rot
			
 
				+            
			
 
				+            inverse_poly.append([x_orig, y_orig])
			
 
				+        
			
 
				+        return inverse_poly
			
 
				+    
			
 
				+    @staticmethod
			
 
				     def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
			
 
				-        """将多边形转换为 bbox"""
			
 
				+        """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
			
 
				         xs = [p[0] for p in poly]
			
 
				         ys = [p[1] for p in poly]
			
 
				         return [min(xs), min(ys), max(xs), max(ys)]