13 цаг өмнө · d04f16fd9c
--- a/merger/bbox_extractor.py
+++ b/merger/bbox_extractor.py
@@ -2,7 +2,7 @@
 
				 bbox 提取模块
			
 
				 负责从 PaddleOCR 结果中提取文字框信息
			
 
				 """
			
 
				-from typing import List, Dict
			
 
				+from typing import List, Dict, Tuple
			
 
				 import numpy as np
			
 
				 from pathlib import Path
			
 
				 
			
@@ -11,7 +11,7 @@ class BBoxExtractor:
 
				     """bbox 提取器"""
			
 
				     
			
 
				     @staticmethod
			
 
				-    def extract_paddle_text_boxes(paddle_data: Dict) -> List[Dict]:
			
 
				+    def extract_paddle_text_boxes(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
			
 
				         """
			
 
				         提取 PaddleOCR 的文字框信息
			
 
				         
			
@@ -19,12 +19,14 @@ class BBoxExtractor:
 
				             paddle_data: PaddleOCR 输出的数据
			
 
				         
			
 
				         Returns:
			
 
				-            文字框列表（坐标已转换为 angle=0 时的坐标）
			
 
				+            文字框列表（保持旋转后的angle角度）和旋转角度
			
 
				         """
			
 
				         text_boxes = []
			
 
				+        rotation_angle = 0.0
			
 
				+        orig_image_size = (0,0)
			
 
				         
			
 
				         if 'overall_ocr_res' not in paddle_data:
			
 
				-            return text_boxes
			
 
				+            return text_boxes, rotation_angle, orig_image_size
			
 
				         
			
 
				         ocr_res = paddle_data['overall_ocr_res']
			
 
				         rec_texts = ocr_res.get('rec_texts', [])
			
@@ -33,9 +35,52 @@ class BBoxExtractor:
 
				         
			
 
				         # 🎯 获取旋转角度
			
 
				         rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
			
 
				+        if rotation_angle != 0:
			
 
				+            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
			
 
				+            print(f"🔄 检测到旋转角度: {rotation_angle}°")
			
 
				+            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
			
 
				+                
			
 
				+        for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
			
 
				+            if text and text.strip():
			
 
				+                # 计算 bbox (x_min, y_min, x_max, y_max)
			
 
				+                bbox = BBoxExtractor._poly_to_bbox(poly)
			
 
				+                
			
 
				+                text_boxes.append({
			
 
				+                    'text': text,
			
 
				+                    'bbox': bbox,
			
 
				+                    'poly': poly,
			
 
				+                    'score': score,
			
 
				+                    'paddle_bbox_index': i,
			
 
				+                    'used': False
			
 
				+                })
			
 
				         
			
 
				-        # 🎯 如果有旋转，需要获取原始图像尺寸
			
 
				-        orig_image_size = None
			
 
				+        return text_boxes, rotation_angle, orig_image_size
			
 
				+    
			
 
				+    @staticmethod
			
 
				+    def extract_paddle_text_boxes_inverse_rotate(paddle_data: Dict) -> Tuple[List[Dict], float, Tuple[int, int]]:
			
 
				+        """
			
 
				+        提取 PaddleOCR 的文字框信息
			
 
				+        
			
 
				+        Args:
			
 
				+            paddle_data: PaddleOCR 输出的数据
			
 
				+        
			
 
				+        Returns:
			
 
				+            文字框列表（坐标已转换为 angle=0 时的坐标）
			
 
				+        """
			
 
				+        text_boxes = []
			
 
				+        rotation_angle = 0.0
			
 
				+        orig_image_size = (0,0)
			
 
				+        
			
 
				+        if 'overall_ocr_res' not in paddle_data:
			
 
				+            return text_boxes, rotation_angle, orig_image_size
			
 
				+        
			
 
				+        ocr_res = paddle_data['overall_ocr_res']
			
 
				+        rec_texts = ocr_res.get('rec_texts', [])
			
 
				+        rec_polys = ocr_res.get('rec_polys', [])
			
 
				+        rec_scores = ocr_res.get('rec_scores', [])
			
 
				+        
			
 
				+        # 🎯 获取旋转角度
			
 
				+        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
			
 
				         
			
 
				         if rotation_angle != 0:
			
 
				             orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
			
@@ -62,7 +107,7 @@ class BBoxExtractor:
 
				                     'used': False
			
 
				                 })
			
 
				         
			
 
				-        return text_boxes
			
 
				+        return text_boxes, rotation_angle, orig_image_size
			
 
				     
			
 
				     @staticmethod
			
 
				     def _get_rotation_angle(paddle_data: Dict) -> float:
			
@@ -135,6 +180,53 @@ class BBoxExtractor:
 
				         return (2480, 3508)
			
 
				     
			
 
				     @staticmethod
			
 
				+    def rotate_box_coordinates(bbox: List[float], 
			
 
				+                             angle: float,
			
 
				+                             orig_image_size: tuple) -> List[float]:
			
 
				+        """
			
 
				+        旋转 bbox 坐标（与图像旋转保持一致）
			
 
				+        
			
 
				+        参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
			
 
				+        
			
 
				+        旋转逻辑：
			
 
				+        - 0°: 不旋转
			
 
				+        - 90°: 逆时针旋转 90°
			
 
				+        - 180°: 旋转 180°
			
 
				+        - 270°: 顺时针旋转 90°（或逆时针 270°）
			
 
				+        
			
 
				+        Args:
			
 
				+            bbox: 原图像上的边界框 [x_min, y_min, x_max, y_max]
			
 
				+            angle: 旋转角度（0, 90, 180, 270）
			
 
				+            orig_image_size: 原始图像尺寸 (width, height)
			
 
				+        """
			
 
				+        poly = BBoxExtractor._bbox_to_poly(bbox)
			
 
				+        rotated_poly = BBoxExtractor._rotate_coordinates(poly, angle, orig_image_size)
			
 
				+        rotated_bbox = BBoxExtractor._poly_to_bbox(rotated_poly)
			
 
				+        return rotated_bbox
			
 
				+
			
 
    @staticmethod
    def inverse_rotate_box_coordinates(bbox: List[float],
                                       angle: float,
                                       orig_image_size: tuple) -> List[float]:
        """
        Map a bbox from the rotated image back onto the un-rotated image.

        The bbox is expanded to its four corner points, the corners are
        passed through ``_inverse_rotate_coordinates``, and the result is
        collapsed back into a bbox with ``_poly_to_bbox``.

        Per the original note, PaddleOCR recognises text on the rotated
        image, so its coordinates live in the rotated frame; this is the
        inverse of ``ocr_validator_utils.rotate_image_and_coordinates``.

        Args:
            bbox: Bounding box on the rotated image,
                ``[x_min, y_min, x_max, y_max]``.
            angle: Rotation angle in degrees (the angle used by PaddleX).
            orig_image_size: Original image size as ``(width, height)``.

        Returns:
            The bbox produced by ``_poly_to_bbox`` for the inverse-rotated
            corners.
        """
        return BBoxExtractor._poly_to_bbox(
            BBoxExtractor._inverse_rotate_coordinates(
                BBoxExtractor._bbox_to_poly(bbox),
                angle,
                orig_image_size,
            )
        )
			
 
				+
			
 
				+    @staticmethod
			
 
				     def _inverse_rotate_coordinates(poly: List[List[float]], 
			
 
				                                     angle: float,
			
 
				                                     orig_image_size: tuple) -> List[List[float]]:
			
@@ -207,6 +299,109 @@ class BBoxExtractor:
 
				         return inverse_poly
			
 
				     
			
 
				     @staticmethod
			
 
				+    def _rotate_coordinates(poly: List[List[float]], 
			
 
				+                        angle: float,
			
 
				+                        orig_image_size: tuple) -> List[List[float]]:
			
 
				+        """
			
 
				+        旋转多边形坐标（与图像旋转保持一致）
			
 
				+        
			
 
				+        参考 ocr_validator_utils.rotate_image_and_coordinates 的操作
			
 
				+        
			
 
				+        旋转逻辑：
			
 
				+        - 0°: 不旋转
			
 
				+        - 90°: 逆时针旋转 90°
			
 
				+        - 180°: 旋转 180°
			
 
				+        - 270°: 顺时针旋转 90°（或逆时针 270°）
			
 
				+        
			
 
				+        Args:
			
 
				+            poly: 原图像上的多边形坐标 [[x', y'], ...]
			
 
				+            angle: 旋转角度（0, 90, 180, 270）
			
 
				+            orig_image_size: 原始图像尺寸 (width, height)
			
 
				+        
			
 
				+        Returns:
			
 
				+            旋转后的多边形坐标 [[x, y], ...]
			
 
				+        
			
 
				+        Example:
			
 
				+            >>> poly = [[100, 200], [150, 200], [150, 250], [100, 250]]
			
 
				+            >>> rotated = rotate_coordinates(poly, 90, (1000, 800))
			
 
				+            >>> print(rotated)
			
 
				+            [[200, 900], [200, 850], [250, 850], [250, 900]]
			
 
				+        """
			
 
				+        if not poly or angle == 0:
			
 
				+            return poly
			
 
				+        
			
 
				+        orig_width, orig_height = orig_image_size
			
 
				+        rotated_poly = []
			
 
				+        
			
 
				+        for point in poly:
			
 
				+            x, y = point[0], point[1]
			
 
				+            
			
 
				+            if angle == 90:
			
 
				+                # 逆时针旋转 90°
			
 
				+                # 新坐标系: 宽度=原高度, 高度=原宽度
			
 
				+                # x_new = y_old
			
 
				+                # y_new = 原宽度 - x_old
			
 
				+                new_x = y
			
 
				+                new_y = orig_width - x
			
 
				+                
			
 
				+            elif angle == 180:
			
 
				+                # 旋转 180°
			
 
				+                # 新坐标系: 宽度=原宽度, 高度=原高度
			
 
				+                # x_new = 原宽度 - x_old
			
 
				+                # y_new = 原高度 - y_old
			
 
				+                new_x = orig_width - x
			
 
				+                new_y = orig_height - y
			
 
				+                
			
 
				+            elif angle == 270:
			
 
				+                # 顺时针旋转 90°（或逆时针 270°）
			
 
				+                # 新坐标系: 宽度=原高度, 高度=原宽度
			
 
				+                # x_new = 原高度 - y_old
			
 
				+                # y_new = x_old
			
 
				+                new_x = orig_height - y
			
 
				+                new_y = x
			
 
				+                
			
 
				+            else:
			
 
				+                # 不支持的角度，保持原坐标
			
 
				+                new_x, new_y = x, y
			
 
				+            
			
 
				+            rotated_poly.append([new_x, new_y])
			
 
				+        
			
 
				+        return rotated_poly
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _bbox_to_poly(bbox: List[float]) -> List[List[float]]:
			
 
				+        """
			
 
				+        将 bbox 转换为多边形（4个角点，逆时针顺序）
			
 
				+        
			
 
				+        Args:
			
 
				+            bbox: 边界框 [x_min, y_min, x_max, y_max]
			
 
				+        
			
 
				+        Returns:
			
 
				+            多边形坐标 [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
			
 
				+            顺序：左上 -> 右上 -> 右下 -> 左下（逆时针）
			
 
				+        
			
 
				+        Example:
			
 
				+            >>> bbox = [100, 200, 150, 250]
			
 
				+            >>> poly = BBoxExtractor._bbox_to_poly(bbox)
			
 
				+            >>> print(poly)
			
 
				+            [[100, 200], [150, 200], [150, 250], [100, 250]]
			
 
				+        """
			
 
				+        if not bbox or len(bbox) < 4:
			
 
				+            return []
			
 
				+        
			
 
				+        x_min, y_min, x_max, y_max = bbox[:4]
			
 
				+        
			
 
				+        # 🎯 4个角点（逆时针顺序）
			
 
				+        poly = [
			
 
				+            [x_min, y_min],  # 左上角
			
 
				+            [x_max, y_min],  # 右上角
			
 
				+            [x_max, y_max],  # 右下角
			
 
				+            [x_min, y_max]   # 左下角
			
 
				+        ]
			
 
				+        
			
 
				+        return poly
			
 
				+
			
 
				+    @staticmethod
			
 
				     def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
			
 
				         """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
			
 
				         xs = [p[0] for p in poly]