Parcourir la source

feat: 添加旋转角度处理和原始图像尺寸获取功能,支持坐标反向旋转

zhch158_admin il y a 1 semaine
Parent
commit
2ec53f5194
1 fichiers modifiés avec 165 ajouts et 4 suppressions
  1. 165 4
      merger/bbox_extractor.py

+ 165 - 4
merger/bbox_extractor.py

@@ -3,6 +3,8 @@ bbox 提取模块
 负责从 PaddleOCR 结果中提取文字框信息
 """
 from typing import List, Dict
+import numpy as np
+from pathlib import Path
 
 
 class BBoxExtractor:
@@ -17,7 +19,7 @@ class BBoxExtractor:
             paddle_data: PaddleOCR 输出的数据
         
         Returns:
-            文字框列表
+            文字框列表(坐标已转换为 angle=0 时的坐标)
         """
         text_boxes = []
         
@@ -28,9 +30,26 @@ class BBoxExtractor:
         rec_texts = ocr_res.get('rec_texts', [])
         rec_polys = ocr_res.get('rec_polys', [])
         rec_scores = ocr_res.get('rec_scores', [])
-
+        
+        # 🎯 获取旋转角度
+        rotation_angle = BBoxExtractor._get_rotation_angle(paddle_data)
+        
+        # 🎯 如果有旋转,需要获取原始图像尺寸
+        orig_image_size = None
+        
+        if rotation_angle != 0:
+            orig_image_size = BBoxExtractor._get_original_image_size(paddle_data)
+            print(f"🔄 检测到旋转角度: {rotation_angle}°")
+            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
+        
         for i, (text, poly, score) in enumerate(zip(rec_texts, rec_polys, rec_scores)):
             if text and text.strip():
+                # 🎯 如果有旋转角度,转换坐标
+                if rotation_angle != 0 and orig_image_size:
+                    poly = BBoxExtractor._inverse_rotate_coordinates(
+                        poly, rotation_angle, orig_image_size
+                    )
+                
                 # 计算 bbox (x_min, y_min, x_max, y_max)
                 bbox = BBoxExtractor._poly_to_bbox(poly)
                 
@@ -42,12 +61,154 @@ class BBoxExtractor:
                     'paddle_bbox_index': i,
                     'used': False
                 })
-
+        
         return text_boxes
     
     @staticmethod
+    def _get_rotation_angle(paddle_data: Dict) -> float:
+        """获取旋转角度"""
+        if 'doc_preprocessor_res' not in paddle_data:
+            return 0.0
+        
+        doc_res = paddle_data['doc_preprocessor_res']
+        if isinstance(doc_res, dict) and 'angle' in doc_res:
+            return float(doc_res['angle'])
+        
+        return 0.0
+    
+    @staticmethod
+    def _get_original_image_size(paddle_data: Dict) -> tuple:
+        """
+        获取原始图像尺寸(从图片文件读取)
+        
+        Args:
+            paddle_data: PaddleOCR 数据
+        
+        Returns:
+            (width, height) 元组
+        """
+        from PIL import Image
+        
+        # 🎯 从 input_path 读取图像
+        input_path = paddle_data.get('input_path')
+        
+        if input_path and Path(input_path).exists():
+            try:
+                with Image.open(input_path) as img:
+                    # 返回原始图像尺寸
+                    return img.size  # (width, height)
+            except Exception as e:
+                print(f"⚠️ 无法读取图像文件 {input_path}: {e}")
+        
+        # 🎯 降级方案:从 layout_det_res 推断
+        if 'layout_det_res' in paddle_data:
+            layout_res = paddle_data['layout_det_res']
+            if 'boxes' in layout_res and layout_res['boxes']:
+                max_x = 0
+                max_y = 0
+                for box in layout_res['boxes']:
+                    coord = box.get('coordinate', [])
+                    if len(coord) >= 4:
+                        max_x = max(max_x, coord[2])
+                        max_y = max(max_y, coord[3])
+                
+                if max_x > 0 and max_y > 0:
+                    return (int(max_x) + 50, int(max_y) + 50)
+        
+        # 🎯 最后降级:从 overall_ocr_res 推断
+        if 'overall_ocr_res' in paddle_data:
+            ocr_res = paddle_data['overall_ocr_res']
+            rec_polys = ocr_res.get('rec_polys', [])
+            if rec_polys:
+                max_x = 0
+                max_y = 0
+                for poly in rec_polys:
+                    for point in poly:
+                        max_x = max(max_x, point[0])
+                        max_y = max(max_y, point[1])
+                
+                if max_x > 0 and max_y > 0:
+                    return (int(max_x) + 50, int(max_y) + 50)
+        
+        # 🎯 默认 A4 尺寸
+        print("⚠️ 无法确定原始图像尺寸,使用默认值")
+        return (2480, 3508)
+    
+    @staticmethod
+    def _inverse_rotate_coordinates(poly: List[List[float]], 
+                                    angle: float,
+                                    orig_image_size: tuple) -> List[List[float]]:
+        """
+        反向旋转坐标
+        
+        参考 ocr_validator_utils.rotate_image_and_coordinates 的逆操作
+        
+        PaddleOCR 在旋转后的图像上识别,坐标是旋转后的
+        我们需要将坐标转换回原始图像(未旋转)
+        
+        Args:
+            poly: 旋转后图像上的多边形坐标 [[x',y'], ...]
+            angle: 旋转角度(度数,PaddleX 使用的角度)
+            orig_image_size: 原始图像尺寸 (width, height)
+        
+        Returns:
+            原始图像上的多边形坐标 [[x,y], ...]
+        """
+        orig_width, orig_height = orig_image_size
+        
+        # 🎯 根据旋转角度计算旋转后的图像尺寸
+        if angle == 90:
+            rotated_width, rotated_height = orig_height, orig_width
+        elif angle == 270:
+            rotated_width, rotated_height = orig_height, orig_width
+        else:
+            rotated_width, rotated_height = orig_width, orig_height
+        
+        inverse_poly = []
+        
+        for point in poly:
+            x_rot, y_rot = point[0], point[1]  # 旋转后的坐标
+            
+            # 🎯 反向旋转(参考 rotate_image_and_coordinates 的逆操作)
+            if angle == 90:
+                # 正向: rotated = image.rotate(90, expand=True)
+                #      x_rot = y_orig
+                #      y_rot = rotated_width - x_orig = orig_height - x_orig
+                # 反向: x_orig = rotated_width - y_rot = orig_height - y_rot
+                #      y_orig = x_rot
+                x_orig = rotated_width - y_rot
+                y_orig = x_rot
+                
+            elif angle == 270:
+                # 正向: rotated = image.rotate(-90, expand=True)
+                #      x_rot = rotated_width - y_orig = orig_height - y_orig
+                #      y_rot = x_orig
+                # 反向: y_orig = rotated_width - x_rot = orig_height - x_rot
+                #      x_orig = y_rot
+                x_orig = y_rot
+                y_orig = rotated_width - x_rot
+                
+            elif angle == 180:
+                # 正向: rotated = image.rotate(180)
+                #      x_rot = orig_width - x_orig
+                #      y_rot = orig_height - y_orig
+                # 反向: x_orig = orig_width - x_rot
+                #      y_orig = orig_height - y_rot
+                x_orig = orig_width - x_rot
+                y_orig = orig_height - y_rot
+                
+            else:
+                # 其他角度或0度,不转换
+                x_orig = x_rot
+                y_orig = y_rot
+            
+            inverse_poly.append([x_orig, y_orig])
+        
+        return inverse_poly
+    
+    @staticmethod
     def _poly_to_bbox(poly: List[List[float]]) -> List[float]:
-        """将多边形转换为 bbox"""
+        """将多边形转换为 bbox [x_min, y_min, x_max, y_max]"""
         xs = [p[0] for p in poly]
         ys = [p[1] for p in poly]
         return [min(xs), min(ys), max(xs), max(ys)]