2 weeks ago · 7657b6dfa4
--- a/zhch/unified_pytorch_models/orientation_classifier_v2.py
+++ b/zhch/unified_pytorch_models/orientation_classifier_v2.py
@@ -0,0 +1,257 @@
 
															+"""
														
 
															+增强版文档方向分类模块 - 独立版本
														
 
															+无需依赖 PaddleX 内部结构
														
 
															+"""
														
 
															+import cv2
														
 
															+import numpy as np
														
 
															+import onnxruntime as ort
														
 
															+from typing import Dict, Tuple, Optional
														
 
															+from pathlib import Path
														
 
															+from dataclasses import dataclass
														
 
															+
														
 
															+
														
 
															+@dataclass
														
 
															+class OrientationResult:
														
 
															+    """方向分类结果"""
														
 
															+    rotation_angle: str = "0"  # "0", "90", "180", "270"
														
 
															+    confidence: float = 1.0
														
 
															+    needs_rotation: bool = False
														
 
															+    vertical_text_count: int = 0
														
 
															+    aspect_ratio: float = 1.0
														
 
															+    
														
 
															+    def __str__(self):
														
 
															+        return (
														
 
															+            f"OrientationResult(\n"
														
 
															+            f"  angle={self.rotation_angle}°, "
														
 
															+            f"  confidence={self.confidence:.3f}, "
														
 
															+            f"  needs_rotation={self.needs_rotation}, "
														
 
															+            f"  vertical_texts={self.vertical_text_count}, "
														
 
															+            f"  aspect_ratio={self.aspect_ratio:.2f}\n"
														
 
															+            f")"
														
 
															+        )
														
 
															+
														
 
															+
														
 
															+class OrientationClassifierV2:
														
 
															+    """
														
 
															+    增强版方向分类器
														
 
															+    参考 MinerU 的两阶段检测策略
														
 
															+    """
														
 
															+    
														
 
															+    def __init__(
														
 
															+        self, 
														
 
															+        model_path: str,
														
 
															+        text_detector=None,  # 可选的 OCR 检测器
														
 
															+        aspect_ratio_threshold: float = 1.2,
														
 
															+        vertical_text_ratio: float = 0.28,
														
 
															+        vertical_text_min_count: int = 3,
														
 
															+        use_gpu: bool = False
														
 
															+    ):
														
 
															+        """
														
 
															+        Args:
														
 
															+            model_path: ONNX 模型路径
														
 
															+            text_detector: 文本检测器(可选,用于辅助判断)
														
 
															+            aspect_ratio_threshold: 长宽比阈值
														
 
															+            vertical_text_ratio: 垂直文本框占比阈值
														
 
															+            vertical_text_min_count: 最小垂直文本框数量
														
 
															+            use_gpu: 是否使用GPU
														
 
															+        """
														
 
															+        # 初始化 ONNX Runtime
														
 
															+        providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
														
 
															+        self.session = ort.InferenceSession(model_path, providers=providers)
														
 
															+        
														
 
															+        self.text_detector = text_detector
														
 
															+        self.aspect_ratio_threshold = aspect_ratio_threshold
														
 
															+        self.vertical_text_ratio = vertical_text_ratio
														
 
															+        self.vertical_text_min_count = vertical_text_min_count
														
 
															+        
														
 
															+        # 预计算标准化系数 (ImageNet 标准)
														
 
															+        self.mean = np.array([0.485, 0.456, 0.406])
														
 
															+        self.std = np.array([0.229, 0.224, 0.225])
														
 
															+        self.scale = 1.0 / 255.0
														
 
															+        
														
 
															+        self.target_size = 256  # 缩放后的最短边
														
 
															+        self.crop_size = (224, 224)  # 裁剪尺寸
														
 
															+        self.labels = ["0", "90", "180", "270"]
														
 
															+        
														
 
															+        print(f"✅ Orientation classifier initialized")
														
 
															+        print(f"   Model: {Path(model_path).name}")
														
 
															+        print(f"   Aspect ratio threshold: {aspect_ratio_threshold}")
														
 
															+        print(f"   Vertical text ratio: {vertical_text_ratio}")
														
 
															+    
														
 
															+    def _needs_rotation_check(self, img: np.ndarray) -> Tuple[bool, float]:
														
 
															+        """检查图像是否需要进行旋转检测"""
														
 
															+        h, w = img.shape[:2]
														
 
															+        aspect_ratio = h / w if w > 0 else 1.0
														
 
															+        needs_check = aspect_ratio > self.aspect_ratio_threshold
														
 
															+        return needs_check, aspect_ratio
														
 
															+    
														
 
															+    def _detect_vertical_text(self, img: np.ndarray) -> Tuple[bool, int]:
														
 
															+        """
														
 
															+        使用文本检测判断是否存在大量垂直文本
														
 
															+        
														
 
															+        Returns:
														
 
															+            (is_rotated, vertical_count): 是否旋转, 垂直文本框数量
														
 
															+        """
														
 
															+        if self.text_detector is None:
														
 
															+            return False, 0
														
 
															+        
														
 
															+        try:
														
 
															+            # ✅ 修改：适配 MinerUOCRAdapter 的返回格式
														
 
															+            # 返回格式: [[[box], (text, conf)], ...] 或 [[boxes], ...]
														
 
															+            det_results = self.text_detector.ocr(img, det=True, rec=False)
														
 
															+            
														
 
															+            if not det_results or not det_results[0]:
														
 
															+                return False, 0
														
 
															+            
														
 
															+            boxes = det_results[0]
														
 
															+            
														
 
															+            # ✅ 处理两种格式
														
 
															+            vertical_count = 0
														
 
															+            for item in boxes:
														
 
															+                # 格式1: [box] (仅检测)
														
 
															+                # 格式2: [[box], (text, conf)] (检测+识别)
														
 
															+                if isinstance(item, list) and len(item) > 0:
														
 
															+                    if isinstance(item[0], list):
														
 
															+                        # 格式2: [[box], ...]
														
 
															+                        box = np.array(item[0])
														
 
															+                    else:
														
 
															+                        # 格式1: [box]
														
 
															+                        box = np.array(item)
														
 
															+                else:
														
 
															+                    continue
														
 
															+                
														
 
															+                # 计算文本框的宽高
														
 
															+                if len(box) >= 4:
														
 
															+                    points = box
														
 
															+                    width = np.linalg.norm(points[1] - points[0])
														
 
															+                    height = np.linalg.norm(points[2] - points[1])
														
 
															+                    
														
 
															+                    aspect_ratio = width / height if height > 0 else 1.0
														
 
															+                    
														
 
															+                    # 统计垂直文本框 (高 > 宽)
														
 
															+                    if aspect_ratio < 0.8:
														
 
															+                        vertical_count += 1
														
 
															+            
														
 
															+            # 判断是否需要旋转
														
 
															+            total_boxes = len(boxes)
														
 
															+            is_rotated = (
														
 
															+                vertical_count >= total_boxes * self.vertical_text_ratio 
														
 
															+                and vertical_count >= self.vertical_text_min_count
														
 
															+            )
														
 
															+            
														
 
															+            return is_rotated, vertical_count
														
 
															+            
														
 
															+        except Exception as e:
														
 
															+            print(f"⚠️  Text detection failed: {e}")
														
 
															+            import traceback
														
 
															+            traceback.print_exc()
														
 
															+            return False, 0
														
 
															+    
														
 
															+    def _preprocess(self, img: np.ndarray) -> np.ndarray:
														
 
															+        """
														
 
															+        图像预处理
														
 
															+        1. 缩放最短边到 256
														
 
															+        2. 中心裁剪到 224×224
														
 
															+        3. 标准化
														
 
															+        """
														
 
															+        h, w = img.shape[:2]
														
 
															+        
														
 
															+        # 1. 缩放
														
 
															+        scale = self.target_size / min(h, w)
														
 
															+        new_h = round(h * scale)
														
 
															+        new_w = round(w * scale)
														
 
															+        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
														
 
															+        
														
 
															+        # 2. 中心裁剪
														
 
															+        h, w = img.shape[:2]
														
 
															+        cw, ch = self.crop_size
														
 
															+        x1 = max(0, (w - cw) // 2)
														
 
															+        y1 = max(0, (h - ch) // 2)
														
 
															+        x2 = min(w, x1 + cw)
														
 
															+        y2 = min(h, y1 + ch)
														
 
															+        
														
 
															+        if w < cw or h < ch:
														
 
															+            # Padding instead of error
														
 
															+            padded = np.ones((ch, cw, 3), dtype=np.uint8) * 114
														
 
															+            paste_h = min(h, ch)
														
 
															+            paste_w = min(w, cw)
														
 
															+            padded[:paste_h, :paste_w] = img[:paste_h, :paste_w]
														
 
															+            img = padded
														
 
															+        else:
														
 
															+            img = img[y1:y2, x1:x2]
														
 
															+        
														
 
															+        # 3. 标准化 (转 RGB + ImageNet 标准化)
														
 
															+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
														
 
															+        img = img.astype(np.float32) * self.scale  # [0, 1]
														
 
															+        
														
 
															+        # 分通道标准化
														
 
															+        for c in range(3):
														
 
															+            img[:, :, c] = (img[:, :, c] - self.mean[c]) / self.std[c]
														
 
															+        
														
 
															+        # 4. 转换为 NCHW 格式
														
 
															+        img = img.transpose((2, 0, 1))
														
 
															+        img = np.expand_dims(img, axis=0)
														
 
															+        
														
 
															+        return img.astype(np.float32)
														
 
															+    
														
 
															+    def predict(self, img: np.ndarray, return_debug: bool = False) -> OrientationResult:
														
 
															+        """
														
 
															+        预测图像方向
														
 
															+        
														
 
															+        Args:
														
 
															+            img: BGR 格式的输入图像
														
 
															+            return_debug: 是否返回调试信息
														
 
															+            
														
 
															+        Returns:
														
 
															+            OrientationResult 对象
														
 
															+        """
														
 
															+        result = OrientationResult()
														
 
															+        
														
 
															+        # 1. 检查长宽比
														
 
															+        needs_check, aspect_ratio = self._needs_rotation_check(img)
														
 
															+        result.aspect_ratio = aspect_ratio
														
 
															+        
														
 
															+        if not needs_check:
														
 
															+            if return_debug:
														
 
															+                print(f"   ⏭️  Skipped (aspect_ratio={aspect_ratio:.2f} <= {self.aspect_ratio_threshold})")
														
 
															+            return result
														
 
															+        
														
 
															+        # 2. 使用文本检测判断是否旋转
														
 
															+        is_rotated, vertical_count = self._detect_vertical_text(img)
														
 
															+        result.vertical_text_count = vertical_count
														
 
															+        
														
 
															+        if not is_rotated:
														
 
															+            if return_debug:
														
 
															+                print(f"   ⏭️  No rotation needed (vertical_texts={vertical_count})")
														
 
															+            return result
														
 
															+        
														
 
															+        # 3. 使用分类模型预测旋转角度
														
 
															+        input_tensor = self._preprocess(img)
														
 
															+        
														
 
															+        # ONNX 推理
														
 
															+        input_name = self.session.get_inputs()[0].name
														
 
															+        output_name = self.session.get_outputs()[0].name
														
 
															+        outputs = self.session.run([output_name], {input_name: input_tensor})
														
 
															+        
														
 
															+        probabilities = outputs[0][0]  # [4,]
														
 
															+        
														
 
															+        predicted_idx = np.argmax(probabilities)
														
 
															+        result.rotation_angle = self.labels[predicted_idx]
														
 
															+        result.confidence = float(probabilities[predicted_idx])
														
 
															+        result.needs_rotation = result.rotation_angle != '0'
														
 
															+        
														
 
															+        if return_debug:
														
 
															+            print(f"   🎯 Predicted angle: {result.rotation_angle}° (conf={result.confidence:.3f})")
														
 
															+        
														
 
															+        return result
														
 
															+    
														
 
															+    def rotate_image(self, img: np.ndarray, angle: str) -> np.ndarray:
														
 
															+        """根据预测角度旋转图像"""
														
 
															+        if angle == "90":
														
 
															+            return cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
														
 
															+        elif angle == "180":
														
 
															+            return cv2.rotate(img, cv2.ROTATE_180)
														
 
															+        elif angle == "270":
														
 
															+            return cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
														
 
															+        else:
														
 
															+            return img