
feat: add a document preprocessing adapter, integrating the MinerU algorithm to enhance orientation classification

zhch158_admin, 1 week ago
Parent
Current commit
576b5a5773
2 files changed, with 503 insertions and 210 deletions
  1. +503 −0
      zhch/adapters/doc_preprocessor_adapter.py
  2. +0 −210
      zhch/adapters/enhanced_doc_orientation.py
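
For context, the MinerU-style decision this commit adopts boils down to two cheap checks that run before the orientation model is ever invoked. Below is a minimal standalone sketch of that decision; the function name is illustrative and only the thresholds mirror the constants in the new adapter, so treat it as an outline rather than code from the commit.

    import numpy as np

    def needs_orientation_classify(
        image: np.ndarray,
        boxes,
        portrait_threshold: float = 1.2,
        vertical_ratio_threshold: float = 0.28,
        min_vertical_count: int = 3,
    ) -> bool:
        """Decide whether the (expensive) orientation classifier should run at all."""
        h, w = image.shape[:2]
        # Stage 1: only portrait pages (height / width above the threshold) are candidates
        if w == 0 or h / w <= portrait_threshold:
            return False
        # Stage 2: require enough "vertical" text boxes among the OCR detections
        if boxes is None or len(boxes) == 0:
            return False
        vertical = 0
        for box in np.asarray(boxes, dtype=float):   # expected shape: (N, 4, 2)
            p1, p2, p3 = box[0], box[1], box[2]
            width = abs(p2[0] - p1[0])
            height = abs(p3[1] - p2[1])
            if height > 0 and width / height < 0.8:  # narrow-and-tall box -> vertical text
                vertical += 1
        ratio = vertical / len(boxes)
        return ratio >= vertical_ratio_threshold and vertical >= min_vertical_count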

+ 503 - 0
zhch/adapters/doc_preprocessor_adapter.py

@@ -0,0 +1,503 @@
+"""
+Document preprocessing adapter.
+Uses MinerU's orientation-decision algorithm while keeping PaddleX's models.
+"""
+
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+import cv2
+
+from paddlex.inference.pipelines.doc_preprocessor.result import DocPreprocessorResult
+from paddlex.inference.common.reader import ReadImage
+from paddlex.inference.common.batch_sampler import ImageBatchSampler
+from paddlex.inference.pipelines.components import rotate_image
+
+
+class EnhancedDocPreprocessor:
+    """
+    Enhanced document preprocessor.
+    Core idea: adopt MinerU's staged orientation-decision algorithm:
+    1. Fast filter: aspect-ratio check (only portrait images need orientation classification)
+    2. OCR guidance: detect text boxes and check whether many of them are vertical
+    3. Precise classification: run the classification model only on images that look rotated
+    """
+    
+    def __init__(
+        self,
+        doc_ori_classify_model,
+        doc_unwarping_model,
+        ocr_det_model=None,  # 🎯 OCR detection model (optional)
+        device: str = "cpu",
+        use_doc_orientation_classify: bool = True,
+        use_doc_unwarping: bool = False,
+        batch_size: int = 1,
+    ):
+        """
+        Args:
+            doc_ori_classify_model: PaddleX orientation classification model
+            doc_unwarping_model: PaddleX document unwarping model
+            ocr_det_model: OCR text detection model (used to decide whether rotation is needed; optional)
+            device: device type (cpu/gpu)
+            use_doc_orientation_classify: whether to run orientation classification
+            use_doc_unwarping: whether to run document unwarping
+            batch_size: batch size
+        """
+        self.doc_ori_classify_model = doc_ori_classify_model
+        self.doc_unwarping_model = doc_unwarping_model
+        self.device = device
+        self.use_doc_orientation_classify = use_doc_orientation_classify
+        self.use_doc_unwarping = use_doc_unwarping
+        self.batch_size = batch_size
+        
+        self.img_reader = ReadImage(format="BGR")
+        self.batch_sampler = ImageBatchSampler(batch_size=batch_size)
+        
+        # 🎯 MinerU algorithm parameters
+        self.portrait_threshold = 1.2  # aspect-ratio threshold (height / width)
+        self.vertical_ratio_threshold = 0.28  # vertical text-box ratio threshold
+        self.min_vertical_count = 3  # minimum number of vertical text boxes
+        
+        # 🎯 Initialize the OCR detection model (only once)
+        self.ocr_det_model = ocr_det_model
+        if self.ocr_det_model is None:
+            self._initialize_ocr_det_model()
+        
+        print(f"📐 Enhanced DocPreprocessor initialized")
+        print(f"   - Device: {self.device}")
+        print(f"   - Portrait threshold: {self.portrait_threshold}")
+        print(f"   - Vertical ratio threshold: {self.vertical_ratio_threshold}")
+        print(f"   - Min vertical count: {self.min_vertical_count}")
+        print(f"   - OCR detection model: {'✅ Available' if self.ocr_det_model else '❌ Not available'}")
+    
+    def _initialize_ocr_det_model(self):
+        """Initialize the OCR detection model (runs only once)."""
+        try:
+            from paddlex import create_model
+            
+            print("🔧 Initializing OCR detection model...")
+            self.ocr_det_model = create_model(
+                'PP-OCRv5_server_det',
+                device=self.device
+            )
+            print("✅ OCR detection model initialized successfully")
+            
+        except Exception as e:
+            print(f"⚠️  Failed to initialize OCR detection model: {e}")
+            print("   Will skip OCR-guided filtering")
+            self.ocr_det_model = None
+    
+    def _is_portrait_image(self, image: np.ndarray) -> bool:
+        """Check whether the image is portrait-oriented."""
+        img_height, img_width = image.shape[:2]
+        aspect_ratio = img_height / img_width if img_width > 0 else 1.0
+        is_portrait = aspect_ratio > self.portrait_threshold
+        print(f"   📏 Image size: {img_width}x{img_height}, aspect_ratio: {aspect_ratio:.2f}, is_portrait: {is_portrait}")
+        return is_portrait
+    
+    def _detect_vertical_text_boxes(self, image: np.ndarray) -> tuple[int, int]:
+        """
+        Detect vertical text boxes in the image.
+        
+        Returns:
+            (vertical_count, total_count): number of vertical text boxes and total number of boxes
+        """
+        if self.ocr_det_model is None:
+            print("   ⚠️  OCR detection model not available")
+            return 0, 0
+        
+        try:
+            # 🎯 Run the OCR detection model
+            det_results = list(self.ocr_det_model([image]))
+            if not det_results or len(det_results) == 0:
+                print("   ℹ️  No OCR detection results")
+                return 0, 0
+            
+            det_result = det_results[0]
+            
+            # 🎯 Extract text boxes from the detection result
+            # PaddleX detection result format: {"dt_polys": [...], ...}
+            boxes = None
+            if isinstance(det_result, dict):
+                boxes = det_result.get('dt_polys', None)
+            elif isinstance(det_result, np.ndarray):
+                boxes = det_result
+            
+            if boxes is None or len(boxes) == 0:
+                print("   ℹ️  No text boxes detected")
+                return 0, 0
+            
+            # 🎯 Count vertical text boxes
+            vertical_count = 0
+            total_count = len(boxes)
+            
+            # 🎯 Handle numpy array format: shape=(N, 4, 2)
+            if isinstance(boxes, np.ndarray):
+                if len(boxes.shape) == 3 and boxes.shape[1] == 4 and boxes.shape[2] == 2:
+                    # Format: (N, 4, 2) - each box has 4 points, each point is an (x, y) coordinate
+                    for box in boxes:
+                        # box: shape=(4, 2) - [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+                        p1, p2, p3, p4 = box
+                        
+                        # Compute width and height
+                        width = abs(float(p2[0] - p1[0]))  # x2 - x1
+                        height = abs(float(p3[1] - p2[1]))  # y3 - y2
+                        
+                        if height == 0:
+                            continue
+                        
+                        aspect_ratio = width / height
+                        
+                        # 🎯 MinerU's criterion: aspect ratio < 0.8 means vertical text
+                        if aspect_ratio < 0.8:
+                            vertical_count += 1
+                else:
+                    # Other array layouts: treat each entry as 4 corner points (same vertical-box rule as above)
+                    for box in boxes:
+                        if isinstance(box, np.ndarray) and len(box) >= 4:
+                            p1, p2, p3 = box[0], box[1], box[2]
+                            width = abs(float(p2[0] - p1[0]))
+                            height = abs(float(p3[1] - p2[1]))
+                            if height > 0 and width / height < 0.8:
+                                vertical_count += 1
+            else:
+                # Handle list format
+                for box in boxes:
+                    if isinstance(box, (list, tuple, np.ndarray)):
+                        if len(box) >= 4:
+                            # Format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+                            if isinstance(box[0], (list, tuple, np.ndarray)) and len(box[0]) >= 2:
+                                p1, p2, p3, p4 = box[:4]
+                                width = abs(float(p2[0]) - float(p1[0]))
+                                height = abs(float(p3[1]) - float(p2[1]))
+                            # Format: [x1,y1,x2,y2,x3,y3,x4,y4]
+                            elif len(box) >= 8:
+                                width = abs(float(box[2]) - float(box[0]))
+                                height = abs(float(box[5]) - float(box[3]))
+                            else:
+                                continue
+                            
+                            if height == 0:
+                                continue
+                            
+                            aspect_ratio = width / height
+                            
+                            # 🎯 MinerU's criterion: aspect ratio < 0.8 means vertical text
+                            if aspect_ratio < 0.8:
+                                vertical_count += 1
+            
+            print(f"   📊 OCR detection: {vertical_count}/{total_count} vertical boxes ({vertical_count/total_count:.1%} vertical)")
+            return vertical_count, total_count
+            
+        except Exception as e:
+            print(f"   ⚠️  OCR detection failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return 0, 0
+    
+    def _should_classify_orientation(self, image: np.ndarray) -> bool:
+        """
+        Decide whether orientation classification is needed.
+        Follows MinerU's two-stage decision logic.
+        
+        Returns:
+            True: classification is needed
+            False: skip classification (use the original image as-is)
+        """
+        print("🔍 Checking if orientation classification is needed...")
+        
+        # 🎯 Stage 1: fast filter - aspect-ratio check
+        if not self._is_portrait_image(image):
+            print("   ⏭️  Skipped: Image is landscape")
+            return False
+        
+        # 🎯 Stage 2: OCR-guided check - detect vertical text boxes
+        vertical_count, total_count = self._detect_vertical_text_boxes(image)
+        
+        if total_count == 0:
+            print("   ⏭️  Skipped: No text detected")
+            return False
+        
+        # 🎯 MinerU's criterion:
+        # treat the page as possibly rotated only if the vertical-box ratio >= 28% and the count >= 3
+        vertical_ratio = vertical_count / total_count
+        is_rotated = (
+            vertical_ratio >= self.vertical_ratio_threshold and 
+            vertical_count >= self.min_vertical_count
+        )
+        
+        print(f"   📈 Vertical ratio: {vertical_ratio:.1%} (threshold: {self.vertical_ratio_threshold:.1%})")
+        print(f"   📊 Vertical count: {vertical_count} (min: {self.min_vertical_count})")
+        print(f"   🎯 Need classification: {is_rotated}")
+        
+        return is_rotated
+    
+    def _predict_orientation(self, image: np.ndarray) -> int:
+        """
+        Predict the image orientation.
+        
+        Args:
+            image: image in BGR format
+            
+        Returns:
+            Rotation angle (0, 90, 180, or 270)
+        """
+        if not self.use_doc_orientation_classify or self.doc_ori_classify_model is None:
+            return 0
+        
+        try:
+            # Run PaddleX's classification model
+            preds = list(self.doc_ori_classify_model([image]))
+            if preds and len(preds) > 0:
+                pred = preds[0]
+                angle = int(pred["label_names"][0])
+                print(f"   🔄 Orientation classification result: {angle}°")
+                return angle
+            return 0
+        except Exception as e:
+            print(f"   ⚠️  Orientation prediction failed: {e}")
+            return 0
+    
+    def predict(
+        self,
+        input: Union[str, List[str], np.ndarray, List[np.ndarray]],
+        use_doc_orientation_classify: Optional[bool] = None,
+        use_doc_unwarping: Optional[bool] = None,
+    ):
+        """
+        Run document preprocessing and yield results.
+        
+        Args:
+            input: input image path, numpy array, or a list of either
+            use_doc_orientation_classify: whether to run orientation classification
+            use_doc_unwarping: whether to run document unwarping
+            
+        Yields:
+            DocPreprocessorResult: preprocessing result
+        """
+        # Resolve model settings
+        if use_doc_orientation_classify is None:
+            use_doc_orientation_classify = self.use_doc_orientation_classify
+        if use_doc_unwarping is None:
+            use_doc_unwarping = self.use_doc_unwarping
+        
+        model_settings = {
+            "use_doc_orientation_classify": use_doc_orientation_classify,
+            "use_doc_unwarping": use_doc_unwarping,
+        }
+        
+        print(f"\n{'='*60}")
+        print(f"🎯 Enhanced DocPreprocessor - MinerU Algorithm")
+        print(f"   Settings: orientation={use_doc_orientation_classify}, unwarping={use_doc_unwarping}")
+        print(f"{'='*60}\n")
+        
+        # Batched processing
+        for batch_data in self.batch_sampler(input):
+            # Read the images
+            image_arrays = self.img_reader(batch_data.instances)
+            
+            # 🎯 Enhanced orientation classification and rotation logic
+            angles = []
+            rot_imgs = []
+            
+            for idx, img in enumerate(image_arrays):
+                print(f"\n📄 Processing image {idx + 1}/{len(image_arrays)}")
+                
+                if use_doc_orientation_classify:
+                    # 🎯 Key improvement: first decide whether classification is needed
+                    if self._should_classify_orientation(img):
+                        # Classification needed: run the model to predict the angle
+                        angle = self._predict_orientation(img)
+                    else:
+                        # Skip classification: use 0 degrees directly
+                        angle = 0
+                        print("   ⏭️  Skipped orientation classification")
+                    
+                    angles.append(angle)
+                    if angle != 0:
+                        rot_img = rotate_image(img, angle)
+                    else:
+                        rot_img = img
+                    rot_imgs.append(rot_img)
+                else:
+                    angles.append(-1)  # -1 means orientation classification was not performed
+                    rot_imgs.append(img)
+            
+            # Document unwarping
+            if use_doc_unwarping and self.doc_unwarping_model is not None:
+                output_imgs = [
+                    item["doctr_img"][:, :, ::-1]
+                    for item in self.doc_unwarping_model(rot_imgs)
+                ]
+            else:
+                output_imgs = rot_imgs
+            
+            # Yield the results
+            for input_path, page_index, image_array, angle, rot_img, output_img in zip(
+                batch_data.input_paths,
+                batch_data.page_indexes,
+                image_arrays,
+                angles,
+                rot_imgs,
+                output_imgs,
+            ):
+                single_img_res = {
+                    "input_path": input_path,
+                    "page_index": page_index,
+                    "input_img": image_array,
+                    "model_settings": model_settings,
+                    "angle": angle,
+                    "rot_img": rot_img,
+                    "output_img": output_img,
+                }
+                yield DocPreprocessorResult(single_img_res)
+    
+    def __call__(self, *args, **kwargs):
+        """Allow the instance to be called like a function."""
+        return self.predict(*args, **kwargs)
+
+
+class DocPreprocessorAdapter:
+    """
+    Document preprocessing adapter.
+    Replaces the predict method of _DocPreprocessorPipeline.
+    """
+    
+    _original_predict = None
+    _shared_ocr_det_model = None  # 🎯 Shared OCR detection model
+    _enhanced_preprocessor_cache = {}  # 🎯 Cache of enhanced_preprocessor instances
+    
+    @classmethod
+    def _get_cache_key(cls, device: str, use_doc_orientation_classify: bool, 
+                       use_doc_unwarping: bool, batch_size: int) -> str:
+        """Build the cache key."""
+        return f"{device}_{use_doc_orientation_classify}_{use_doc_unwarping}_{batch_size}"
+    
+    @classmethod
+    def apply(cls, use_enhanced: bool = True):
+        """
+        Apply the adapter.
+        
+        Args:
+            use_enhanced: whether to use the enhanced preprocessor
+        """
+        if not use_enhanced:
+            cls.restore()
+            return False
+        
+        try:
+            from paddlex.inference.pipelines.doc_preprocessor import pipeline
+            
+            # Save the original method
+            if cls._original_predict is None:
+                cls._original_predict = pipeline._DocPreprocessorPipeline.predict
+            
+            # Build the enhanced predict method
+            def enhanced_predict(
+                self,
+                input: Union[str, List[str], np.ndarray, List[np.ndarray]],
+                use_doc_orientation_classify: Optional[bool] = None,
+                use_doc_unwarping: Optional[bool] = None,
+            ):
+                """Enhanced predict method."""
+                
+                # 🎯 Key improvement 1: initialize the shared OCR detection model (only once)
+                if cls._shared_ocr_det_model is None:
+                    print("\n" + "="*80)
+                    print(">>> [Adapter] Enhanced DocPreprocessor - First Time Initialization")
+                    print("="*80)
+                    print("🔧 Initializing shared OCR detection model...")
+                    try:
+                        from paddlex import create_model
+                        cls._shared_ocr_det_model = create_model(
+                            'PP-OCRv5_server_det',
+                            device=self.device
+                        )
+                        print("✅ Shared OCR detection model initialized")
+                    except Exception as e:
+                        print(f"⚠️  Failed to initialize OCR detection model: {e}")
+                        cls._shared_ocr_det_model = None
+                
+                # 🎯 Key improvement 2: reuse the cached enhanced_preprocessor (created only once)
+                cache_key = cls._get_cache_key(
+                    device=self.device,
+                    use_doc_orientation_classify=self.use_doc_orientation_classify,
+                    use_doc_unwarping=self.use_doc_unwarping,
+                    batch_size=self.batch_sampler.batch_size
+                )
+                
+                if cache_key not in cls._enhanced_preprocessor_cache:
+                    print("🔧 Creating new enhanced preprocessor instance...")
+                    enhanced_preprocessor = EnhancedDocPreprocessor(
+                        doc_ori_classify_model=self.doc_ori_classify_model if self.use_doc_orientation_classify else None,
+                        doc_unwarping_model=self.doc_unwarping_model if self.use_doc_unwarping else None,
+                        ocr_det_model=cls._shared_ocr_det_model,  # use the shared model
+                        device=self.device,
+                        use_doc_orientation_classify=self.use_doc_orientation_classify,
+                        use_doc_unwarping=self.use_doc_unwarping,
+                        batch_size=self.batch_sampler.batch_size,
+                    )
+                    cls._enhanced_preprocessor_cache[cache_key] = enhanced_preprocessor
+                    print(f"✅ Enhanced preprocessor cached with key: {cache_key}")
+                else:
+                    enhanced_preprocessor = cls._enhanced_preprocessor_cache[cache_key]
+                    print(f"♻️  Reusing cached enhanced preprocessor: {cache_key}")
+                
+                # Delegate to the enhanced processing logic
+                return enhanced_preprocessor.predict(
+                    input,
+                    use_doc_orientation_classify,
+                    use_doc_unwarping,
+                )
+            
+            # Replace the method
+            pipeline._DocPreprocessorPipeline.predict = enhanced_predict
+            
+            print("✅ DocPreprocessor adapter applied successfully (MinerU algorithm)")
+            return True
+            
+        except Exception as e:
+            print(f"❌ Failed to apply DocPreprocessor adapter: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+    
+    @classmethod
+    def restore(cls):
+        """Restore the original method."""
+        if cls._original_predict is None:
+            return False
+        
+        try:
+            from paddlex.inference.pipelines.doc_preprocessor import pipeline
+            
+            pipeline._DocPreprocessorPipeline.predict = cls._original_predict
+            cls._original_predict = None
+            
+            # 🎯 Clean up shared resources
+            cls._shared_ocr_det_model = None
+            cls._enhanced_preprocessor_cache.clear()
+            
+            print("✅ DocPreprocessor adapter restored")
+            return True
+            
+        except Exception as e:
+            print(f"❌ Failed to restore DocPreprocessor adapter: {e}")
+            return False
+
+
+# 🎯 Convenience functions
+def apply_enhanced_doc_preprocessor():
+    """Apply the enhanced document preprocessor."""
+    return DocPreprocessorAdapter.apply(use_enhanced=True)
+
+
+def restore_paddlex_doc_preprocessor():
+    """Restore PaddleX's original document preprocessor."""
+    return DocPreprocessorAdapter.restore()
+
+
+# Exports
+__all__ = [
+    'EnhancedDocPreprocessor',
+    'DocPreprocessorAdapter',
+    'apply_enhanced_doc_preprocessor',
+    'restore_paddlex_doc_preprocessor',
+]
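
Applying the adapter is intended to be a one-line monkey-patch before a PaddleX pipeline is built. A hedged usage sketch follows; it assumes paddlex.create_pipeline and the "PP-StructureV3" pipeline name are available in the installed PaddleX version, and the sample file name is made up.

    from paddlex import create_pipeline

    from zhch.adapters.doc_preprocessor_adapter import (
        apply_enhanced_doc_preprocessor,
        restore_paddlex_doc_preprocessor,
    )

    # Patch _DocPreprocessorPipeline.predict with the MinerU-style logic
    apply_enhanced_doc_preprocessor()

    # Any pipeline that embeds the doc-preprocessor sub-pipeline now runs the enhanced predict
    pipeline = create_pipeline(pipeline="PP-StructureV3")  # assumed pipeline name, for illustration
    for res in pipeline.predict("sample_page.png", use_doc_orientation_classify=True):
        res.print()

    # Undo the patch and release the shared OCR model and preprocessor cache
    restore_paddlex_doc_preprocessor()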

+ 0 - 210
zhch/adapters/enhanced_doc_orientation.py

@@ -1,210 +0,0 @@
-# zhch/custom_modules/enhanced_doc_orientation.py
-
-"""Enhanced document orientation classifier - combines OCR and CNN"""
-
-import cv2
-import numpy as np
-import onnxruntime
-from paddlex import create_model
-from typing import Union, List
-from PIL import Image
-
-
-class EnhancedDocOrientationClassify:
-    """
-    Enhanced document orientation classifier.
-    
-    Follows MinerU's multi-stage decision logic:
-    1. Fast filter: aspect-ratio check
-    2. OCR analysis: text-box orientation judgement
-    3. CNN classification: precise angle prediction
-    """
-    
-    def __init__(
-        self,
-        cnn_model_name: str = "PP-LCNet_x1_0_doc_ori",
-        ocr_model_name: str = "PP-OCRv5_server_det",
-        aspect_ratio_threshold: float = 1.2,
-        vertical_box_ratio_threshold: float = 0.28,
-        min_vertical_boxes: int = 3,
-        text_box_aspect_ratio: float = 0.8,
-    ):
-        """
-        Args:
-            cnn_model_name: name of the CNN orientation classification model
-            ocr_model_name: name of the OCR text detection model
-            aspect_ratio_threshold: image aspect-ratio threshold (OCR detection only runs above this value)
-            vertical_box_ratio_threshold: threshold on the proportion of vertical text boxes
-            min_vertical_boxes: minimum number of vertical text boxes
-            text_box_aspect_ratio: text-box aspect-ratio threshold (below this value the text counts as vertical)
-        """
-        # 1. Load the CNN orientation classification model
-        self.cnn_model = create_model(cnn_model_name)
-        
-        # 2. Load the OCR text detection model
-        self.ocr_detector = create_model(ocr_model_name)
-        
-        # 3. Parameter settings
-        self.aspect_ratio_threshold = aspect_ratio_threshold
-        self.vertical_box_ratio_threshold = vertical_box_ratio_threshold
-        self.min_vertical_boxes = min_vertical_boxes
-        self.text_box_aspect_ratio = text_box_aspect_ratio
-        
-        # 4. Label mapping
-        self.labels = ["0", "90", "180", "270"]
-    
-    def predict(
-        self, 
-        img: Union[str, np.ndarray, Image.Image],
-        use_ocr_filter: bool = True,
-        batch_size: int = 1,
-    ) -> dict:
-        """
-        Predict the image orientation.
-        
-        Args:
-            img: input image
-            use_ocr_filter: whether to use OCR filtering (if False, use the CNN directly)
-            batch_size: batch size
-            
-        Returns:
-            {
-                "orientation": "0",  # "0", "90", "180", "270"
-                "confidence": 0.95,
-                "method": "ocr_filter" or "cnn",
-                "details": {...}
-            }
-        """
-        # Normalize the input to a numpy array
-        if isinstance(img, str):
-            img = cv2.imread(img)
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-        elif isinstance(img, Image.Image):
-            img = np.array(img)
-        
-        # ============================================
-        # Stage 1: fast filter (aspect-ratio check)
-        # ============================================
-        img_height, img_width = img.shape[:2]
-        img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0
-        
-        if not use_ocr_filter or img_aspect_ratio <= self.aspect_ratio_threshold:
-            # Landscape image: return 0 degrees directly
-            return {
-                "orientation": "0",
-                "confidence": 1.0,
-                "method": "aspect_ratio_filter",
-                "details": {
-                    "img_aspect_ratio": img_aspect_ratio,
-                    "threshold": self.aspect_ratio_threshold,
-                    "reason": "Image is landscape, no rotation needed"
-                }
-            }
-        
-        # ============================================
-        # Stage 2: OCR text-box analysis
-        # ============================================
-        # Convert to BGR (PaddleOCR expects BGR input)
-        bgr_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-        
-        # Detect text boxes with OCR
-        det_result = self.ocr_detector.predict(bgr_img)
-        det_boxes = det_result.get("boxes", [])
-        
-        if not det_boxes:
-            # No text boxes detected: fall back to the CNN
-            return self._cnn_predict(img, reason="No text boxes detected")
-        
-        # Analyze the orientation of the text boxes
-        vertical_count = 0
-        for box in det_boxes:
-            # Extract the text-box coordinates
-            coords = box.get("coordinate", [])
-            if len(coords) < 4:
-                continue
-            
-            # Compute the width and height of the text box
-            # PaddleX returns the [x1, y1, x2, y2] format
-            x1, y1, x2, y2 = coords[:4]
-            width = abs(x2 - x1)
-            height = abs(y2 - y1)
-            
-            aspect_ratio = width / height if height > 0 else 1.0
-            
-            # Check whether this is a vertical text box
-            if aspect_ratio < self.text_box_aspect_ratio:
-                vertical_count += 1
-        
-        # Compute the proportion of vertical text boxes
-        vertical_ratio = vertical_count / len(det_boxes) if det_boxes else 0
-        
-        # Decide whether rotation is needed
-        is_rotated = (
-            vertical_ratio >= self.vertical_box_ratio_threshold
-            and vertical_count >= self.min_vertical_boxes
-        )
-        
-        if not is_rotated:
-            # Text boxes are mostly horizontal, no rotation needed
-            return {
-                "orientation": "0",
-                "confidence": 1.0,
-                "method": "ocr_filter",
-                "details": {
-                    "total_boxes": len(det_boxes),
-                    "vertical_boxes": vertical_count,
-                    "vertical_ratio": vertical_ratio,
-                    "threshold": self.vertical_box_ratio_threshold,
-                    "reason": "Text boxes are mostly horizontal"
-                }
-            }
-        
-        # ============================================
-        # Stage 3: precise CNN classification
-        # ============================================
-        return self._cnn_predict(
-            img,
-            reason=f"OCR detected rotation (vertical_ratio={vertical_ratio:.2f})",
-            ocr_details={
-                "total_boxes": len(det_boxes),
-                "vertical_boxes": vertical_count,
-                "vertical_ratio": vertical_ratio,
-            }
-        )
-    
-    def _cnn_predict(self, img: np.ndarray, reason: str = "", ocr_details: dict = None) -> dict:
-        """Predict the orientation with the CNN model."""
-        # Convert to a PIL Image (as expected by PaddleX)
-        if isinstance(img, np.ndarray):
-            img = Image.fromarray(img)
-        
-        # CNN inference
-        result = self.cnn_model.predict(img)
-        
-        # Extract the result
-        orientation = result.get("label", "0")
-        confidence = result.get("score", 0.0)
-        
-        return {
-            "orientation": orientation,
-            "confidence": confidence,
-            "method": "cnn",
-            "details": {
-                "reason": reason,
-                "ocr_analysis": ocr_details or {},
-                "cnn_scores": result.get("label_names", [])
-            }
-        }
-    
-    def batch_predict(
-        self,
-        imgs: List[Union[str, np.ndarray, Image.Image]],
-        use_ocr_filter: bool = True,
-        batch_size: int = 8,
-    ) -> List[dict]:
-        """Batch prediction."""
-        results = []
-        for img in imgs:
-            result = self.predict(img, use_ocr_filter=use_ocr_filter)
-            results.append(result)
-        return results
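
For comparison, the classifier removed here exposed a standalone predict() that returned a result dict instead of patching a pipeline. Based on its docstring and constructor defaults above, it would have been used roughly as follows (the file name is illustrative).

    from zhch.adapters.enhanced_doc_orientation import EnhancedDocOrientationClassify  # deleted by this commit

    classifier = EnhancedDocOrientationClassify()  # defaults: PP-LCNet_x1_0_doc_ori + PP-OCRv5_server_det
    result = classifier.predict("sample_page.png", use_ocr_filter=True)
    print(result["orientation"], result["confidence"], result["method"])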