소스 검색

feat: Add PaddleX core modules for document processing

- Introduced core modules for PaddleOCR-VL and PP-StructureV3, including a unified processor, utility functions, and adapters for enhanced document processing.
- Implemented configuration files for various pipelines, including layout parsing and table recognition.
- Added README documentation for configuration usage and module descriptions.
zhch158_admin 2 주 전
부모
커밋
b433d1211c

+ 9 - 0
ocr_tools/paddle_common/__init__.py

@@ -0,0 +1,9 @@
+"""
+PaddleX 共享核心模块
+
+提供 PaddleOCR-VL 和 PP-StructureV3 工具共享的核心处理逻辑
+"""
+
+__version__ = "1.0.0"
+__author__ = "zhch158"
+

+ 28 - 0
ocr_tools/paddle_common/adapters/__init__.py

@@ -0,0 +1,28 @@
+"""
+适配器包初始化 - 支持自动激活
+"""
+from .table_recognition_adapter import (
+    apply_table_recognition_adapter,
+    restore_original_function,
+    enhanced_predict_single_table_recognition_res
+)
+
+from .doc_preprocessor_adapter import (
+    apply_enhanced_doc_preprocessor,
+    restore_paddlex_doc_preprocessor,
+    DocPreprocessorAdapter,
+    EnhancedDocPreprocessor,
+)
+
+__all__ = [
+    # 表格识别适配器
+    'apply_table_recognition_adapter',
+    'restore_original_function',
+    'enhanced_predict_single_table_recognition_res',
+
+    # 文档预处理适配器
+    'apply_enhanced_doc_preprocessor',
+    'restore_paddlex_doc_preprocessor',
+    'DocPreprocessorAdapter',
+    'EnhancedDocPreprocessor',
+]

+ 472 - 0
ocr_tools/paddle_common/adapters/doc_preprocessor_adapter.py

@@ -0,0 +1,472 @@
+"""
+文档预处理适配器
+使用 MinerU 的方向判断算法,但保留 PaddleX 的模型
+"""
+
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union, Tuple
+import numpy as np
+import cv2
+
+from paddlex.inference.pipelines.doc_preprocessor.result import DocPreprocessorResult
+from paddlex.inference.common.reader import ReadImage
+from paddlex.inference.common.batch_sampler import ImageBatchSampler
+from paddlex.inference.pipelines.components import rotate_image
+
+
class EnhancedDocPreprocessor:
    """Document preprocessor that gates PaddleX orientation classification.

    Implements the staged orientation check that this module attributes to
    MinerU:

    1. Quick filter: only portrait images (height / width above
       ``portrait_threshold``) are candidates for classification.
    2. OCR guidance: text boxes are detected and the share of "vertical"
       boxes (width / height < 0.8) is measured.
    3. Classification: the orientation model is called only when enough
       vertical boxes were found.
    """

    def __init__(
        self,
        doc_ori_classify_model,
        doc_unwarping_model,
        ocr_det_model=None,  # optional OCR text-detection model
        device: str = "cpu",
        use_doc_orientation_classify: bool = True,
        use_doc_unwarping: bool = False,
        batch_size: int = 1,
    ):
        """Store the models and thresholds used by the preprocessing steps.

        Args:
            doc_ori_classify_model: PaddleX orientation classification model.
            doc_unwarping_model: PaddleX document unwarping model.
            ocr_det_model: OCR text-detection model used to decide whether
                classification is needed. When ``None`` one is created via
                ``_initialize_ocr_det_model``.
            device: Device string handed to ``create_model`` (cpu/gpu).
            use_doc_orientation_classify: Default for orientation handling.
            use_doc_unwarping: Default for document unwarping.
            batch_size: Batch size given to ``ImageBatchSampler``.
        """
        self.doc_ori_classify_model = doc_ori_classify_model
        self.doc_unwarping_model = doc_unwarping_model
        self.device = device
        self.use_doc_orientation_classify = use_doc_orientation_classify
        self.use_doc_unwarping = use_doc_unwarping
        self.batch_size = batch_size

        self.img_reader = ReadImage(format="BGR")
        self.batch_sampler = ImageBatchSampler(batch_size=batch_size)

        # Thresholds of the staged check.
        self.portrait_threshold = 1.2  # height / width above this => portrait
        self.vertical_ratio_threshold = 0.28  # min share of vertical boxes
        self.min_vertical_count = 3  # min absolute number of vertical boxes

        # Build the OCR detection model once, only when none was supplied.
        self.ocr_det_model = ocr_det_model
        if self.ocr_det_model is None:
            self._initialize_ocr_det_model()

        print(f"📐 Enhanced DocPreprocessor initialized")
        print(f"   - Device: {self.device}")
        print(f"   - Portrait threshold: {self.portrait_threshold}")
        print(f"   - Vertical ratio threshold: {self.vertical_ratio_threshold}")
        print(f"   - Min vertical count: {self.min_vertical_count}")
        print(f"   - OCR detection model: {'✅ Available' if self.ocr_det_model else '❌ Not available'}")

    def _initialize_ocr_det_model(self):
        """Create the OCR detection model; leave it ``None`` on failure."""
        try:
            from paddlex import create_model

            print("🔧 Initializing OCR detection model...")
            self.ocr_det_model = create_model(
                'PP-OCRv5_server_det',
                device=self.device
            )
            print("✅ OCR detection model initialized successfully")

        except Exception as e:
            # Best effort: without the model the OCR-guided stage is skipped.
            print(f"⚠️  Failed to initialize OCR detection model: {e}")
            print("   Will skip OCR-guided filtering")
            self.ocr_det_model = None

    def _is_portrait_image(self, image: np.ndarray) -> bool:
        """Return True when height / width exceeds ``portrait_threshold``."""
        img_height, img_width = image.shape[:2]
        aspect_ratio = img_height / img_width if img_width > 0 else 1.0
        is_portrait = aspect_ratio > self.portrait_threshold
        print(f"   📏 Image size: {img_width}x{img_height}, aspect_ratio: {aspect_ratio:.2f}, is_portrait: {is_portrait}")
        return is_portrait

    @staticmethod
    def _is_vertical_box(box) -> bool:
        """Return True when ``box`` is a 4-point quad with width/height < 0.8.

        Anything that cannot be read as a ``(4, 2)`` numeric array, or whose
        measured height is zero, is treated as not vertical.
        """
        try:
            quad = np.asarray(box, dtype=float)
        except (TypeError, ValueError):
            return False
        if quad.shape != (4, 2):
            return False

        # Width from the first edge (p1 -> p2), height from the second edge
        # (p2 -> p3).
        width = abs(quad[1, 0] - quad[0, 0])
        height = abs(quad[2, 1] - quad[1, 1])
        if height == 0:
            return False
        return bool(width / height < 0.8)

    def _detect_vertical_text_boxes(self, image: np.ndarray) -> Tuple[int, int]:
        """Count vertical text boxes detected in ``image``.

        Boxes are read from the ``dt_polys`` entry of a dict result, or from
        the result itself when it is an ndarray. Each box is inspected
        individually, so a plain list of quads is handled the same way as an
        ``(N, 4, 2)`` array.

        Returns:
            (vertical_count, total_count). ``(0, 0)`` when no model is
            available, nothing was detected, or detection raised.
        """
        if self.ocr_det_model is None:
            print("   ⚠️  OCR detection model not available")
            return 0, 0

        try:
            det_results = list(self.ocr_det_model([image]))
            if not det_results:
                print("   ℹ️  No OCR detection results")
                return 0, 0

            det_result = det_results[0]

            boxes = None
            if isinstance(det_result, dict):
                boxes = det_result.get('dt_polys', None)
            elif isinstance(det_result, np.ndarray):
                boxes = det_result

            if boxes is None or len(boxes) == 0:
                print("   ℹ️  No text boxes detected")
                return 0, 0

            total_count = len(boxes)
            vertical_count = sum(1 for box in boxes if self._is_vertical_box(box))

            print(f"   📊 OCR detection: {vertical_count}/{total_count} vertical boxes ({vertical_count/total_count:.1%} vertical)")
            return vertical_count, total_count

        except Exception as e:
            # Detection is advisory; report the failure and act as if no
            # text had been found.
            print(f"   ⚠️  OCR detection failed: {e}")
            import traceback
            traceback.print_exc()
            return 0, 0

    def _should_classify_orientation(self, image: np.ndarray) -> bool:
        """Decide whether the orientation classifier should run on ``image``.

        Returns:
            True when the image is portrait and both the vertical-box ratio
            and the vertical-box count reach their thresholds; False
            otherwise (the image is then used unrotated).
        """
        print("🔍 Checking if orientation classification is needed...")

        # Stage 1: cheap aspect-ratio filter.
        if not self._is_portrait_image(image):
            print("   ⏭️  Skipped: Image is landscape")
            return False

        # Stage 2: OCR-guided check.
        vertical_count, total_count = self._detect_vertical_text_boxes(image)

        if total_count == 0:
            print("   ⏭️  Skipped: No text detected")
            return False

        vertical_ratio = vertical_count / total_count
        is_rotated = (
            vertical_ratio >= self.vertical_ratio_threshold and
            vertical_count >= self.min_vertical_count
        )

        print(f"   📈 Vertical ratio: {vertical_ratio:.1%} (threshold: {self.vertical_ratio_threshold:.1%})")
        print(f"   📊 Vertical count: {vertical_count} (min: {self.min_vertical_count})")
        print(f"   🎯 Need classification: {is_rotated}")

        return is_rotated

    def _predict_orientation(self, image: np.ndarray) -> int:
        """Run the orientation classifier on a BGR image.

        Whether orientation handling is enabled is decided by ``predict``
        (which honours its per-call override); this helper only needs a
        model to be present.

        Returns:
            The angle parsed from the first predicted label name, or 0 when
            there is no model, no prediction, or the prediction raised.
        """
        if self.doc_ori_classify_model is None:
            return 0

        try:
            preds = list(self.doc_ori_classify_model([image]))
            if preds:
                angle = int(preds[0]["label_names"][0])
                print(f"   🔄 Orientation classification result: {angle}°")
                return angle
            return 0
        except Exception as e:
            print(f"   ⚠️  Orientation prediction failed: {e}")
            return 0

    def predict(
        self,
        input: Union[str, List[str], np.ndarray, List[np.ndarray]],
        use_doc_orientation_classify: Optional[bool] = None,
        use_doc_unwarping: Optional[bool] = None,
    ):
        """Preprocess the input images and yield one result per image.

        Args:
            input: Image path(s) or array(s), forwarded to the batch sampler.
            use_doc_orientation_classify: Per-call override; ``None`` uses
                the instance default.
            use_doc_unwarping: Per-call override; ``None`` uses the instance
                default.

        Yields:
            DocPreprocessorResult: built from the input image, the angle
            (``-1`` when orientation handling is disabled), the rotated
            image and the final output image.
        """
        if use_doc_orientation_classify is None:
            use_doc_orientation_classify = self.use_doc_orientation_classify
        if use_doc_unwarping is None:
            use_doc_unwarping = self.use_doc_unwarping

        model_settings = {
            "use_doc_orientation_classify": use_doc_orientation_classify,
            "use_doc_unwarping": use_doc_unwarping,
        }

        print(f"\n{'='*60}")
        print(f"🎯 Enhanced DocPreprocessor - MinerU Algorithm")
        print(f"   Settings: orientation={use_doc_orientation_classify}, unwarping={use_doc_unwarping}")
        print(f"{'='*60}\n")

        for batch_data in self.batch_sampler(input):
            image_arrays = self.img_reader(batch_data.instances)

            angles = []
            rot_imgs = []

            for idx, img in enumerate(image_arrays):
                print(f"\n📄 Processing image {idx + 1}/{len(image_arrays)}")

                if use_doc_orientation_classify:
                    # Only pay for the classifier when the staged check asks
                    # for it.
                    if self._should_classify_orientation(img):
                        angle = self._predict_orientation(img)
                    else:
                        angle = 0
                        print("   ⏭️  Skipped orientation classification")

                    angles.append(angle)
                    if angle != 0:
                        rot_img = rotate_image(img, angle)
                    else:
                        rot_img = img
                    rot_imgs.append(rot_img)
                else:
                    angles.append(-1)  # -1 marks "orientation not evaluated"
                    rot_imgs.append(img)

            if use_doc_unwarping and self.doc_unwarping_model is not None:
                # The last axis of "doctr_img" is reversed before use.
                output_imgs = [
                    item["doctr_img"][:, :, ::-1]
                    for item in self.doc_unwarping_model(rot_imgs)
                ]
            else:
                output_imgs = rot_imgs

            for input_path, page_index, image_array, angle, rot_img, output_img in zip(
                batch_data.input_paths,
                batch_data.page_indexes,
                image_arrays,
                angles,
                rot_imgs,
                output_imgs,
            ):
                single_img_res = {
                    "input_path": input_path,
                    "page_index": page_index,
                    "input_img": image_array,
                    "model_settings": model_settings,
                    "angle": angle,
                    "rot_img": rot_img,
                    "output_img": output_img,
                }
                yield DocPreprocessorResult(single_img_res)

    def __call__(self, *args, **kwargs):
        """Make the instance callable; forwards to ``predict``."""
        return self.predict(*args, **kwargs)
+
+
class DocPreprocessorAdapter:
    """Monkey-patch adapter for PaddleX's ``_DocPreprocessorPipeline.predict``.

    ``apply`` swaps the pipeline's ``predict`` for a version that delegates
    to ``EnhancedDocPreprocessor``; ``restore`` puts the original back and
    drops the shared resources.
    """

    _original_predict = None
    _shared_ocr_det_model = None  # OCR detection model shared by all preprocessors
    # True once creation of the shared model has been tried, so a failing
    # initialisation is not repeated on every predict call.
    _ocr_det_init_attempted = False
    # Enhanced preprocessors keyed by device/settings/batch size.
    # NOTE(review): the key does not identify the pipeline instance, so two
    # pipelines with equal settings share the first one's models - confirm
    # that this is intended.
    _enhanced_preprocessor_cache = {}

    @classmethod
    def _get_cache_key(cls, device: str, use_doc_orientation_classify: bool,
                       use_doc_unwarping: bool, batch_size: int) -> str:
        """Build the cache key for an enhanced preprocessor configuration."""
        return f"{device}_{use_doc_orientation_classify}_{use_doc_unwarping}_{batch_size}"

    @classmethod
    def _ensure_shared_ocr_det_model(cls, device):
        """Return the shared OCR detection model, creating it at most once.

        A failed attempt is remembered; the next attempt happens only after
        ``restore`` has reset the adapter.
        """
        if cls._shared_ocr_det_model is not None or cls._ocr_det_init_attempted:
            return cls._shared_ocr_det_model

        cls._ocr_det_init_attempted = True
        print("\n" + "="*80)
        print(">>> [Adapter] Enhanced DocPreprocessor - First Time Initialization")
        print("="*80)
        print("🔧 Initializing shared OCR detection model...")
        try:
            from paddlex import create_model
            cls._shared_ocr_det_model = create_model(
                'PP-OCRv5_server_det',
                device=device
            )
            print("✅ Shared OCR detection model initialized")
        except Exception as e:
            print(f"⚠️  Failed to initialize OCR detection model: {e}")
            cls._shared_ocr_det_model = None
        return cls._shared_ocr_det_model

    @classmethod
    def _get_enhanced_preprocessor(cls, pipeline):
        """Return the cached enhanced preprocessor for ``pipeline``'s settings.

        A new ``EnhancedDocPreprocessor`` is built from the pipeline's models
        and stored in the cache on a miss.
        """
        cache_key = cls._get_cache_key(
            device=pipeline.device,
            use_doc_orientation_classify=pipeline.use_doc_orientation_classify,
            use_doc_unwarping=pipeline.use_doc_unwarping,
            batch_size=pipeline.batch_sampler.batch_size
        )

        if cache_key in cls._enhanced_preprocessor_cache:
            print(f"♻️  Reusing cached enhanced preprocessor: {cache_key}")
            return cls._enhanced_preprocessor_cache[cache_key]

        print("🔧 Creating new enhanced preprocessor instance...")
        enhanced_preprocessor = EnhancedDocPreprocessor(
            doc_ori_classify_model=pipeline.doc_ori_classify_model if pipeline.use_doc_orientation_classify else None,
            doc_unwarping_model=pipeline.doc_unwarping_model if pipeline.use_doc_unwarping else None,
            ocr_det_model=cls._shared_ocr_det_model,
            device=pipeline.device,
            use_doc_orientation_classify=pipeline.use_doc_orientation_classify,
            use_doc_unwarping=pipeline.use_doc_unwarping,
            batch_size=pipeline.batch_sampler.batch_size,
        )
        cls._enhanced_preprocessor_cache[cache_key] = enhanced_preprocessor
        print(f"✅ Enhanced preprocessor cached with key: {cache_key}")
        return enhanced_preprocessor

    @classmethod
    def apply(cls, use_enhanced: bool = True):
        """Install the enhanced ``predict`` on the PaddleX pipeline class.

        Args:
            use_enhanced: When False the adapter is restored instead and
                False is returned.

        Returns:
            True when the patch was installed, False otherwise.
        """
        if not use_enhanced:
            cls.restore()
            return False

        try:
            from paddlex.inference.pipelines.doc_preprocessor import pipeline

            # Remember the original only once so repeated apply() calls do
            # not capture the already-patched method.
            if cls._original_predict is None:
                cls._original_predict = pipeline._DocPreprocessorPipeline.predict

            def enhanced_predict(
                self,
                input: Union[str, List[str], np.ndarray, List[np.ndarray]],
                use_doc_orientation_classify: Optional[bool] = None,
                use_doc_unwarping: Optional[bool] = None,
            ):
                """Replacement ``predict`` delegating to the enhanced preprocessor."""
                cls._ensure_shared_ocr_det_model(self.device)
                enhanced_preprocessor = cls._get_enhanced_preprocessor(self)
                return enhanced_preprocessor.predict(
                    input,
                    use_doc_orientation_classify,
                    use_doc_unwarping,
                )

            pipeline._DocPreprocessorPipeline.predict = enhanced_predict

            print("✅ DocPreprocessor adapter applied successfully (MinerU algorithm)")
            return True

        except Exception as e:
            print(f"❌ Failed to apply DocPreprocessor adapter: {e}")
            import traceback
            traceback.print_exc()
            return False

    @classmethod
    def restore(cls):
        """Reinstall the original ``predict`` and drop shared resources.

        Returns:
            True when the original method was restored, False when nothing
            had been patched or the restore failed.
        """
        if cls._original_predict is None:
            return False

        try:
            from paddlex.inference.pipelines.doc_preprocessor import pipeline

            pipeline._DocPreprocessorPipeline.predict = cls._original_predict
            cls._original_predict = None

            # Release the shared model and cached preprocessors, and allow a
            # fresh initialisation attempt after the next apply().
            cls._shared_ocr_det_model = None
            cls._ocr_det_init_attempted = False
            cls._enhanced_preprocessor_cache.clear()

            print("✅ DocPreprocessor adapter restored")
            return True

        except Exception as e:
            print(f"❌ Failed to restore DocPreprocessor adapter: {e}")
            return False
+
+
+# 🎯 便捷函数
def apply_enhanced_doc_preprocessor():
    """Install the enhanced document preprocessor.

    Returns the result of ``DocPreprocessorAdapter.apply(use_enhanced=True)``.
    """
    applied = DocPreprocessorAdapter.apply(use_enhanced=True)
    return applied
+
+
def restore_paddlex_doc_preprocessor():
    """Put PaddleX's original document preprocessor back.

    Returns the result of ``DocPreprocessorAdapter.restore()``.
    """
    restored = DocPreprocessorAdapter.restore()
    return restored
+
+
+# 导出
+__all__ = [
+    'EnhancedDocPreprocessor',
+    'DocPreprocessorAdapter',
+    'apply_enhanced_doc_preprocessor',
+    'restore_paddlex_doc_preprocessor',
+]

+ 485 - 0
ocr_tools/paddle_common/adapters/table_recognition_adapter.py

@@ -0,0 +1,485 @@
+"""
+表格识别个性化适配器 (v6 - 行内重叠合并修正版)
+
+核心思想:
+1. 废弃全局坐标聚类,改为按行分组和对齐,极大提升对倾斜、不规则表格的鲁棒性。
+2. 结构生成与内容填充彻底分离:
+   - `build_robust_html_from_cells`: 仅根据单元格几何位置,生成带`data-bbox`的HTML骨架。
+   - `fill_html_with_ocr_by_bbox`: 根据`data-bbox`从全局OCR结果中查找文本并填充。
+3. 通过适配器直接替换PaddleX Pipeline中的核心方法,实现无侵入式升级。
+"""
+import importlib
+from typing import Any, Dict, List
+import numpy as np
+
+from paddlex.inference.pipelines.table_recognition.result import SingleTableRecognitionResult
+from paddlex.inference.pipelines.table_recognition.pipeline_v2 import OCRResult
+
def _normalize_bbox(box: list) -> list:
    """Convert a flat 8-value quad or a 4-value box to ``[x1, y1, x2, y2]``.

    Args:
        box: Either 8 values ``x1, y1, x2, y2, x3, y3, x4, y4`` (four corner
            points) or 4 values that are already ``x1, y1, x2, y2``.

    Returns:
        A new list; for 8 values the axis-aligned bounding box of the four
        points, for 4 values a copy of the input. The result is always a
        list, whatever sequence type was passed in.

    Raises:
        ValueError: If ``box`` has neither 4 nor 8 values.
    """
    size = len(box)
    if size == 8:
        # Even positions hold x coordinates, odd positions hold y.
        xs = box[0::2]
        ys = box[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]
    if size == 4:
        return list(box)
    raise ValueError(f"Unsupported bbox format: {box}")
+
+# --- 1. 核心算法:基于排序和行分组的HTML结构生成 ---
def filter_nested_boxes(boxes: List[list]) -> List[list]:
    """Drop every box that is fully contained in another box.

    Args:
        boxes: Boxes as ``[x1, y1, x2, y2]``. The list is not modified.

    Returns:
        The surviving boxes ordered by area, largest first. Of two identical
        boxes only the first is kept.
    """
    if not boxes:
        return []

    # Largest first, so a box can only be nested in one that precedes it.
    # sorted() leaves the caller's list order untouched.
    by_area = sorted(
        boxes,
        key=lambda b: (b[2] - b[0]) * (b[3] - b[1]),
        reverse=True,
    )

    kept = []
    for box in by_area:
        # Containment is transitive, so checking against the boxes kept so
        # far is equivalent to checking against every larger box.
        nested = any(
            outer[0] <= box[0] and outer[1] <= box[1]
            and outer[2] >= box[2] and outer[3] >= box[3]
            for outer in kept
        )
        if not nested:
            kept.append(box)
    return kept
+
def merge_overlapping_cells_in_row(row_cells: List[list], iou_threshold: float = 0.5) -> List[list]:
    """Merge strongly overlapping neighbouring cells of a single row.

    Cells are visited left to right (by ``x1``). A cell absorbs its
    successor while their intersection area exceeds ``iou_threshold`` times
    the smaller of the two areas; the merged cell is their bounding box.
    Merging stops at the first successor that does not overlap enough.

    Returns:
        New cell lists ordered by ``x1``; the input cells are not modified.
    """
    if not row_cells:
        return []

    def _area(rect):
        return (rect[2] - rect[0]) * (rect[3] - rect[1])

    def _overlaps(base, other):
        overlap_w = max(0, min(base[2], other[2]) - max(base[0], other[0]))
        overlap_h = max(0, min(base[3], other[3]) - max(base[1], other[1]))
        return overlap_w * overlap_h > min(_area(base), _area(other)) * iou_threshold

    ordered = sorted(row_cells, key=lambda c: c[0])
    count = len(ordered)

    result = []
    pos = 0
    while pos < count:
        merged = list(ordered[pos])  # copy so the input cell stays intact
        pos += 1
        while pos < count and _overlaps(merged, ordered[pos]):
            candidate = ordered[pos]
            merged[0] = min(merged[0], candidate[0])
            merged[1] = min(merged[1], candidate[1])
            merged[2] = max(merged[2], candidate[2])
            merged[3] = max(merged[3], candidate[3])
            pos += 1
        result.append(merged)

    return result
+
def build_robust_html_from_cells(cells_det_results: List[list]) -> str:
    """Turn detected cell boxes into an HTML table skeleton.

    Nested boxes are removed, the remaining cells are sorted top-to-bottom /
    left-to-right and grouped into rows by vertical overlap, overlapping
    cells inside a row are merged, and every cell becomes an empty ``<td>``
    carrying its integer box in a ``data-bbox`` attribute.

    Returns:
        The HTML string; an empty table when no cells are given.
    """
    if not cells_det_results:
        return "<table><tbody></tbody></table>"

    # Work on a deep copy so the caller's boxes are never touched.
    import copy
    ordered = filter_nested_boxes(copy.deepcopy(cells_det_results))
    ordered.sort(key=lambda c: (c[1], c[0]))

    grouped_rows = []
    if ordered:
        first, *remaining = ordered
        band = [first]
        # The row is tracked by its vertical extent rather than one anchor.
        band_top, band_bottom = first[1], first[3]

        for cell in remaining:
            shared_height = max(0, min(band_bottom, cell[3]) - max(band_top, cell[1]))
            mean_height = ((cell[3] - cell[1]) + (band_bottom - band_top)) / 2

            # Same row when the overlap exceeds half of the mean height.
            if shared_height > mean_height * 0.5:
                band.append(cell)
                band_top = min(band_top, cell[1])
                band_bottom = max(band_bottom, cell[3])
                continue

            grouped_rows.append(band)
            band = [cell]
            band_top, band_bottom = cell[1], cell[3]
        grouped_rows.append(band)

    parts = ["<table><tbody>"]
    for row_cells in grouped_rows:
        parts.append("<tr>")
        # Merge overlapping cells of the row before emitting them.
        for cell in merge_overlapping_cells_in_row(row_cells):
            bbox_str = f"[{','.join(map(str, map(int, cell)))}]"
            parts.append(f'<td data-bbox="{bbox_str}"></td>')
        parts.append("</tr>")
    parts.append("</tbody></table>")

    return "".join(parts)
+
+# --- 2. 内容填充工具 ---
+
def fill_html_with_ocr_by_bbox(html_skeleton: str, ocr_dt_boxes: list, ocr_texts: list) -> str:
    """Fill the ``<td data-bbox=...>`` cells of a skeleton with OCR text.

    An OCR item belongs to a cell when the centre of its box lies inside the
    cell's ``data-bbox``. Texts of one cell are ordered by the top ``y`` of
    their boxes and joined with single spaces.

    Returns:
        The filled HTML, or the unchanged skeleton when BeautifulSoup is not
        installed.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print("⚠️  BeautifulSoup not installed. Cannot fill table content. Returning skeleton.")
        return html_skeleton

    soup = BeautifulSoup(html_skeleton, 'html.parser')

    # Pre-compute (text, top y, centre x, centre y) once for all cells.
    candidates = [
        (text, box[1], (box[0] + box[2]) / 2, (box[1] + box[3]) / 2)
        for box, text in zip(ocr_dt_boxes, ocr_texts)
    ]

    for td in soup.find_all('td'):
        if not td.has_attr('data-bbox'):
            continue

        raw_coords = td['data-bbox'].strip('[]').split(',')
        left, top, right, bottom = map(float, raw_coords)

        hits = [
            (text, top_y)
            for text, top_y, centre_x, centre_y in candidates
            if left <= centre_x <= right and top <= centre_y <= bottom
        ]
        if not hits:
            continue

        # Stable sort by y keeps multi-line cell text in reading order.
        hits.sort(key=lambda hit: hit[1])
        td.string = " ".join(text for text, _ in hits)

    return str(soup)
+
+# --- 3. 适配器主函数和应用逻辑 ---
+
+# 保存原始方法的引用
+_original_predict_single = None
+
def infer_missing_cells_from_ocr(
    detected_cells: List[list],
    cells_texts_list: List[str],
    overall_ocr_boxes: List[list],
    overall_ocr_texts: List[str],
    table_box: list
) -> "tuple[List[list], List[str]]":
    """Infer table cells that detection missed, guided by OCR boxes.

    An OCR box whose centre lies in no detected cell is "uncovered". For
    each uncovered box lying vertically inside a row of detected cells, a
    cell is inferred that spans the row's height and the horizontal gap
    between the neighbouring cells (or the table border).

    Args:
        detected_cells: Detected cell boxes ``[[x1, y1, x2, y2], ...]``.
        cells_texts_list: Texts belonging to ``detected_cells``.
        overall_ocr_boxes: OCR boxes ``[x1, y1, x2, y2]``.
        overall_ocr_texts: OCR texts, parallel to ``overall_ocr_boxes``.
        table_box: Table region ``[x1, y1, x2, y2]``; supplies the left and
            right bounds when an inferred cell has no neighbour on that side.

    Returns:
        ``(cells, texts)``: the detected cells followed by the inferred
        ones, and the matching texts. When every OCR box is covered the two
        input lists are returned unchanged.
    """
    # NOTE: the return annotation is a string so the definition also loads
    # on interpreters where ``tuple[...]`` is not subscriptable.

    # 1. Collect OCR items whose centre is not inside any detected cell.
    uncovered = []
    for ocr_box, ocr_text in zip(overall_ocr_boxes, overall_ocr_texts):
        ocr_cx = (ocr_box[0] + ocr_box[2]) / 2
        ocr_cy = (ocr_box[1] + ocr_box[3]) / 2
        is_covered = any(
            cell[0] <= ocr_cx <= cell[2] and cell[1] <= ocr_cy <= cell[3]
            for cell in detected_cells
        )
        if not is_covered:
            uncovered.append((ocr_box, ocr_text))

    if not uncovered:
        return detected_cells, cells_texts_list  # nothing was missed

    # 2. Group the detected cells into rows. A row is anchored on the centre
    #    and height of its first cell.
    rows = []
    cells_sorted = sorted(detected_cells, key=lambda c: (c[1], c[0]))
    if cells_sorted:
        current_row = [cells_sorted[0]]
        row_y = (cells_sorted[0][1] + cells_sorted[0][3]) / 2
        row_height = cells_sorted[0][3] - cells_sorted[0][1]

        for cell in cells_sorted[1:]:
            cell_y = (cell[1] + cell[3]) / 2
            if abs(cell_y - row_y) < row_height * 0.7:
                current_row.append(cell)
            else:
                rows.append(current_row)
                current_row = [cell]
                row_y = cell_y
                row_height = cell[3] - cell[1]
        rows.append(current_row)

    # The vertical span of each row is the same for every OCR box, so it is
    # computed once here.
    row_spans = [
        (min(c[1] for c in row), max(c[3] for c in row))
        for row in rows
    ]

    # 3. Infer one cell per uncovered OCR box.
    inferred_cells = []
    inferred_texts = []
    for ocr_box, ocr_text in uncovered:
        ocr_cx = (ocr_box[0] + ocr_box[2]) / 2
        ocr_cy = (ocr_box[1] + ocr_box[3]) / 2

        # First row whose vertical span contains the OCR centre.
        target = next(
            (
                (row, span)
                for row, span in zip(rows, row_spans)
                if span[0] <= ocr_cy <= span[1]
            ),
            None,
        )
        if target is None:
            # No row contains this text; it cannot be placed.
            print(f"⚠️  无法为OCR文本 '{ocr_text}' 确定所属行")
            continue

        target_row, (cell_y1, cell_y2) = target

        # 4. Horizontal bounds: right edge of the nearest cell on the left
        #    and left edge of the nearest cell on the right, falling back to
        #    the table border.
        left_edges = [c[2] for c in target_row if c[2] < ocr_cx]
        cell_x1 = max(left_edges) if left_edges else table_box[0]

        right_edges = [c[0] for c in target_row if c[0] > ocr_cx]
        cell_x2 = min(right_edges) if right_edges else table_box[2]

        inferred_cell = [cell_x1, cell_y1, cell_x2, cell_y2]
        inferred_cells.append(inferred_cell)
        inferred_texts.append(ocr_text)

        print(f"✅ 为OCR文本 '{ocr_text}' 推断单元格: {inferred_cell}")

    # 5. Detected cells first, inferred cells appended.
    all_cells = detected_cells + inferred_cells
    all_texts = cells_texts_list + inferred_texts
    return all_cells, all_texts
+
+
+def enhanced_predict_single_table_recognition_res(
+    self,
+    image_array: np.ndarray,
+    overall_ocr_res: OCRResult,
+    table_box: list,
+    use_e2e_wired_table_rec_model: bool = False,
+    use_e2e_wireless_table_rec_model: bool = False,
+    use_wired_table_cells_trans_to_html: bool = False,
+    use_wireless_table_cells_trans_to_html: bool = False,
+    use_ocr_results_with_table_cells: bool = True,
+    flag_find_nei_text: bool = True,
+) -> SingleTableRecognitionResult:
+    """Enhanced table recognition for one table, using OCR-guided cell completion.
+
+    Classifies the table, runs the matching cell-detection model, and, when
+    cells-to-HTML conversion together with per-cell OCR is requested, builds
+    the table HTML from the detected cells plus cells inferred from the
+    overall OCR result. In every other case the call is delegated unchanged
+    to ``_original_predict_single``.
+
+    Args:
+        self: The pipeline instance; this function is installed as a method
+            by ``apply_table_recognition_adapter``.
+        image_array: Table image passed to the classification and
+            cell-detection models and to the per-cell OCR.
+        overall_ocr_res: Overall OCR result; its
+            ``["doc_preprocessor_res"]["output_img"]`` supplies the image
+            shape used for coordinate conversion.
+        table_box: Table box; its first two entries are used as the crop
+            start point.
+        use_e2e_wired_table_rec_model: Only forwarded to the original method.
+        use_e2e_wireless_table_rec_model: Only forwarded to the original method.
+        use_wired_table_cells_trans_to_html: Enables the enhanced path
+            (together with ``use_ocr_results_with_table_cells``).
+        use_wireless_table_cells_trans_to_html: Enables the enhanced path
+            (together with ``use_ocr_results_with_table_cells``).
+        use_ocr_results_with_table_cells: Must be true for the enhanced path.
+        flag_find_nei_text: Only forwarded to the original method.
+
+    Returns:
+        On the enhanced path, a ``SingleTableRecognitionResult`` holding
+        ``cell_box_list``, ``table_ocr_pred`` and ``pred_html``, with
+        ``neighbor_texts`` set to an empty string; otherwise whatever
+        ``_original_predict_single`` returns.
+    """
+    print(">>> [Adapter] enhanced_predict_single_table_recognition_res called")
+    
+    # Step 1: obtain table_cells_result (same as the original logic).
+    # NOTE: classification and cell detection run before the fallback
+    # decision below, so they are executed even when the call ends up
+    # delegated to the original implementation.
+    table_cls_pred = list(self.table_cls_model(image_array))[0]
+    table_cls_result = self.extract_results(table_cls_pred, "cls")
+
+    if table_cls_result == "wired_table":
+        table_cells_pred = list(self.wired_table_cells_detection_model(image_array, threshold=0.3))[0]
+    else: # wireless_table
+        table_cells_pred = list(self.wireless_table_cells_detection_model(image_array, threshold=0.3))[0]
+    
+    table_cells_result, table_cells_score = self.extract_results(table_cells_pred, "det")
+    table_cells_result, table_cells_score = self.cells_det_results_nms(table_cells_result, table_cells_score)
+    # Order cells by their second, then first coordinate.
+    table_cells_result.sort(key=lambda c: (c[1], c[0]))
+    
+    # Step 2: coordinate conversion.
+    from paddlex.inference.pipelines.table_recognition.table_recognition_post_processing_v2 import (
+        convert_to_four_point_coordinates,
+        convert_table_structure_pred_bbox,
+        get_sub_regions_ocr_res
+    )
+    # Local import; `np` is also referenced by the signature annotation above.
+    import numpy as np
+    
+    # Convert to 4-point coordinates.
+    table_cells_result_4pt = convert_to_four_point_coordinates(table_cells_result)
+    
+    # Prepare the parameters for the coordinate conversion.
+    table_box_array = np.array([table_box])
+    crop_start_point = [table_box[0], table_box[1]]
+    img_shape = overall_ocr_res["doc_preprocessor_res"]["output_img"].shape[0:2]
+    
+    # Convert into the original-image coordinate system.
+    table_cells_result_orig = convert_table_structure_pred_bbox(
+        table_cells_result_4pt, crop_start_point, img_shape
+    )
+    # The converter may return a NumPy array; normalise to a list so it can be sorted in place.
+    if isinstance(table_cells_result_orig, np.ndarray):
+        table_cells_result_orig = table_cells_result_orig.tolist()
+    # NOTE(review): this list is sorted independently of table_cells_result,
+    # yet the per-cell OCR texts below are paired with it by index. This
+    # assumes the conversion preserves the (c[1], c[0]) ordering - TODO confirm.
+    table_cells_result_orig.sort(key=lambda c: (c[1], c[0]))
+
+    # Step 3: get the OCR result restricted to the table region.
+    table_ocr_pred = get_sub_regions_ocr_res(overall_ocr_res, table_box_array)
+    
+    # Step 4: key improvement - OCR-guided completion of missing cells.
+    if (use_wired_table_cells_trans_to_html or use_wireless_table_cells_trans_to_html) and use_ocr_results_with_table_cells:
+        # Fix: make sure general_ocr_pipeline has been initialised.
+        if self.general_ocr_pipeline is None:
+            if hasattr(self, 'general_ocr_config_bak') and self.general_ocr_config_bak is not None:
+                print("🔧 [Adapter] Initializing general_ocr_pipeline from backup config")
+                self.general_ocr_pipeline = self.create_pipeline(self.general_ocr_config_bak)
+            else:
+                print("⚠️  [Adapter] No OCR pipeline available, falling back to original implementation")
+                return _original_predict_single(
+                    self, image_array, overall_ocr_res, table_box,
+                    use_e2e_wired_table_rec_model, use_e2e_wireless_table_rec_model,
+                    use_wired_table_cells_trans_to_html, use_wireless_table_cells_trans_to_html,
+                    use_ocr_results_with_table_cells, flag_find_nei_text
+                )
+        
+        # OCR every cell, using the detector's coordinates (before conversion
+        # to the original-image frame) because they index into image_array.
+        cells_texts_list = self.gen_ocr_with_table_cells(image_array, table_cells_result)
+        
+        # Complete the missing cells from the table-region OCR boxes/texts.
+        completed_cells, cells_texts_list = infer_missing_cells_from_ocr(
+            detected_cells=table_cells_result_orig,
+            cells_texts_list=cells_texts_list,
+            overall_ocr_boxes=table_ocr_pred["rec_boxes"],
+            overall_ocr_texts=table_ocr_pred["rec_texts"],
+            table_box=table_box
+        )
+
+        # Build the HTML skeleton from the cells in converted original-image coordinates.
+        html_skeleton = build_robust_html_from_cells(completed_cells)
+        
+        # Fill in the content using the cell bboxes and the per-cell OCR texts.
+        pred_html = fill_html_with_ocr_by_bbox(
+            html_skeleton,
+            completed_cells,      # cell bounding boxes
+            cells_texts_list      # per-cell OCR texts
+        )
+        
+        single_img_res = {
+            "cell_box_list": completed_cells,
+            "table_ocr_pred": table_ocr_pred,  # keep the full OCR information
+            "pred_html": pred_html,
+        }
+        
+        res = SingleTableRecognitionResult(single_img_res)
+        # The enhanced path does not look up neighbouring text.
+        res["neighbor_texts"] = ""
+        return res
+    else:
+        print(f"⚠️  Fallback to original implementation: {table_cls_result}")
+        return _original_predict_single(
+            self, image_array, overall_ocr_res, table_box,
+            use_e2e_wired_table_rec_model, use_e2e_wireless_table_rec_model,
+            use_wired_table_cells_trans_to_html, use_wireless_table_cells_trans_to_html,
+            use_ocr_results_with_table_cells, flag_find_nei_text
+        )
+
+
+def apply_table_recognition_adapter():
+    """
+    应用表格识别适配器。
+    我们直接替换 _TableRecognitionPipelineV2 类中的 `predict_single_table_recognition_res` 方法。
+    """
+    global _original_predict_single
+    
+    try:
+        # 导入目标类
+        from paddlex.inference.pipelines.table_recognition.pipeline_v2 import _TableRecognitionPipelineV2
+        
+        # 保存原函数,防止重复应用补丁
+        if _original_predict_single is None:
+             _original_predict_single = _TableRecognitionPipelineV2.predict_single_table_recognition_res
+        
+        # 替换为增强版
+        _TableRecognitionPipelineV2.predict_single_table_recognition_res = enhanced_predict_single_table_recognition_res
+        
+        print("✅ Table recognition adapter applied successfully (v3 - corrected).")
+        return True
+        
+    except Exception as e:
+        print(f"❌ Failed to apply table recognition adapter: {e}")
+        return False
+
+
+def restore_original_function():
+    """恢复原始函数"""
+    global _original_predict_single
+    try:
+        from paddlex.inference.pipelines.table_recognition.pipeline_v2 import _TableRecognitionPipelineV2
+        
+        if _original_predict_single is not None:
+            _TableRecognitionPipelineV2.predict_single_table_recognition_res = _original_predict_single
+            _original_predict_single = None # 重置状态
+            print("✅ Original function restored.")
+            return True
+        return False
+    except Exception as e:
+        print(f"❌ Failed to restore original function: {e}")
+        return False

+ 226 - 0
ocr_tools/paddle_common/config/PP-StructureV3-RT-DETR-H_layout_17cls.yaml

@@ -0,0 +1,226 @@
+
+pipeline_name: PP-StructureV3
+
+batch_size: 8
+
+use_doc_preprocessor: True
+use_seal_recognition: True
+use_table_recognition: True
+use_formula_recognition: False
+use_chart_recognition: True
+use_region_detection: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: RT-DETR-H_layout_17cls
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.3  # paragraph_title
+      1: 0.5  # image
+      2: 0.4  # text
+      3: 0.5  # number
+      4: 0.5  # abstract
+      5: 0.5  # content
+      6: 0.5  # figure_table_chart_title
+      7: 0.3  # formula
+      8: 0.5  # table
+      9: 0.5  # reference
+      10: 0.5 # doc_title
+      11: 0.5 # footnote
+      12: 0.5 # header
+      13: 0.5 # algorithm
+      14: 0.5 # footer
+      15: 0.45 # seal
+      16: 0.5 # chart
+      17: 0.5 # formula_number
+      18: 0.5 # aside_text
+      19: 0.5 # reference_content
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "large"  # paragraph_title
+      1: "large"  # image
+      2: "union"  # text
+      3: "union"  # number
+      4: "union"  # abstract
+      5: "union"  # content
+      6: "union"  # figure_table_chart_title
+      7: "large"  # formula
+      8: "union"  # table
+      9: "union"  # reference
+      10: "union" # doc_title
+      11: "union" # footnote
+      12: "union" # header
+      13: "union" # algorithm
+      14: "union" # footer
+      15: "union" # seal
+      16: "large" # chart
+      17: "union" # formula_number
+      18: "union" # aside_text
+      19: "union" # reference_content
+  ChartRecognition:
+    module_name: chart_recognition
+    model_name: PP-Chart2Table
+    model_dir: null
+    batch_size: 1 
+  RegionDetection:
+    module_name: layout_detection
+    model_name: PP-DocBlockLayout
+    model_dir: null
+    layout_nms: True
+    layout_merge_bboxes_mode: "small"
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    batch_size: 8
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: True
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 2560
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.6
+        unclip_ratio: 1.5
+      TextLineOrientation:
+        module_name: textline_orientation
+        model_name: PP-LCNet_x1_0_textline_ori
+        model_dir: null
+        batch_size: 8
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 8
+        score_thresh: 0.0
+ 
+
+  TableRecognition:
+    pipeline_name: table_recognition_v2
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    use_ocr_model: False
+    SubModules:  
+      TableClassification:
+        module_name: table_classification
+        model_name: PP-LCNet_x1_0_table_cls
+        model_dir: null
+
+      WiredTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANeXt_wired
+        model_dir: null
+      
+      WirelessTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANet_plus
+        model_dir: null
+      
+      WiredTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wired_table_cell_det
+        model_dir: null
+      
+      WirelessTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wireless_table_cell_det
+        model_dir: null
+
+      TableOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+    SubPipelines:
+      GeneralOCR:
+        pipeline_name: OCR
+        text_type: general
+        use_doc_preprocessor: False
+        use_textline_orientation: True
+        SubModules:
+          TextDetection:
+            module_name: text_detection
+            model_name: PP-OCRv5_server_det
+            model_dir: null
+            limit_side_len: 2560
+            limit_type: max
+            max_side_limit: 4000
+            thresh: 0.3
+            box_thresh: 0.4
+            unclip_ratio: 1.5
+          TextLineOrientation:
+            module_name: textline_orientation
+            model_name: PP-LCNet_x1_0_textline_ori
+            model_dir: null
+            batch_size: 8
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+        score_thresh: 0.0
+
+  SealRecognition:
+    pipeline_name: seal_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubPipelines:
+      SealOCR:
+        pipeline_name: OCR
+        batch_size: 8
+        text_type: seal
+        use_doc_preprocessor: False
+        use_textline_orientation: False
+        SubModules:
+          TextDetection:
+            module_name: seal_text_detection
+            model_name: PP-OCRv4_server_seal_det
+            model_dir: null
+            limit_side_len: 2560
+            limit_type: max
+            max_side_limit: 4000
+            thresh: 0.2
+            box_thresh: 0.6
+            unclip_ratio: 0.5
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+            score_thresh: 0
+    
+  FormulaRecognition:
+    pipeline_name: formula_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubModules:
+      FormulaRecognition:
+        module_name: formula_recognition
+        model_name: PP-FormulaNet_plus-L
+        model_dir: null
+        batch_size: 8

+ 226 - 0
ocr_tools/paddle_common/config/PP-StructureV3-zhch.yaml

@@ -0,0 +1,226 @@
+
+pipeline_name: PP-StructureV3
+
+batch_size: 8
+
+use_doc_preprocessor: True
+use_seal_recognition: True
+use_table_recognition: True
+use_formula_recognition: True
+use_chart_recognition: False
+use_region_detection: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayout_plus-L
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.3  # paragraph_title
+      1: 0.5  # image
+      2: 0.4  # text
+      3: 0.5  # number
+      4: 0.5  # abstract
+      5: 0.5  # content
+      6: 0.5  # figure_table_chart_title
+      7: 0.3  # formula
+      8: 0.5  # table
+      9: 0.5  # reference
+      10: 0.5 # doc_title
+      11: 0.5 # footnote
+      12: 0.5 # header
+      13: 0.5 # algorithm
+      14: 0.5 # footer
+      15: 0.45 # seal
+      16: 0.5 # chart
+      17: 0.5 # formula_number
+      18: 0.5 # aside_text
+      19: 0.5 # reference_content
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "large"  # paragraph_title
+      1: "large"  # image
+      2: "union"  # text
+      3: "union"  # number
+      4: "union"  # abstract
+      5: "union"  # content
+      6: "union"  # figure_table_chart_title
+      7: "large"  # formula
+      8: "union"  # table
+      9: "union"  # reference
+      10: "union" # doc_title
+      11: "union" # footnote
+      12: "union" # header
+      13: "union" # algorithm
+      14: "union" # footer
+      15: "union" # seal
+      16: "large" # chart
+      17: "union" # formula_number
+      18: "union" # aside_text
+      19: "union" # reference_content
+  ChartRecognition:
+    module_name: chart_recognition
+    model_name: PP-Chart2Table
+    model_dir: null
+    batch_size: 1 
+  RegionDetection:
+    module_name: layout_detection
+    model_name: PP-DocBlockLayout
+    model_dir: null
+    layout_nms: True
+    layout_merge_bboxes_mode: "small"
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: False
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    batch_size: 8
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: True
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 1200
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.6
+        unclip_ratio: 1.5
+      TextLineOrientation:
+        module_name: textline_orientation
+        model_name: PP-LCNet_x1_0_textline_ori
+        model_dir: null
+        batch_size: 8
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 8
+        score_thresh: 0.0
+ 
+
+  TableRecognition:
+    pipeline_name: table_recognition_v2
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    use_ocr_model: False
+    SubModules:  
+      TableClassification:
+        module_name: table_classification
+        model_name: PP-LCNet_x1_0_table_cls
+        model_dir: null
+
+      WiredTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANeXt_wired
+        model_dir: null
+      
+      WirelessTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANet_plus
+        model_dir: null
+      
+      WiredTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wired_table_cell_det
+        model_dir: null
+      
+      WirelessTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wireless_table_cell_det
+        model_dir: null
+
+      TableOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+    SubPipelines:
+      GeneralOCR:
+        pipeline_name: OCR
+        text_type: general
+        use_doc_preprocessor: False
+        use_textline_orientation: True
+        SubModules:
+          TextDetection:
+            module_name: text_detection
+            model_name: PP-OCRv5_server_det
+            model_dir: null
+            limit_side_len: 1600
+            limit_type: max
+            max_side_limit: 4000
+            thresh: 0.3
+            box_thresh: 0.4
+            unclip_ratio: 1.5
+          TextLineOrientation:
+            module_name: textline_orientation
+            model_name: PP-LCNet_x1_0_textline_ori
+            model_dir: null
+            batch_size: 8
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+        score_thresh: 0.0
+
+  SealRecognition:
+    pipeline_name: seal_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubPipelines:
+      SealOCR:
+        pipeline_name: OCR
+        batch_size: 8
+        text_type: seal
+        use_doc_preprocessor: False
+        use_textline_orientation: False
+        SubModules:
+          TextDetection:
+            module_name: seal_text_detection
+            model_name: PP-OCRv4_server_seal_det
+            model_dir: null
+            limit_side_len: 736
+            limit_type: min
+            max_side_limit: 4000
+            thresh: 0.2
+            box_thresh: 0.6
+            unclip_ratio: 0.5
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+            score_thresh: 0
+    
+  FormulaRecognition:
+    pipeline_name: formula_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubModules:
+      FormulaRecognition:
+        module_name: formula_recognition
+        model_name: PP-FormulaNet_plus-L
+        model_dir: null
+        batch_size: 8

+ 226 - 0
ocr_tools/paddle_common/config/PP-StructureV3.yaml

@@ -0,0 +1,226 @@
+
+pipeline_name: PP-StructureV3
+
+batch_size: 8
+
+use_doc_preprocessor: True
+use_seal_recognition: True
+use_table_recognition: True
+use_formula_recognition: False
+use_chart_recognition: True
+use_region_detection: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayout_plus-L
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.3  # paragraph_title
+      1: 0.5  # image
+      2: 0.4  # text
+      3: 0.5  # number
+      4: 0.5  # abstract
+      5: 0.5  # content
+      6: 0.5  # figure_table_chart_title
+      7: 0.3  # formula
+      8: 0.5  # table
+      9: 0.5  # reference
+      10: 0.5 # doc_title
+      11: 0.5 # footnote
+      12: 0.5 # header
+      13: 0.5 # algorithm
+      14: 0.5 # footer
+      15: 0.45 # seal
+      16: 0.5 # chart
+      17: 0.5 # formula_number
+      18: 0.5 # aside_text
+      19: 0.5 # reference_content
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "large"  # paragraph_title
+      1: "large"  # image
+      2: "union"  # text
+      3: "union"  # number
+      4: "union"  # abstract
+      5: "union"  # content
+      6: "union"  # figure_table_chart_title
+      7: "large"  # formula
+      8: "union"  # table
+      9: "union"  # reference
+      10: "union" # doc_title
+      11: "union" # footnote
+      12: "union" # header
+      13: "union" # algorithm
+      14: "union" # footer
+      15: "union" # seal
+      16: "large" # chart
+      17: "union" # formula_number
+      18: "union" # aside_text
+      19: "union" # reference_content
+  ChartRecognition:
+    module_name: chart_recognition
+    model_name: PP-Chart2Table
+    model_dir: null
+    batch_size: 1 
+  RegionDetection:
+    module_name: layout_detection
+    model_name: PP-DocBlockLayout
+    model_dir: null
+    layout_nms: True
+    layout_merge_bboxes_mode: "small"
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    batch_size: 8
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: True
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 2560
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.6
+        unclip_ratio: 1.5
+      TextLineOrientation:
+        module_name: textline_orientation
+        model_name: PP-LCNet_x1_0_textline_ori
+        model_dir: null
+        batch_size: 8
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 8
+        score_thresh: 0.0
+ 
+
+  TableRecognition:
+    pipeline_name: table_recognition_v2
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    use_ocr_model: False
+    SubModules:  
+      TableClassification:
+        module_name: table_classification
+        model_name: PP-LCNet_x1_0_table_cls
+        model_dir: null
+
+      WiredTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANeXt_wired
+        model_dir: null
+      
+      WirelessTableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANet_plus
+        model_dir: null
+      
+      WiredTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wired_table_cell_det
+        model_dir: null
+      
+      WirelessTableCellsDetection:
+        module_name: table_cells_detection
+        model_name: RT-DETR-L_wireless_table_cell_det
+        model_dir: null
+
+      TableOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+    SubPipelines:
+      GeneralOCR:
+        pipeline_name: OCR
+        text_type: general
+        use_doc_preprocessor: False
+        use_textline_orientation: True
+        SubModules:
+          TextDetection:
+            module_name: text_detection
+            model_name: PP-OCRv5_server_det
+            model_dir: null
+            limit_side_len: 2560
+            limit_type: max
+            max_side_limit: 4000
+            thresh: 0.3
+            box_thresh: 0.4
+            unclip_ratio: 1.5
+          TextLineOrientation:
+            module_name: textline_orientation
+            model_name: PP-LCNet_x1_0_textline_ori
+            model_dir: null
+            batch_size: 8
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+        score_thresh: 0.0
+
+  SealRecognition:
+    pipeline_name: seal_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubPipelines:
+      SealOCR:
+        pipeline_name: OCR
+        batch_size: 8
+        text_type: seal
+        use_doc_preprocessor: False
+        use_textline_orientation: False
+        SubModules:
+          TextDetection:
+            module_name: seal_text_detection
+            model_name: PP-OCRv4_server_seal_det
+            model_dir: null
+            limit_side_len: 2560
+            limit_type: max
+            max_side_limit: 4000
+            thresh: 0.2
+            box_thresh: 0.6
+            unclip_ratio: 0.5
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv5_server_rec
+            model_dir: null
+            batch_size: 8
+            score_thresh: 0
+    
+  FormulaRecognition:
+    pipeline_name: formula_recognition
+    batch_size: 8
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubModules:
+      FormulaRecognition:
+        module_name: formula_recognition
+        model_name: PP-FormulaNet_plus-L
+        model_dir: null
+        batch_size: 8

+ 98 - 0
ocr_tools/paddle_common/config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml

@@ -0,0 +1,98 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: True
+use_layout_detection: True
+use_chart_recognition: False
+format_block_content: False
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: RT-DETR-H_layout_17cls
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.5 # abstract
+      1: 0.5 # algorithm
+      2: 0.5 # aside_text
+      3: 0.5 # chart
+      4: 0.5 # content
+      5: 0.4 # formula
+      6: 0.4 # doc_title
+      7: 0.5 # figure_title
+      8: 0.5 # footer
+      9: 0.5 # footer
+      10: 0.5 # footnote
+      11: 0.5 # formula_number
+      12: 0.5 # header
+      13: 0.5 # header
+      14: 0.5 # image
+      15: 0.4 # formula
+      16: 0.5 # number
+      17: 0.4 # paragraph_title
+      18: 0.5 # reference
+      19: 0.5 # reference_content
+      20: 0.45 # seal
+      21: 0.5 # table
+      22: 0.4 # text
+      23: 0.4 # text
+      24: 0.5 # vision_footnote
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "union" # abstract
+      1: "union" # algorithm
+      2: "union" # aside_text
+      3: "large" # chart
+      4: "union" # content
+      5: "large" # display_formula
+      6: "large" # doc_title
+      7: "union" # figure_title
+      8: "union" # footer
+      9: "union" # footer
+      10: "union" # footnote
+      11: "union" # formula_number
+      12: "union" # header
+      13: "union" # header
+      14: "union" # image
+      15: "large" # inline_formula
+      16: "union" # number
+      17: "large" # paragraph_title
+      18: "union" # reference
+      19: "union" # reference_content
+      20: "union" # seal
+      21: "union" # table
+      22: "union" # text
+      23: "union" # text
+      24: "union" # vision_footnote
+  VLRecognition:
+    module_name: vl_recognition
+    model_name: PaddleOCR-VL-0.9B
+    model_dir: null
+    batch_size: 2048
+    genai_config:
+      backend: vllm-server
+      server_url: http://10.192.72.11:20016/v1
+
+SubPipelines:
+
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null

+ 98 - 0
ocr_tools/paddle_common/config/PaddleOCR-VL-Client.yaml

@@ -0,0 +1,98 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: True
+use_layout_detection: True
+use_chart_recognition: False
+format_block_content: False
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayoutV2
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.5 # abstract
+      1: 0.5 # algorithm
+      2: 0.5 # aside_text
+      3: 0.5 # chart
+      4: 0.5 # content
+      5: 0.4 # display_formula
+      6: 0.4 # doc_title
+      7: 0.5 # figure_title
+      8: 0.5 # footer
+      9: 0.5 # footer
+      10: 0.5 # footnote
+      11: 0.5 # formula_number
+      12: 0.5 # header
+      13: 0.5 # header
+      14: 0.5 # image
+      15: 0.4 # inline_formula
+      16: 0.5 # number
+      17: 0.4 # paragraph_title
+      18: 0.5 # reference
+      19: 0.5 # reference_content
+      20: 0.45 # seal
+      21: 0.5 # table
+      22: 0.4 # text
+      23: 0.4 # text
+      24: 0.5 # vision_footnote
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "union" # abstract
+      1: "union" # algorithm
+      2: "union" # aside_text
+      3: "large" # chart
+      4: "union" # content
+      5: "large" # display_formula
+      6: "large" # doc_title
+      7: "union" # figure_title
+      8: "union" # footer
+      9: "union" # footer
+      10: "union" # footnote
+      11: "union" # formula_number
+      12: "union" # header
+      13: "union" # header
+      14: "union" # image
+      15: "large" # inline_formula
+      16: "union" # number
+      17: "large" # paragraph_title
+      18: "union" # reference
+      19: "union" # reference_content
+      20: "union" # seal
+      21: "union" # table
+      22: "union" # text
+      23: "union" # text
+      24: "union" # vision_footnote
+  VLRecognition:
+    module_name: vl_recognition
+    model_name: PaddleOCR-VL-0.9B
+    model_dir: null
+    batch_size: 2048
+    genai_config:
+      backend: vllm-server
+      server_url: http://10.192.72.11:20016/v1
+
+SubPipelines:
+
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null

+ 101 - 0
ocr_tools/paddle_common/config/PaddleOCR-VL.yaml

@@ -0,0 +1,101 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: False
+use_layout_detection: True
+use_chart_recognition: False
+format_block_content: False
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayoutV2
+    model_dir: null
+    batch_size: 8
+    threshold: 
+      0: 0.5 # abstract
+      1: 0.5 # algorithm
+      2: 0.5 # aside_text
+      3: 0.5 # chart
+      4: 0.5 # content
+      5: 0.4 # display_formula
+      6: 0.4 # doc_title
+      7: 0.5 # figure_title
+      8: 0.5 # footer
+      9: 0.5 # footer
+      10: 0.5 # footnote
+      11: 0.5 # formula_number
+      12: 0.5 # header
+      13: 0.5 # header
+      14: 0.5 # image
+      15: 0.4 # inline_formula
+      16: 0.5 # number
+      17: 0.4 # paragraph_title
+      18: 0.5 # reference
+      19: 0.5 # reference_content
+      20: 0.45 # seal
+      21: 0.5 # table
+      22: 0.4 # text
+      23: 0.4 # text
+      24: 0.5 # vision_footnote
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "union" # abstract
+      1: "union" # algorithm
+      2: "union" # aside_text
+      3: "large" # chart
+      4: "union" # content
+      5: "large" # display_formula
+      6: "large" # doc_title
+      7: "union" # figure_title
+      8: "union" # footer
+      9: "union" # footer
+      10: "union" # footnote
+      11: "union" # formula_number
+      12: "union" # header
+      13: "union" # header
+      14: "union" # image
+      15: "large" # inline_formula
+      16: "union" # number
+      17: "large" # paragraph_title
+      18: "union" # reference
+      19: "union" # reference_content
+      20: "union" # seal
+      21: "union" # table
+      22: "union" # text
+      23: "union" # text
+      24: "union" # vision_footnote
+  VLRecognition:
+    module_name: vl_recognition
+    model_name: PaddleOCR-VL-0.9B
+    model_dir: null
+    batch_size: 2048
+    genai_config:
+      backend: native
+      gpu-memory-utilization: 0.3
+      max-num-seqs: 64
+      max-model-len: 16384
+      enforce-eager: true
+      disable-cuda-graph: true
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    batch_size: 8
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+        batch_size: 8
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null

+ 89 - 0
ocr_tools/paddle_common/config/README.md

@@ -0,0 +1,89 @@
+# PaddleX Pipeline 配置文件
+
+本目录包含 PaddleX 的 pipeline 配置文件,用于配置不同的文档解析 pipeline。
+
+## 配置文件分类
+
+### PaddleOCR-VL 相关配置
+
+- **PaddleOCR-VL.yaml**: 基础 PaddleOCR-VL pipeline 配置
+- **PaddleOCR-VL-Client.yaml**: PaddleOCR-VL 客户端配置
+- **PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml**: 使用 RT-DETR-H 布局检测模型的 PaddleOCR-VL 配置(17 类布局)
+
+### PP-StructureV3 相关配置
+
+- **PP-StructureV3.yaml**: 基础 PP-StructureV3 pipeline 配置
+- **PP-StructureV3-zhch.yaml**: 自定义的 PP-StructureV3 配置(zhch 版本)
+- **PP-StructureV3-RT-DETR-H_layout_17cls.yaml**: 使用 RT-DETR-H 布局检测模型的 PP-StructureV3 配置(17 类布局)
+
+### 其他配置
+
+- **layout_parsing.yaml**: 布局解析配置
+- **table_recognition_v2.yaml**: 表格识别 V2 配置
+- **table_recognition_v2-zhch.yaml**: 自定义的表格识别 V2 配置(zhch 版本)
+
+## 使用方法
+
+### 在命令行中使用
+
+```bash
+# 使用相对路径(从工具目录运行)
+python main.py --input document.pdf --output_dir ./output \
+  --pipeline ../paddle_common/config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml
+
+# 使用绝对路径
+python main.py --input document.pdf --output_dir ./output \
+  --pipeline /path/to/ocr_platform/ocr_tools/paddle_common/config/PP-StructureV3-zhch.yaml
+```
+
+### 在代码中使用
+
+```python
+from pathlib import Path
+
+# 获取配置文件路径
+config_dir = Path(__file__).parent / "config"
+config_path = config_dir / "PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml"
+
+# 使用配置文件初始化 pipeline(PaddleXProcessor 定义在 paddle_common/processor.py 中,使用前需先导入)
+processor = PaddleXProcessor(
+    pipeline_name=str(config_path),
+    device="gpu:0"
+)
+```
+
+## 配置文件说明
+
+### PaddleOCR-VL vs PP-StructureV3
+
+- **PaddleOCR-VL**: 基于视觉语言模型的文档解析,专注于视觉理解
+- **PP-StructureV3**: 更全面的文档结构分析,包括表格、公式、图表等识别
+
+### RT-DETR-H 布局检测模型
+
+使用 RT-DETR-H 作为布局检测模型(模型名 `RT-DETR-H_layout_17cls`,即 17 类版本)。注意:下方清单共列出 21 个类别名称,与"17 类"不一致,请以所用模型的实际标签为准:
+- abstract, algorithm, aside_text, chart, content, formula
+- doc_title, figure_title, footer, footnote, formula_number
+- header, image, number, paragraph_title, reference
+- reference_content, seal, table, text, vision_footnote
+
+### 自定义配置(zhch 版本)
+
+带有 `-zhch` 后缀的配置文件是自定义版本,可能包含:
+- 调整的阈值参数
+- 优化的模型配置
+- 特定的功能开关设置
+
+## 注意事项
+
+1. **路径引用**:配置文件路径可以是相对路径或绝对路径
+2. **Pipeline 名称**:也可以直接使用 pipeline 名称(如 `PaddleOCR-VL`),无需指定配置文件
+3. **设备配置**:某些配置可能需要特定的设备(GPU/CPU)支持
+4. **模型文件**:确保配置文件中指定的模型文件已正确安装
+
+## 相关工具
+
+- `paddle_vl_tool`: PaddleOCR-VL 批量处理工具
+- `ppstructure_tool`: PP-StructureV3 批量处理工具
+- `paddle_common`: PaddleX 共享核心模块
+

+ 102 - 0
ocr_tools/paddle_common/config/layout_parsing.yaml

@@ -0,0 +1,102 @@
+
+pipeline_name: layout_parsing
+
+use_doc_preprocessor: True
+use_seal_recognition: True
+use_table_recognition: True
+use_formula_recognition: False
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: RT-DETR-H_layout_17cls
+    model_dir: null
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    use_doc_orientation_classify: True
+    use_doc_unwarping: True
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: False
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 960
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.6
+        unclip_ratio: 1.5
+        
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 6
+        score_thresh: 0
+
+  TableRecognition:
+    pipeline_name: table_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    use_ocr_model: False
+    SubModules:
+      TableStructureRecognition:
+        module_name: table_structure_recognition
+        model_name: SLANet_plus
+        model_dir: null
+
+  SealRecognition:
+    pipeline_name: seal_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubPipelines:
+      SealOCR:
+        pipeline_name: OCR
+        text_type: seal
+        use_doc_preprocessor: False
+        use_textline_orientation: False
+        SubModules:
+          TextDetection:
+            module_name: seal_text_detection
+            model_name: PP-OCRv4_server_seal_det
+            model_dir: null
+            limit_side_len: 736
+            limit_type: min
+            max_side_limit: 4000
+            thresh: 0.2
+            box_thresh: 0.6
+            unclip_ratio: 0.5
+          TextRecognition:
+            module_name: text_recognition
+            model_name: PP-OCRv4_server_rec
+            model_dir: null
+            batch_size: 1
+            score_thresh: 0
+    
+  FormulaRecognition:
+    pipeline_name: formula_recognition
+    use_layout_detection: False
+    use_doc_preprocessor: False
+    SubModules:
+      FormulaRecognition:
+        module_name: formula_recognition
+        model_name: PP-FormulaNet-L
+        model_dir: null
+        batch_size: 5

+ 127 - 0
ocr_tools/paddle_common/config/table_recognition_v2-zhch.yaml

@@ -0,0 +1,127 @@
+
+pipeline_name: table_recognition_v2
+
+use_doc_preprocessor: False
+use_layout_detection: True
+use_ocr_model: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayout_plus-L
+    model_dir: null
+    batch_size: 1
+    threshold: 
+      0: 0.3  # paragraph_title
+      1: 0.5  # image
+      2: 0.4  # text
+      3: 0.5  # number
+      4: 0.5  # abstract
+      5: 0.5  # content
+      6: 0.5  # figure_table_chart_title
+      7: 0.3  # formula
+      8: 0.5  # table
+      9: 0.5  # reference
+      10: 0.5 # doc_title
+      11: 0.5 # footnote
+      12: 0.5 # header
+      13: 0.5 # algorithm
+      14: 0.5 # footer
+      15: 0.45 # seal
+      16: 0.5 # chart
+      17: 0.5 # formula_number
+      18: 0.5 # aside_text
+      19: 0.5 # reference_content
+    layout_nms: True
+    layout_unclip_ratio: [1.0, 1.0] 
+    layout_merge_bboxes_mode: 
+      0: "large"  # paragraph_title
+      1: "large"  # image
+      2: "union"  # text
+      3: "union"  # number
+      4: "union"  # abstract
+      5: "union"  # content
+      6: "union"  # figure_table_chart_title
+      7: "large"  # formula
+      8: "union"  # table
+      9: "union"  # reference
+      10: "union" # doc_title
+      11: "union" # footnote
+      12: "union" # header
+      13: "union" # algorithm
+      14: "union" # footer
+      15: "union" # seal
+      16: "large" # chart
+      17: "union" # formula_number
+      18: "union" # aside_text
+      19: "union" # reference_content  
+
+  TableOrientationClassify:
+    module_name: doc_text_orientation
+    model_name: PP-LCNet_x1_0_doc_ori
+    model_dir: null
+  
+  TableClassification:
+    module_name: table_classification
+    model_name: PP-LCNet_x1_0_table_cls
+    model_dir: null
+
+  WiredTableStructureRecognition:
+    module_name: table_structure_recognition
+    model_name: SLANeXt_wired
+    model_dir: null
+  
+  WirelessTableStructureRecognition:
+    module_name: table_structure_recognition
+    model_name: SLANeXt_wireless
+    model_dir: null
+  
+  WiredTableCellsDetection:
+    module_name: table_cells_detection
+    model_name: RT-DETR-L_wired_table_cell_det
+    model_dir: null
+  
+  WirelessTableCellsDetection:
+    module_name: table_cells_detection
+    model_name: RT-DETR-L_wireless_table_cell_det
+    model_dir: null
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    use_doc_orientation_classify: False
+    use_doc_unwarping: False
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: False
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 1600
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.4
+        unclip_ratio: 1.5
+        
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 1
+        score_thresh: 0

+ 86 - 0
ocr_tools/paddle_common/config/table_recognition_v2.yaml

@@ -0,0 +1,86 @@
+
+pipeline_name: table_recognition_v2
+
+use_doc_preprocessor: True
+use_layout_detection: True
+use_ocr_model: True
+
+SubModules:
+  LayoutDetection:
+    module_name: layout_detection
+    model_name: PP-DocLayout-L
+    model_dir: null
+  
+  TableOrientationClassify:
+    module_name: doc_text_orientation
+    model_name: PP-LCNet_x1_0_doc_ori
+    model_dir: null
+  
+  TableClassification:
+    module_name: table_classification
+    model_name: PP-LCNet_x1_0_table_cls
+    model_dir: null
+
+  WiredTableStructureRecognition:
+    module_name: table_structure_recognition
+    model_name: SLANeXt_wired
+    model_dir: null
+  
+  WirelessTableStructureRecognition:
+    module_name: table_structure_recognition
+    model_name: SLANeXt_wireless
+    model_dir: null
+  
+  WiredTableCellsDetection:
+    module_name: table_cells_detection
+    model_name: RT-DETR-L_wired_table_cell_det
+    model_dir: null
+  
+  WirelessTableCellsDetection:
+    module_name: table_cells_detection
+    model_name: RT-DETR-L_wireless_table_cell_det
+    model_dir: null
+
+SubPipelines:
+  DocPreprocessor:
+    pipeline_name: doc_preprocessor
+    use_doc_orientation_classify: True
+    use_doc_unwarping: False
+    SubModules:
+      DocOrientationClassify:
+        module_name: doc_text_orientation
+        model_name: PP-LCNet_x1_0_doc_ori
+        model_dir: null
+
+      DocUnwarping:
+        module_name: image_unwarping
+        model_name: UVDoc
+        model_dir: null
+
+  GeneralOCR:
+    pipeline_name: OCR
+    text_type: general
+    use_doc_preprocessor: False
+    use_textline_orientation: True
+    SubModules:
+      TextDetection:
+        module_name: text_detection
+        model_name: PP-OCRv5_server_det
+        model_dir: null
+        limit_side_len: 1200
+        limit_type: max
+        max_side_limit: 4000
+        thresh: 0.3
+        box_thresh: 0.4
+        unclip_ratio: 1.5
+      TextLineOrientation:
+        module_name: textline_orientation
+        model_name: PP-LCNet_x1_0_textline_ori
+        model_dir: null
+        batch_size: 8
+      TextRecognition:
+        module_name: text_recognition
+        model_name: PP-OCRv5_server_rec
+        model_dir: null
+        batch_size: 8
+        score_thresh: 0.0

+ 269 - 0
ocr_tools/paddle_common/processor.py

@@ -0,0 +1,269 @@
+"""
+PaddleX 统一处理器
+
+支持多种 pipeline(PaddleOCR-VL 和 PP-StructureV3)的文档处理类
+"""
+import os
+import time
+import traceback
+import warnings
+from pathlib import Path
+from typing import List, Dict, Any
+from loguru import logger
+
+# Suppress specific warnings (matched by message prefix, or any UserWarning
+# raised from the paddlex module). Registered before paddlex is imported below.
+warnings.filterwarnings("ignore", message="To copy construct from a tensor")
+warnings.filterwarnings("ignore", message="Setting `pad_token_id`")
+warnings.filterwarnings("ignore", category=UserWarning, module="paddlex")
+
+from paddlex import create_pipeline
+
+# Import helper functions: first put this file's directory at the front of
+# sys.path (only if it is not already listed).
+import sys
+paddle_common_root = Path(__file__).parent
+if str(paddle_common_root) not in sys.path:
+    sys.path.insert(0, str(paddle_common_root))
+
+from .utils import (
+    convert_pruned_result_to_json,
+    save_output_images,
+    save_markdown_content
+)
+
+# 导入适配器
+from .adapters import (
+    apply_table_recognition_adapter,
+    restore_original_function,
+    apply_enhanced_doc_preprocessor,
+    restore_paddlex_doc_preprocessor
+)
+
+
+class PaddleXProcessor:
+    """PaddleX 统一处理器,支持多种 pipeline"""
+    
+    def __init__(self,
+                 pipeline_name: str = "PP-StructureV3",
+                 device: str = "gpu:0",
+                 normalize_numbers: bool = True,
+                 use_enhanced_adapter: bool = True,
+                 log_level: str = "INFO",
+                 **kwargs):
+        """
+        初始化处理器
+        
+        Args:
+            pipeline_name: Pipeline 名称或配置文件路径
+            device: 设备字符串(如 'gpu:0', 'cpu')
+            normalize_numbers: 是否标准化数字
+            use_enhanced_adapter: 是否使用增强适配器
+            log_level: 日志级别(DEBUG, INFO, WARNING, ERROR),当为 DEBUG 时会打印详细错误信息
+            **kwargs: 其他预测参数
+        """
+        self.pipeline_name = pipeline_name
+        self.device = device
+        self.normalize_numbers = normalize_numbers
+        self.use_enhanced_adapter = use_enhanced_adapter
+        self.log_level = log_level
+        self.predict_kwargs = kwargs
+        
+        # 检测 pipeline 类型
+        self.is_paddleocr_vl = 'PaddleOCR-VL'.lower() in str(pipeline_name).lower()
+        
+        # 应用适配器
+        self.adapter_applied = False
+        if use_enhanced_adapter:
+            self.adapter_applied = apply_table_recognition_adapter() and apply_enhanced_doc_preprocessor()
+            if self.adapter_applied:
+                logger.info("🎯 Enhanced table recognition adapter activated and document preprocessor applied")
+            else:
+                logger.warning("⚠️  Failed to apply adapter, using original implementation")
+        
+        # 初始化 pipeline
+        self.pipeline = None
+        self._initialize_pipeline()
+        
+        logger.info(f"PaddleX Processor 初始化完成:")
+        logger.info(f"  - Pipeline: {pipeline_name}")
+        logger.info(f"  - 设备: {device}")
+        logger.info(f"  - Pipeline 类型: {'PaddleOCR-VL' if self.is_paddleocr_vl else 'PP-StructureV3'}")
+        logger.info(f"  - 数字标准化: {normalize_numbers}")
+        logger.info(f"  - 增强适配器: {use_enhanced_adapter}")
+        logger.info(f"  - 日志级别: {log_level}")
+    
+    def _initialize_pipeline(self):
+        """初始化 pipeline"""
+        try:
+            # 设置环境变量以减少警告
+            os.environ['PYTHONWARNINGS'] = 'ignore::UserWarning'
+            
+            logger.info(f"Initializing pipeline '{self.pipeline_name}' on device '{self.device}'...")
+            self.pipeline = create_pipeline(self.pipeline_name, device=self.device)
+            logger.info(f"Pipeline initialized successfully on {self.device}")
+            
+        except Exception as e:
+            logger.error(f"Failed to initialize pipeline: {e}")
+            if self.log_level == "DEBUG":
+                traceback.print_exc()
+            if self.adapter_applied:
+                restore_original_function()
+                restore_paddlex_doc_preprocessor()
+            raise
+    
+    def _get_predict_kwargs(self) -> Dict[str, Any]:
+        """根据 pipeline 类型获取预测参数"""
+        if self.is_paddleocr_vl:
+            # PaddleOCR-VL 使用驼峰命名
+            return {
+                'use_layout_detection': self.predict_kwargs.get('use_layout_detection', True),
+                'use_doc_orientation_classify': self.predict_kwargs.get('use_doc_orientation', True),
+                'use_doc_unwarping': self.predict_kwargs.get('use_doc_unwarping', False),
+            }
+        else:
+            # PP-StructureV3 使用下划线命名
+            return {
+                'use_doc_orientation_classify': self.predict_kwargs.get('use_doc_orientation', True),
+                'use_doc_unwarping': self.predict_kwargs.get('use_doc_unwarping', False),
+                'use_layout_detection': self.predict_kwargs.get('use_layout_detection', True),
+                'use_seal_recognition': self.predict_kwargs.get('use_seal_recognition', True),
+                'use_table_recognition': self.predict_kwargs.get('use_table_recognition', True),
+                'use_formula_recognition': self.predict_kwargs.get('use_formula_recognition', False),
+                'use_chart_recognition': self.predict_kwargs.get('use_chart_recognition', True),
+                'use_ocr_results_with_table_cells': self.predict_kwargs.get('use_ocr_results_with_table_cells', True),
+                'use_table_orientation_classify': self.predict_kwargs.get('use_table_orientation_classify', False),
+                'use_wired_table_cells_trans_to_html': self.predict_kwargs.get('use_wired_table_cells_trans_to_html', True),
+                'use_wireless_table_cells_trans_to_html': self.predict_kwargs.get('use_wireless_table_cells_trans_to_html', True),
+            }
+    
+    def process_single_image(self, image_path: str, output_dir: str) -> Dict[str, Any]:
+        """
+        处理单张图片
+        
+        Args:
+            image_path: 图片路径
+            output_dir: 输出目录
+            
+        Returns:
+            dict: 处理结果,包含 success 字段(基于输出文件存在性判断)
+        """
+        start_time = time.time()
+        image_path_obj = Path(image_path)
+        image_name = image_path_obj.stem
+        
+        # 判断是否为PDF页面(根据文件名模式)
+        is_pdf_page = "_page_" in image_path_obj.name
+        
+        result_info = {
+            "image_path": image_path,
+            "processing_time": 0,
+            "success": False,
+            "device": self.device,
+            "error": None,
+            "output_files": {},
+            "is_pdf_page": is_pdf_page,
+            "processing_info": {}
+        }
+        
+        try:
+            if self.pipeline is None:
+                raise Exception("Pipeline not initialized")
+            
+            # 准备预测参数
+            predict_kwargs = self._get_predict_kwargs()
+            predict_kwargs['input'] = image_path
+            
+            # 使用 pipeline 预测
+            results = self.pipeline.predict(**predict_kwargs)
+            
+            # 处理结果(应该只有一个结果)
+            # 使用迭代方式处理生成器,与原始实现保持一致
+            result = None
+            for idx, res in enumerate(results):
+                if idx > 0:
+                    raise ValueError("Multiple results found for a single image")
+                result = res
+                break  # 只处理第一个结果
+            
+            if result is None:
+                raise Exception("No results returned from pipeline")
+            input_path = Path(result["input_path"])
+            
+            # 生成输出文件名
+            # 使用输入文件名(PaddleX 的 result["input_path"] 可能包含页面信息)
+            output_filename = input_path.stem
+            
+            # 转换并保存标准JSON格式
+            json_content = result.json['res']
+            json_output_path, converted_json = convert_pruned_result_to_json(
+                json_content,
+                str(input_path),
+                output_dir,
+                output_filename,
+                normalize_numbers=self.normalize_numbers
+            )
+            
+            # 保存输出图像
+            img_content = result.img
+            saved_images = save_output_images(img_content, str(output_dir), output_filename)
+            
+            # 保存Markdown内容
+            markdown_content = result.markdown
+            md_output_path = save_markdown_content(
+                markdown_content,
+                output_dir,
+                output_filename,
+                normalize_numbers=self.normalize_numbers,
+                key_text='markdown_texts',
+                key_images='markdown_images',
+                json_data=converted_json
+            )
+            
+            # 根据实际保存的文件路径判断成功(成功判断标准:.md 和 .json 文件都存在)
+            # 使用实际保存的文件路径
+            actual_md_path = Path(md_output_path) if md_output_path else Path(output_dir) / f"{output_filename}.md"
+            actual_json_path = Path(json_output_path) if json_output_path else Path(output_dir) / f"{output_filename}.json"
+            
+            if actual_md_path.exists() and actual_json_path.exists():
+                result_info.update({
+                    "success": True,
+                    "output_files": {
+                        "md": str(actual_md_path),
+                        "json": str(actual_json_path),
+                        **saved_images
+                    },
+                    "processing_info": converted_json.get('processing_info', {})
+                })
+                logger.info(f"✅ 处理成功: {image_name}")
+            else:
+                # 文件不存在,标记为失败
+                missing_files = []
+                if not actual_md_path.exists():
+                    missing_files.append("md")
+                if not actual_json_path.exists():
+                    missing_files.append("json")
+                result_info["error"] = f"输出文件不存在: {', '.join(missing_files)}"
+                result_info["success"] = False
+                logger.error(f"❌ 处理失败: {image_name} - {result_info['error']}")
+                
+        except Exception as e:
+            result_info["error"] = str(e)
+            result_info["success"] = False
+            logger.error(f"Error processing {image_name}: {e}")
+            if self.log_level == "DEBUG":
+                traceback.print_exc()
+        
+        finally:
+            result_info["processing_time"] = time.time() - start_time
+        
+        return result_info
+    
+    def __del__(self):
+        """清理资源"""
+        if self.adapter_applied:
+            try:
+                restore_original_function()
+                restore_paddlex_doc_preprocessor()
+                logger.info("🔄 Original function restored")
+            except Exception as e:
+                logger.warning(f"Failed to restore original function: {e}")
+

+ 333 - 0
ocr_tools/paddle_common/utils.py

@@ -0,0 +1,333 @@
+"""PaddleX 公共工具函数"""
+import json
+import traceback
+import warnings
+import base64
+from pathlib import Path
+from PIL import Image
+from typing import List, Dict, Any, Union
+import numpy as np
+
+# Import ocr_utils: add the directory two levels above this file's directory
+# to the front of sys.path (only if it is not already listed).
+# NOTE(review): presumably that directory contains the ocr_utils package - confirm.
+import sys
+ocr_platform_root = Path(__file__).parents[2]
+if str(ocr_platform_root) not in sys.path:
+    sys.path.insert(0, str(ocr_platform_root))
+
+from ocr_utils import (
+    normalize_markdown_table,
+    normalize_financial_numbers
+)
+
+# 注意:load_images_from_pdf 不再需要,因为 PDF 转图片由 ocr_utils.get_input_files() 统一处理
+
+
+def convert_pruned_result_to_json(pruned_result: Dict[str, Any], 
+                              input_image_path: str, 
+                              output_dir: str, 
+                              filename: str,
+                              normalize_numbers: bool = True) -> tuple[str, Dict[str, Any]]:
+    """
+    将API返回结果转换为标准JSON格式,并支持数字标准化
+    """
+    if not pruned_result:
+        return "", {}
+    
+    # 构造标准格式的JSON
+    converted_json = {
+        "input_path": input_image_path,
+        "page_index": None,
+        "model_settings": pruned_result.get('model_settings', {}),
+        "parsing_res_list": pruned_result.get('parsing_res_list', []),
+        "doc_preprocessor_res": {
+            "input_path": None,
+            "page_index": None,
+            "model_settings": pruned_result.get('doc_preprocessor_res', {}).get('model_settings', {}),
+            "angle": pruned_result.get('doc_preprocessor_res', {}).get('angle', 0)
+        },
+        "layout_det_res": {
+            "input_path": None,
+            "page_index": None,
+            "boxes": pruned_result.get('layout_det_res', {}).get('boxes', [])
+        },
+        "overall_ocr_res": {
+            "input_path": None,
+            "page_index": None,
+            "model_settings": pruned_result.get('overall_ocr_res', {}).get('model_settings', {}),
+            "dt_polys": pruned_result.get('overall_ocr_res', {}).get('dt_polys', []),
+            "text_det_params": pruned_result.get('overall_ocr_res', {}).get('text_det_params', {}),
+            "text_type": pruned_result.get('overall_ocr_res', {}).get('text_type', 'general'),
+            "textline_orientation_angles": pruned_result.get('overall_ocr_res', {}).get('textline_orientation_angles', []),
+            "text_rec_score_thresh": pruned_result.get('overall_ocr_res', {}).get('text_rec_score_thresh', 0.0),
+            "return_word_box": pruned_result.get('overall_ocr_res', {}).get('return_word_box', False),
+            "rec_texts": pruned_result.get('overall_ocr_res', {}).get('rec_texts', []),
+            "rec_scores": pruned_result.get('overall_ocr_res', {}).get('rec_scores', []),
+            "rec_polys": pruned_result.get('overall_ocr_res', {}).get('rec_polys', []),
+            "rec_boxes": pruned_result.get('overall_ocr_res', {}).get('rec_boxes', [])
+        },
+        "table_res_list": pruned_result.get('table_res_list', [])
+    }
+    
+    # 数字标准化处理
+    original_json = converted_json.copy()
+    changes_count = 0
+    
+    if normalize_numbers:
+        # 1. 标准化 parsing_res_list 中的文本内容
+        for item in converted_json.get('parsing_res_list', []):
+            if 'block_content' in item:
+                original_content = item['block_content']
+                normalized_content = original_content
+                # 根据block_label类型选择标准化方法
+                if item.get('block_label') == 'table':
+                    normalized_content = normalize_markdown_table(original_content)
+                
+                if original_content != normalized_content:
+                    item['block_content'] = normalized_content
+                    changes_count += len([1 for o, n in zip(original_content, normalized_content) if o != n])
+        
+        # 2. 标准化 table_res_list 中的HTML表格
+        for table_item in converted_json.get('table_res_list', []):
+            if 'pred_html' in table_item:
+                original_html = table_item['pred_html']
+                normalized_html = normalize_markdown_table(original_html)
+                
+                if original_html != normalized_html:
+                    table_item['pred_html'] = normalized_html
+                    changes_count += len([1 for o, n in zip(original_html, normalized_html) if o != n])
+
+        # 统计表格数量
+        parsing_res_tables_count = 0
+        table_res_list_count = 0
+        if 'parsing_res_list' in converted_json:
+            parsing_res_tables_count = len([item for item in converted_json['parsing_res_list'] 
+                                          if 'block_label' in item and item['block_label'] == 'table'])
+        if 'table_res_list' in converted_json:
+            table_res_list_count = len(converted_json["table_res_list"])
+        table_consistency_fixed = False
+        if parsing_res_tables_count != table_res_list_count:
+            warnings.warn(f"⚠️ Warning: {filename} Table count mismatch - parsing_res_list has {parsing_res_tables_count} tables, "
+                          f"but table_res_list has {table_res_list_count} tables.")
+            table_consistency_fixed = True
+        
+        # 添加标准化处理信息
+        converted_json['processing_info'] = {
+            "normalize_numbers": normalize_numbers,
+            "changes_applied": changes_count > 0,
+            "character_changes_count": changes_count,
+            "parsing_res_tables_count": parsing_res_tables_count,
+            "table_res_list_count": table_res_list_count,
+            "table_consistency_fixed": table_consistency_fixed
+        }
+    else:
+        converted_json['processing_info'] = {
+            "normalize_numbers": False,
+            "changes_applied": False,
+            "character_changes_count": 0
+        }
+    
+    # 保存JSON文件
+    output_path = Path(output_dir).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    json_file_path = output_path / f"{filename}.json"
+    with open(json_file_path, 'w', encoding='utf-8') as f:
+        json.dump(converted_json, f, ensure_ascii=False, indent=2)
+    
+    # 如果启用了标准化且有变化,保存原始版本用于对比
+    if normalize_numbers and changes_count > 0:
+        original_output_path = output_path / f"{filename}_original.json"
+        with open(original_output_path, 'w', encoding='utf-8') as f:
+            json.dump(original_json, f, ensure_ascii=False, indent=2)
+    
+    return str(output_path), converted_json
+
+def save_image(image: Union[Image.Image, str, np.ndarray], output_path: str) -> str:
+    """
+    保存单个图像到指定路径
+
+    Args:
+        image: 要保存的图像,可以是PIL Image对象、base64字符串或numpy数组
+        output_path: 输出文件路径
+
+    Returns:
+        保存的图像文件路径
+    """
+    try:
+        if isinstance(image, Image.Image):
+            image.save(output_path)
+        elif isinstance(image, str):
+            # 处理base64字符串
+            img_data = base64.b64decode(image)
+            with open(output_path, 'wb') as f:
+                f.write(img_data)
+        elif isinstance(image, np.ndarray):
+            # 处理numpy数组
+            pil_image = Image.fromarray(image)
+            pil_image.save(output_path)
+        else:
+            raise ValueError(f"Unsupported image type: {type(image)}")
+
+        return str(output_path)
+
+    except Exception as e:
+        print(f"❌ Error saving image {output_path}: {e}")
+        return ""
+
+def save_output_images(output_images: Dict[str, Any], output_dir: str, output_filename: str) -> Dict[str, str]:
+    """
+    保存API返回的输出图像
+    
+    Args:
+        output_images: 图像数组字典或PIL Image对象字典
+        output_dir: 输出目录
+        output_filename: 输出文件名前缀
+        
+    Returns:
+        保存的图像文件路径字典
+    """
+    if not output_images:
+        return {}
+    
+    output_path = Path(output_dir).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    saved_images = {}
+    
+    for img_name, img_data in output_images.items():
+        try:
+            # 生成文件名
+            img_filename = f"{output_filename}_{img_name}.jpg"
+            img_path = output_path / img_filename
+            save_image(img_data, str(img_path))
+            saved_images[img_name] = str(img_path)
+            
+        except Exception as e:
+            print(f"❌ Error saving image {img_name}: {e}")
+            print(f"   Image data type: {type(img_data)}")
+            if hasattr(img_data, 'shape'):
+                print(f"   Image shape: {img_data.shape}")
+            traceback.print_exc()
+    
+    return saved_images
+
+def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
+                         filename: str, normalize_numbers: bool = True,
+                         key_text: str = 'text', key_images: str = 'images',
+                         json_data: Dict[str, Any] = None) -> str:
+    """
+    Save Markdown text, and the images it references, under ``output_dir``.
+
+    When ``json_data`` is provided the work is delegated to
+    ``save_markdown_content_enhanced`` and ``markdown_data`` is ignored.
+    Otherwise the text stored under ``key_text`` is written to
+    ``<filename>.md``. If ``normalize_numbers`` is set, the text is first
+    passed through ``normalize_markdown_table``; when that changes the text,
+    the untouched version is also written to ``<filename>_original.md`` so
+    the two can be compared.
+
+    Args:
+        markdown_data: Mapping holding the Markdown text and an image mapping.
+        output_dir: Directory to write into; created if it does not exist.
+        filename: Output file name without the ``.md`` extension.
+        normalize_numbers: Whether to run table/number normalization.
+        key_text: Key of the Markdown text inside ``markdown_data``.
+        key_images: Key of the ``{relative_path: image}`` mapping.
+        json_data: Optional structured result; takes precedence when truthy.
+
+    Returns:
+        Path of the written Markdown file as a string, or ``""`` when both
+        ``markdown_data`` and ``json_data`` are empty.
+    """
+    if not markdown_data and not json_data:
+        return ""
+
+    output_path = Path(output_dir).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Prefer json_data: it is used to build the complete document.
+    if json_data:
+        return save_markdown_content_enhanced(json_data, str(output_path), filename, normalize_numbers)
+
+    # `or ''` also covers a key that is present but holds None, which would
+    # otherwise crash f.write() below.
+    markdown_text = markdown_data.get(key_text) or ''
+    original_markdown_text = markdown_text
+
+    if normalize_numbers and markdown_text:
+        markdown_text = normalize_markdown_table(markdown_text)
+
+    # Compare whole strings. A character-wise zip() stops at the shorter
+    # string, so a normalization that only appends or truncates text would
+    # go undetected.
+    text_changed = markdown_text != original_markdown_text
+
+    md_file_path = output_path / f"{filename}.md"
+    with open(md_file_path, 'w', encoding='utf-8') as f:
+        f.write(markdown_text)
+
+    # Keep the pre-normalization version next to the result for comparison.
+    # text_changed can only be True when normalization actually ran.
+    if text_changed:
+        original_output_path = output_path / f"{filename}_original.md"
+        with open(original_output_path, 'w', encoding='utf-8') as f:
+            f.write(original_markdown_text)
+
+    # Save the images referenced by the Markdown; keys are paths relative to
+    # output_dir. `or {}` tolerates a key that is present but holds None.
+    markdown_images = markdown_data.get(key_images) or {}
+    for img_path, img_data in markdown_images.items():
+        try:
+            full_img_path = output_path / img_path
+            full_img_path.parent.mkdir(parents=True, exist_ok=True)
+            save_image(img_data, str(full_img_path))
+        except Exception as e:
+            # Best effort: one bad image must not abort the whole export.
+            print(f"❌ Error saving Markdown image {img_path}: {e}")
+
+    return str(md_file_path)
+
+def save_markdown_content_enhanced(json_data: Dict[str, Any], output_dir: str,
+                         filename: str, normalize_numbers: bool = True) -> str:
+    """
+    Build and save Markdown from ``parsing_res_list`` and ``table_res_list``.
+
+    Blocks from ``json_data['parsing_res_list']`` are emitted in order. Each
+    block labelled ``'table'`` is paired, by position, with the next entry of
+    ``json_data['table_res_list']`` and rendered from that entry's
+    ``pred_html``; if that HTML is missing or empty, the block's own
+    ``block_content`` is used instead. Entries of ``table_res_list`` left
+    over after all blocks are consumed are appended at the end. Tables are
+    wrapped in a centering ``<div>``.
+
+    Args:
+        json_data: Structured result holding the two lists described above.
+        output_dir: Directory to write into; created if it does not exist.
+        filename: Output file name without the ``.md`` extension.
+        normalize_numbers: When True, tables go through
+            ``normalize_markdown_table`` and other blocks through
+            ``normalize_financial_numbers``.
+
+    Returns:
+        Path of the written Markdown file as a string, or ``""`` when
+        ``json_data`` is empty.
+    """
+    if not json_data:
+        return ""
+
+    output_path = Path(output_dir).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    def _centered_table(html: str) -> str:
+        # Optionally normalize the table, then wrap it in a centering div.
+        if normalize_numbers:
+            html = normalize_markdown_table(html)
+        return f'<div style="text-align: center;">{html}</div>'
+
+    # `or []` also tolerates keys that are present but hold None.
+    parsing_res_list = json_data.get('parsing_res_list') or []
+    table_res_list = json_data.get('table_res_list') or []
+
+    markdown_content = []
+    table_index = 0         # next unused entry of table_res_list
+    parsed_table_count = 0  # tables seen in parsing_res_list (for the log)
+
+    for item in parsing_res_list:
+        # A None block_content would otherwise break the final join().
+        block_content = item.get('block_content') or ''
+
+        if item.get('block_label', '') == 'table':
+            parsed_table_count += 1
+            if table_index < len(table_res_list):
+                # Prefer the detailed HTML, but fall back to the block's own
+                # content when pred_html is absent, None or empty.
+                table_html = table_res_list[table_index].get('pred_html') or block_content
+                table_index += 1
+            else:
+                # No matching table_res_list entry: use the block as-is.
+                table_html = block_content
+            markdown_content.append(_centered_table(table_html))
+        else:
+            # Non-table content is added directly.
+            if normalize_numbers:
+                block_content = normalize_financial_numbers(block_content)
+            markdown_content.append(block_content)
+
+    # table_res_list may hold more tables than parsing_res_list reported;
+    # append those, and count only the ones actually written.
+    remaining_added = 0
+    for table_item in table_res_list[table_index:]:
+        table_html = table_item.get('pred_html') or ''
+        if table_html:
+            markdown_content.append(_centered_table(table_html))
+            remaining_added += 1
+
+    final_markdown = '\n\n'.join(markdown_content)
+
+    md_file_path = output_path / f"{filename}.md"
+    with open(md_file_path, 'w', encoding='utf-8') as f:
+        f.write(final_markdown)
+
+    print(f"📄 Enhanced Markdown saved: {md_file_path}")
+    print(f"   - parsing_res_list tables: {parsed_table_count}")
+    print(f"   - table_res_list tables: {len(table_res_list)}")
+    print(f"   - remaining tables added: {remaining_added}")
+
+    return str(md_file_path)
+