瀏覽代碼

feat: 新增PaddleVL识别器,支持PaddleOCR-VL-0.9B模型和http-client后端

zhch158_admin 2 周之前
父節點
當前提交
1627cea010
共有 1 個文件被更改,包括 107 次插入和 0 次删除
  1. 107 0
      zhch/universal_doc_parser/models/adapters/paddle_vl_adapter.py

+ 107 - 0
zhch/universal_doc_parser/models/adapters/paddle_vl_adapter.py

@@ -0,0 +1,107 @@
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Union, Optional
+import numpy as np
+from PIL import Image
+from loguru import logger
+
+# 导入基类
+from .mineru_adapter import MinerUVLRecognizer
+
+# Import the client from mineru-vl-utils.
+# The import is guarded: if the package is missing, only a warning is logged
+# here and the MINERU_VL_UTILS_AVAILABLE flag records the outcome, so that
+# PaddleVLRecognizer.initialize() can raise ImportError when it is actually used.
+try:
+    from mineru_vl_utils import MinerUClient
+    MINERU_VL_UTILS_AVAILABLE = True
+except ImportError as e:
+    logger.warning(f"mineru-vl-utils not available: {e}")
+    MINERU_VL_UTILS_AVAILABLE = False
+
+
+class PaddleVLRecognizer(MinerUVLRecognizer):
+    """
+    PaddleOCR-VL识别适配器,继承自MinerUVLRecognizer
+    
+    主要差异:
+    1. 强制使用 PaddleOCR-VL-0.9B 模型
+    2. 确保使用 vllm-server 后端
+    3. 复用所有MinerU的预处理/后处理逻辑
+    """
+    
+    def __init__(self, config: Dict[str, Any]):
+        # 🔧 强制设置 PaddleOCR-VL 模型名称
+        config['model_name'] = 'PaddleOCR-VL-0.9B'
+        
+        # 🔧 确保使用正确的后端配置
+        if config.get('backend') not in ['http-client']:
+            logger.error(
+                f"Backend '{config.get('backend')}' may not be optimal for PaddleOCR-VL. "
+                f"must: 'http-client'"
+            )
+        
+        # 调用父类初始化
+        super().__init__(config)
+        
+    def initialize(self):
+        """初始化VL模型 - 使用MinerU的客户端"""
+        if not MINERU_VL_UTILS_AVAILABLE:
+            raise ImportError("mineru-vl-utils is required for PaddleVLRecognizer")
+            
+        try:
+            backend = self.config.get('backend', 'http-client')
+            server_url = self.config.get('server_url')
+            model_params = self.config.get('model_params', {})
+            
+            # 🔧 提取 MinerUClient 所需的参数
+            # 从 model_params 中获取,如果没有则使用默认值
+            max_concurrency = model_params.get('max_concurrency', 100)
+            http_timeout = model_params.get('http_timeout', 600)
+            
+            # 🔧 PaddleOCR-VL 特定的提示词(可选)
+            prompts = model_params.get('prompts', {
+                "table": "\nTable Recognition:",
+                "equation": "\nFormula Recognition:",
+                "[default]": "\nText Recognition:",
+                "[layout]": "\nLayout Detection:",
+            })
+            
+            # 🔧 初始化 MinerUClient
+            logger.info(f"Initializing PaddleOCR-VL with backend: {backend}")
+            logger.info(f"Server URL: {server_url}")
+            logger.info(f"Max concurrency: {max_concurrency}")
+            
+            # 根据后端类型调整参数
+            if backend == 'http-client':
+                # HTTP客户端模式
+                self.vlm_model = MinerUClient(
+                    backend=backend,
+                    model_name=self.config['model_name'],
+                    server_url=server_url,
+                    prompts=prompts,
+                    max_concurrency=max_concurrency,
+                    http_timeout=http_timeout,
+                    use_tqdm=False,  # 可根据需要调整
+                )
+            else:
+                raise ValueError(f"Unsupported backend for PaddleOCR-VL: {backend}")
+            
+            logger.success(f"✅ PaddleOCR-VL recognizer initialized: {backend}")
+            
+        except Exception as e:
+            logger.error(f"❌ Failed to initialize PaddleOCR-VL recognizer: {e}")
+            raise
+    
+    # 以下方法都继承自 MinerUVLRecognizer,无需重写:
+    # - cleanup()
+    # - _preprocess_image()
+    # - recognize_table()
+    # - recognize_formula()
+    # - recognize_text()
+    # - batch_recognize_table()
+    # - batch_recognize_formula()
+    # - _clean_latex()
+    # - _html_to_markdown()
+    # - _extract_cells_from_html()
+
+
+# Export the adapter class as this module's public API.
+__all__ = ['PaddleVLRecognizer']