import sys
from pathlib import Path
from typing import Dict, Any, List, Union, Optional

import numpy as np
from PIL import Image
from loguru import logger

# Base recognizer this adapter extends.
from .mineru_adapter import MinerUVLRecognizer

# Client from mineru-vl-utils. The import is optional at module load time;
# its absence is only fatal once initialize() is called.
try:
    from mineru_vl_utils import MinerUClient
    MINERU_VL_UTILS_AVAILABLE = True
except ImportError as e:
    logger.warning(f"mineru-vl-utils not available: {e}")
    MINERU_VL_UTILS_AVAILABLE = False


# Backend assumed when the config does not name one. Shared by __init__ and
# initialize() so that both agree on what an omitted 'backend' key means.
_DEFAULT_BACKEND = 'http-client'


class PaddleVLRecognizer(MinerUVLRecognizer):
    """
    PaddleOCR-VL recognition adapter built on top of MinerUVLRecognizer.

    Differences from the base class:
    1. The model name is forced to 'PaddleOCR-VL-0.9B'.
    2. Only the 'http-client' backend is accepted (the original author's note
       says this is meant to talk to a vllm-server deployment).
    3. Everything else is inherited from MinerUVLRecognizer.
    """

    def __init__(self, config: Dict[str, Any]) -> None:
        """Force the PaddleOCR-VL model name and delegate to the base class.

        Args:
            config: Recognizer configuration. Keys read by this class are
                'backend', 'server_url' and 'model_params'. NOTE: the
                'model_name' entry is overwritten in place on the dict that
                the caller passed in.
        """
        # Always use the PaddleOCR-VL model, regardless of what was configured.
        config['model_name'] = 'PaddleOCR-VL-0.9B'

        # Report an unsupported backend early. An omitted backend is resolved
        # with the same default initialize() uses, so a config that simply
        # leaves it out no longer triggers a spurious error log.
        backend = config.get('backend', _DEFAULT_BACKEND)
        if backend != 'http-client':
            logger.error(
                f"Backend '{backend}' may not be optimal for PaddleOCR-VL. "
                f"must: 'http-client'"
            )

        # Base-class initialisation.
        super().__init__(config)

    def initialize(self) -> None:
        """Create the MinerUClient used for recognition.

        Reads 'backend', 'server_url' and 'model_params' from ``self.config``
        (presumably populated by the base-class constructor - not visible
        here) and stores the client on ``self.vlm_model``.

        Raises:
            ImportError: If mineru-vl-utils could not be imported.
            ValueError: If the configured backend is not 'http-client'.
            Exception: Any error raised while constructing the client is
                logged and re-raised unchanged.
        """
        if not MINERU_VL_UTILS_AVAILABLE:
            raise ImportError("mineru-vl-utils is required for PaddleVLRecognizer")

        try:
            backend = self.config.get('backend', _DEFAULT_BACKEND)
            server_url = self.config.get('server_url')
            # `or {}` also covers an explicit null (e.g. an empty YAML key),
            # which would otherwise fail on the .get() calls below.
            model_params = self.config.get('model_params') or {}

            # Parameters forwarded to MinerUClient, with defaults when unset.
            max_concurrency = model_params.get('max_concurrency', 100)
            http_timeout = model_params.get('http_timeout', 600)

            # PaddleOCR-VL specific prompts (overridable via model_params).
            prompts = model_params.get('prompts', {
                "table": "\nTable Recognition:",
                "equation": "\nFormula Recognition:",
                "[default]": "\nText Recognition:",
                "[layout]": "\nLayout Detection:",
            })

            logger.info(f"Initializing PaddleOCR-VL with backend: {backend}")
            logger.info(f"Server URL: {server_url}")
            logger.info(f"Max concurrency: {max_concurrency}")

            # Only the HTTP client mode is supported. The check stays inside
            # the try block so the failure is logged like any other error.
            if backend == 'http-client':
                self.vlm_model = MinerUClient(
                    backend=backend,
                    model_name=self.config['model_name'],
                    server_url=server_url,
                    prompts=prompts,
                    max_concurrency=max_concurrency,
                    http_timeout=http_timeout,
                    use_tqdm=False,  # adjust as needed
                )
            else:
                raise ValueError(f"Unsupported backend for PaddleOCR-VL: {backend}")

            logger.success(f"✅ PaddleOCR-VL recognizer initialized: {backend}")

        except Exception as e:
            logger.error(f"❌ Failed to initialize PaddleOCR-VL recognizer: {e}")
            raise

    # According to the original author's note, the following methods are
    # inherited from MinerUVLRecognizer and need no override:
    # - cleanup()
    # - _preprocess_image()
    # - recognize_table()
    # - recognize_formula()
    # - recognize_text()
    # - batch_recognize_table()
    # - batch_recognize_formula()
    # - _clean_latex()
    # - _html_to_markdown()
    # - _extract_cells_from_html()


# Exported adapter class.
__all__ = ['PaddleVLRecognizer']