Просмотр исходного кода

feat: Add .gitignore, implement grid recovery syntax verification, and enhance HuggingFace model loading with local cache prioritization.

zhch158_admin 3 дня назад
Родитель
Commit
76f8e864a8
1 изменённый файл с 91 добавлением и 3 удалениями
  1. 91 3
      ocr_tools/universal_doc_parser/models/adapters/docling_layout_adapter.py

+ 91 - 3
ocr_tools/universal_doc_parser/models/adapters/docling_layout_adapter.py

@@ -15,6 +15,7 @@
 import cv2
 import cv2
 import numpy as np
 import numpy as np
 import threading
 import threading
+import os
 from pathlib import Path
 from pathlib import Path
 from typing import Dict, List, Union, Any, Optional
 from typing import Dict, List, Union, Any, Optional
 from PIL import Image
 from PIL import Image
@@ -127,9 +128,96 @@ class DoclingLayoutDetector(BaseLayoutDetector):
                 self._model_path = str(model_path)
                 self._model_path = str(model_path)
                 print(f"📂 Loading model from local path: {self._model_path}")
                 print(f"📂 Loading model from local path: {self._model_path}")
             else:
             else:
-                # 从 HuggingFace 下载
-                print(f"📥 Downloading model from HuggingFace: {model_dir}")
-                self._model_path = snapshot_download(repo_id=model_dir)
+                # HuggingFace 仓库 ID,先检查本地缓存
+                # 获取 HuggingFace 缓存目录
+                hf_home = os.environ.get('HF_HOME', None)
+                if hf_home:
+                    cache_dir = Path(hf_home) / "hub"
+                else:
+                    cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+                
+                # 将模型 ID 转换为缓存目录格式
+                # 例如: ds4sd/docling-layout-old -> models--ds4sd--docling-layout-old
+                repo_id_escaped = model_dir.replace("/", "--")
+                model_cache_dir = cache_dir / f"models--{repo_id_escaped}"
+                
+                # 先尝试从本地缓存加载(避免不必要的网络请求)
+                local_model_path = None
+                if model_cache_dir.exists() and model_cache_dir.is_dir():
+                    snapshots_dir = model_cache_dir / "snapshots"
+                    if snapshots_dir.exists():
+                        # 获取所有 snapshot 目录,按修改时间排序
+                        snapshots = sorted(
+                            [d for d in snapshots_dir.iterdir() if d.is_dir()],
+                            key=lambda x: x.stat().st_mtime,
+                            reverse=True
+                        )
+                        if snapshots:
+                            # 检查最新的 snapshot 是否完整
+                            latest_snapshot = snapshots[0]
+                            processor_config = latest_snapshot / "preprocessor_config.json"
+                            model_config = latest_snapshot / "config.json"
+                            safetensors_file = latest_snapshot / "model.safetensors"
+                            
+                            if processor_config.exists() and model_config.exists() and safetensors_file.exists():
+                                local_model_path = latest_snapshot
+                
+                if local_model_path:
+                    # 本地缓存存在且完整,直接使用(不进行网络请求)
+                    self._model_path = str(local_model_path)
+                    print(f"📂 Using local cached model: {self._model_path}")
+                    print(f"   (Skipping network check - model already cached)")
+                else:
+                    # 本地缓存不存在或不完整,尝试从 HuggingFace 下载或更新
+                    print(f"📥 Model not found in local cache, downloading from HuggingFace: {model_dir}")
+                    try:
+                        # snapshot_download 会自动检查本地缓存,如果存在且是最新的,不会重新下载
+                        # 只有在需要更新或首次下载时才会下载
+                        self._model_path = snapshot_download(repo_id=model_dir)
+                        print(f"✅ Model downloaded/updated: {self._model_path}")
+                    except Exception as e:
+                        # HuggingFace 访问失败,再次尝试查找本地缓存(可能之前检查时遗漏)
+                        print(f"⚠️ Failed to download from HuggingFace: {e}")
+                        print(f"🔍 Trying to find local cached model again...")
+                        
+                        if model_cache_dir.exists() and model_cache_dir.is_dir():
+                            snapshots_dir = model_cache_dir / "snapshots"
+                            if snapshots_dir.exists():
+                                snapshots = sorted(
+                                    [d for d in snapshots_dir.iterdir() if d.is_dir()],
+                                    key=lambda x: x.stat().st_mtime,
+                                    reverse=True
+                                )
+                                if snapshots:
+                                    local_model_path = snapshots[0]
+                                    processor_config = local_model_path / "preprocessor_config.json"
+                                    model_config = local_model_path / "config.json"
+                                    safetensors_file = local_model_path / "model.safetensors"
+                                    
+                                    if processor_config.exists() and model_config.exists() and safetensors_file.exists():
+                                        self._model_path = str(local_model_path)
+                                        print(f"✅ Found local cached model: {self._model_path}")
+                                    else:
+                                        raise FileNotFoundError(
+                                            f"Local cached model found but missing required files in {local_model_path}. "
+                                            f"Required: preprocessor_config.json, config.json, model.safetensors"
+                                        )
+                                else:
+                                    raise FileNotFoundError(
+                                        f"No snapshots found in {snapshots_dir}. "
+                                        f"Please download the model first or check your network connection."
+                                    )
+                            else:
+                                raise FileNotFoundError(
+                                    f"Cache directory exists but no snapshots found: {model_cache_dir}. "
+                                    f"Please download the model first or check your network connection."
+                                )
+                        else:
+                            raise FileNotFoundError(
+                                f"Model not found in local cache: {model_cache_dir}. "
+                                f"Please download the model first or check your network connection. "
+                                f"Original error: {e}"
+                            )
             
             
             # 检查必要文件
             # 检查必要文件
             processor_config = Path(self._model_path) / "preprocessor_config.json"
             processor_config = Path(self._model_path) / "preprocessor_config.json"