1 месяц назад · c816ff91ca
--- a/ocr_tools/model_doctor/README.md
+++ b/ocr_tools/model_doctor/README.md
@@ -0,0 +1,93 @@
 
															+# model_doctor —— 模型变更巡检工具
														
 
															+
														
 
															+`ocr_platform` 用到的模型很多、来源各异且在不断升级。本工具用「**清单 → 指纹 → 与基线比对**」
														
 
															+的方式，帮你一眼看出**哪些模型变了 / 缺失了 / 服务不可达 / HF 远端有更新**，避免悄无声息的版本漂移
														
 
															+（例如 daemon 后面把 PaddleOCR-VL 从 1.5 换成 1.6）。
														
 
															+
														
 
															+## 目录结构
														
 
															+
														
 
															+| 文件 | 说明 |
														
 
															+|---|---|
														
 
															+| `model_registry.yaml` | **手工维护**的模型清单，覆盖四类来源 |
														
 
															+| `model_doctor.py` | 巡检 CLI（采集指纹 / 比对 / 报告） |
														
 
															+| `models.lock.json` | 指纹基线（由 `update-lock` 生成，建议纳入 git） |
														
 
															+
														
 
															+## 四类模型来源（kind）
														
 
															+
														
 
															+| kind | 含义 | 指纹内容 | 「变化」如何被发现 |
														
 
															+|---|---|---|---|
														
 
															+| `hf` | HuggingFace 仓库（自动下载，缓存于 `defaults.hf_hub_dir`） | 本地快照 `local_revision`；`--online` 时附 `remote_revision` | 本地 revision 变化；远端 commit 与本地不同（`--online`） |
														
 
															+| `local_file` | 本地单个权重文件或目录 | `size`+`mtime`（`--hash` 加快速 sha256） | 文件被替换、大小/时间变化、缺失 |
														
 
															+| `daemon` | HTTP 服务（llama-server / vllm）+ 关联本地 GGUF 资产 | `/v1/models` 返回的 `served_models` + 各 asset 的 `size`+`mtime` | 服务不可达、声明的模型 id 不符、GGUF 文件被换 |
														
 
															+| `mineru` | MinerU 内置模型 | `package_version` + `model_root` 目录聚合指纹 | MinerU 包升级、内置模型目录变化 |
														
 
															+
														
 
															+## 常用命令
														
 
															+
														
 
															+> 建议在 conda 环境 `mineru` 下运行。
														
 
															+
														
 
															+```bash
														
 
															+cd ocr_tools/model_doctor
														
 
															+
														
 
															+# 列出清单
														
 
															+conda run -n mineru python model_doctor.py list
														
 
															+
														
 
															+# 体检（与基线比对）；有缺失/不可达/远端更新/新增条目，或尚无基线时退出码非 0
														
 
															+conda run -n mineru python model_doctor.py check
														
 
															+
														
 
															+# 体检 + 查 HF 远端最新 commit + 对本地文件算快速 sha256（更敏感、更慢）
														
 
															+conda run -n mineru python model_doctor.py check --online --hash
														
 
															+
														
 
															+# 确认变更合理后，把当前指纹固化为新基线
														
 
															+conda run -n mineru python model_doctor.py update-lock
														
 
															+
														
 
															+# 只打印当前采集到的指纹（不比对，便于排查）
														
 
															+conda run -n mineru python model_doctor.py show
														
 
															+```
														
 
															+
														
 
															+### 选项
														
 
															+
														
 
															+| 选项 | 作用 |
														
 
															+|---|---|
														
 
															+| `--online` | `hf` 条目额外查询远端最新 commit 并比对（需联网） |
														
 
															+| `--hash` | 本地文件/目录额外计算快速 sha256（头 8MB + 尾 8MB + size） |
														
 
															+| `--strict` | `check` 时把「指纹变化」也算失败（默认仅缺失/不可达/远端更新/新增才非 0 退出） |
														
 
															+| `--registry` / `--lock` | 指定清单 / 基线文件路径 |
														
 
															+
														
 
															+## 报告符号
														
 
															+
														
 
															+| 符号 | 含义 |
														
 
															+|---|---|
														
 
															+| ✅ | 未变化 |
														
 
															+| ⚠️ | 指纹变化（列出具体字段 diff） |
														
 
															+| 🔺 | HF 远端有更新（`--online`） |
														
 
															+| ❌ | 缺失 / 服务不可达 |
														
 
															+| 🆕 | 新增条目（基线中无记录，需 `update-lock`） |
														
 
															+| 🗑 | 基线中存在但 registry 已移除 |
														
 
															+| ·  | 跳过（`enabled: false`） |
														
 
															+
														
 
															+## 典型工作流
														
 
															+
														
 
															+1. 新增/升级模型 → 编辑 `model_registry.yaml`（增删条目或改路径/repo）。
														
 
															+2. `check` 看差异是否符合预期。
														
 
															+3. 确认无误 → `update-lock` 更新基线，并把 `models.lock.json` 一起提交。
														
 
															+4. 日常/CI/定时任务里跑 `check`（默认「指纹变化」只在报告中告警、不影响退出码；希望指纹变化也导致非 0 退出时请加 `--strict`）；非 0 退出即代表「有模型缺失/不可达/远端更新/新增，需要关注」。
														
 
															+
														
 
															+可挂载的触发点：
														
 
															+- 流程启动前 `check`（缺失直接拦截，避免跑到一半报错）；
														
 
															+- `launchd`/`cron` 定时 `check --online` + 通知，监控 HF 远端更新；
														
 
															+- git `pre-commit`：改了 config 的 `model_dir` 时校验本地是否已下载。
														
 
															+
														
 
															+## 指纹策略说明
														
 
															+
														
 
															+- **大文件**默认只用 `size`+`mtime`（快、可离线）；`--hash` 时用「头尾各 8MB + size」的快速 sha256，
														
 
															+  兼顾敏感度与速度，不全量读取 GB 级 GGUF。
														
 
															+- **HF revision**：读取 HF 缓存 `models--{org}--{name}/refs/main`（即本地 commit）；若该文件不存在，则回退为 `snapshots/` 下按名称排序的最后一个目录名；
														
 
															+  `--online` 用 `huggingface_hub.HfApi().model_info(repo_id).sha` 取远端最新 commit 比对，无需下载。
														
 
															+- **daemon**：`/v1/models` 仅反映「服务声明的 model id」；真实权重变化靠 `assets` 的本地 GGUF 指纹兜底。
														
 
															+
														
 
															+## 维护提示
														
 
															+
														
 
															+- 内网/未启动的服务、可选模型已设为 `enabled: false`，体检时跳过、不报红；需要时改为 `true`。
														
 
															+- `mineru-builtin` 的 `model_root` 当前指向 `modelscope_cache`，若你的 MinerU 内置模型实际下载在
														
 
															+  `hf_home` 或别处，请按实修改该路径（留空 `null` 则只校验包版本）。
														
 
															+- `models.lock.json` 是「期望状态」，请在确认变更合理后才 `update-lock`，并随代码提交，便于团队对齐。
														
--- a/ocr_tools/model_doctor/model_doctor.py
+++ b/ocr_tools/model_doctor/model_doctor.py
@@ -0,0 +1,433 @@
 
															+#!/usr/bin/env python
														
 
															+"""model_doctor —— 模型变更巡检工具（方案 B）。
														
 
															+
														
 
															+依据手工维护的 model_registry.yaml，对四类模型来源采集「指纹」，
														
 
															+与基线 models.lock.json 比对，报告哪些模型发生了变化 / 缺失 / 服务不可达，
														
 
															+以及（可选）HF 远端是否有更新。
														
 
															+
														
 
															+子命令：
														
 
															+    list         列出 registry 中的模型条目
														
 
															+    show         采集并打印当前指纹（不比对）
														
 
															+    check        采集并与 lock 基线比对，输出报告（有变更/缺失则退出码非 0）
														
 
															+    update-lock  采集并把当前指纹固化为新的 lock 基线
														
 
															+
														
 
															+用法（建议在 conda 环境 mineru 下）：
														
 
															+    conda run -n mineru python model_doctor.py check
														
 
															+    conda run -n mineru python model_doctor.py check --online --hash
														
 
															+    conda run -n mineru python model_doctor.py update-lock
														
 
															+"""
														
 
															+
														
 
															+from __future__ import annotations
														
 
															+
														
 
															+import argparse
														
 
															+import hashlib
														
 
															+import json
														
 
															+import os
														
 
															+import sys
														
 
															+import urllib.request
														
 
															+from datetime import datetime
														
 
															+from pathlib import Path
														
 
															+
														
 
															+try:
														
 
															+    import yaml
														
 
															+except ImportError:
														
 
															+    sys.stderr.write("缺少依赖 PyYAML，请在 mineru 环境安装：conda run -n mineru pip install pyyaml\n")
														
 
															+    raise
														
 
															+
														
 
															+HERE = Path(__file__).resolve().parent
														
 
															+DEFAULT_REGISTRY = HERE / "model_registry.yaml"
														
 
															+DEFAULT_LOCK = HERE / "models.lock.json"
														
 
															+
														
 
															+# 报告状态符号
														
 
															+SYM = {
														
 
															+    "ok": "✅",
														
 
															+    "changed": "⚠️ ",
														
 
															+    "remote_update": "🔺",
														
 
															+    "missing": "❌",
														
 
															+    "unreachable": "❌",
														
 
															+    "new": "🆕",
														
 
															+    "removed": "🗑 ",
														
 
															+    "skipped": "· ",
														
 
															+}
														
 
															+
														
 
															+# 目录指纹遍历上限，避免误指向超大目录卡死
														
 
															+_MAX_DIR_FILES = 20000
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+# 通用指纹工具
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+def _fast_sha256(path: Path, head_tail_mb: int = 8) -> str:
														
 
															+    """对大文件取「头 + 尾 + 大小」的快速 sha256，避免全量读取。"""
														
 
															+    size = path.stat().st_size
														
 
															+    chunk = head_tail_mb * 1024 * 1024
														
 
															+    h = hashlib.sha256()
														
 
															+    h.update(str(size).encode())
														
 
															+    with path.open("rb") as f:
														
 
															+        h.update(f.read(chunk))
														
 
															+        if size > chunk * 2:
														
 
															+            f.seek(-chunk, os.SEEK_END)
														
 
															+            h.update(f.read(chunk))
														
 
															+    return h.hexdigest()
														
 
															+
														
 
															+
														
 
															+def _file_fp(path: Path, do_hash: bool) -> dict:
														
 
															+    if not path.exists():
														
 
															+        return {"exists": False}
														
 
															+    st = path.stat()
														
 
															+    fp = {
														
 
															+        "exists": True,
														
 
															+        "size": st.st_size,
														
 
															+        "mtime": int(st.st_mtime),
														
 
															+    }
														
 
															+    if do_hash and path.is_file():
														
 
															+        fp["sha256_fast"] = _fast_sha256(path)
														
 
															+    return fp
														
 
															+
														
 
															+
														
 
															+def _dir_fp(path: Path, do_hash: bool) -> dict:
														
 
															+    """目录指纹：聚合 (相对路径, size, mtime) 排序后的 sha256。"""
														
 
															+    if not path.exists():
														
 
															+        return {"exists": False}
														
 
															+    files = []
														
 
															+    count = 0
														
 
															+    for p in sorted(path.rglob("*")):
														
 
															+        if p.is_file():
														
 
															+            count += 1
														
 
															+            if count > _MAX_DIR_FILES:
														
 
															+                files.append(("<truncated>", -1, -1))
														
 
															+                break
														
 
															+            st = p.stat()
														
 
															+            files.append((str(p.relative_to(path)), st.st_size, int(st.st_mtime)))
														
 
															+    h = hashlib.sha256()
														
 
															+    total_size = 0
														
 
															+    for rel, size, mtime in files:
														
 
															+        h.update(f"{rel}|{size}|{mtime}\n".encode())
														
 
															+        if size > 0:
														
 
															+            total_size += size
														
 
															+    fp = {
														
 
															+        "exists": True,
														
 
															+        "file_count": len([f for f in files if f[1] >= 0]),
														
 
															+        "total_size": total_size,
														
 
															+        "tree_sha256": h.hexdigest(),
														
 
															+    }
														
 
															+    return fp
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+# 各 kind 的指纹采集
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+def fp_local_file(entry: dict, defaults: dict) -> dict:
														
 
															+    do_hash = entry.get("hash", defaults.get("hash", False))
														
 
															+    path = Path(os.path.expanduser(entry["path"]))
														
 
															+    fp = _dir_fp(path, do_hash) if path.is_dir() else _file_fp(path, do_hash)
														
 
															+    status = "ok" if fp.get("exists") else "missing"
														
 
															+    return {"status": status, "fingerprint": fp}
														
 
															+
														
 
															+
														
 
															+def fp_hf(entry: dict, defaults: dict) -> dict:
														
 
															+    repo_id = entry["repo_id"]
														
 
															+    hub_dir = Path(os.path.expanduser(entry.get("cache_dir", defaults["hf_hub_dir"])))
														
 
															+    repo_dir = hub_dir / ("models--" + repo_id.replace("/", "--"))
														
 
															+    fp: dict = {"repo_id": repo_id}
														
 
															+    local_rev = None
														
 
															+    if repo_dir.exists():
														
 
															+        refs_main = repo_dir / "refs" / "main"
														
 
															+        if refs_main.exists():
														
 
															+            local_rev = refs_main.read_text().strip()
														
 
															+        else:
														
 
															+            snaps = repo_dir / "snapshots"
														
 
															+            if snaps.exists():
														
 
															+                cand = sorted([d.name for d in snaps.iterdir() if d.is_dir()])
														
 
															+                local_rev = cand[-1] if cand else None
														
 
															+        fp["local_revision"] = local_rev
														
 
															+        fp["cached"] = True
														
 
															+    else:
														
 
															+        fp["cached"] = False
														
 
															+
														
 
															+    status = "ok" if fp.get("cached") else "missing"
														
 
															+
														
 
															+    # 可选：查远端最新 commit
														
 
															+    if entry.get("online", defaults.get("online", False)):
														
 
															+        try:
														
 
															+            from huggingface_hub import HfApi
														
 
															+
														
 
															+            remote_sha = HfApi().model_info(repo_id).sha
														
 
															+            fp["remote_revision"] = remote_sha
														
 
															+            if local_rev and remote_sha and local_rev != remote_sha:
														
 
															+                status = "remote_update"
														
 
															+        except Exception as e:  # 网络不可达等
														
 
															+            fp["remote_error"] = str(e)
														
 
															+    return {"status": status, "fingerprint": fp}
														
 
															+
														
 
															+
														
 
															+def fp_daemon(entry: dict, defaults: dict) -> dict:
														
 
															+    do_hash = entry.get("hash", defaults.get("hash", False))
														
 
															+    timeout = entry.get("daemon_timeout", defaults.get("daemon_timeout", 3))
														
 
															+    url = entry["server_url"].rstrip("/") + "/v1/models"
														
 
															+    fp: dict = {"server_url": entry["server_url"]}
														
 
															+    status = "ok"
														
 
															+
														
 
															+    try:
														
 
															+        req = urllib.request.Request(url, headers={"Accept": "application/json"})
														
 
															+        with urllib.request.urlopen(req, timeout=timeout) as resp:
														
 
															+            data = json.loads(resp.read().decode())
														
 
															+        served = [m.get("id") for m in data.get("data", [])]
														
 
															+        fp["reachable"] = True
														
 
															+        fp["served_models"] = served
														
 
															+        expect = entry.get("served_model")
														
 
															+        if expect and expect not in served:
														
 
															+            fp["served_mismatch"] = {"expect": expect, "actual": served}
														
 
															+            status = "changed"
														
 
															+    except Exception as e:
														
 
															+        fp["reachable"] = False
														
 
															+        fp["error"] = str(e)
														
 
															+        status = "unreachable"
														
 
															+
														
 
															+    # 本地 GGUF 资产指纹（即使服务不可达也采集，便于发现文件被换）
														
 
															+    assets = entry.get("assets") or []
														
 
															+    if assets:
														
 
															+        fp["assets"] = {}
														
 
															+        for a in assets:
														
 
															+            p = Path(os.path.expanduser(a))
														
 
															+            afp = _file_fp(p, do_hash)
														
 
															+            fp["assets"][a] = afp
														
 
															+            if not afp.get("exists"):
														
 
															+                status = "missing" if status == "ok" else status
														
 
															+    return {"status": status, "fingerprint": fp}
														
 
															+
														
 
															+
														
 
															+def fp_mineru(entry: dict, defaults: dict) -> dict:
														
 
															+    do_hash = entry.get("hash", defaults.get("hash", False))
														
 
															+    pkg = entry.get("package", "mineru")
														
 
															+    fp: dict = {"package": pkg}
														
 
															+    status = "ok"
														
 
															+    try:
														
 
															+        import importlib.metadata as md
														
 
															+
														
 
															+        fp["package_version"] = md.version(pkg)
														
 
															+    except Exception as e:
														
 
															+        fp["package_error"] = str(e)
														
 
															+        status = "missing"
														
 
															+
														
 
															+    root = entry.get("model_root")
														
 
															+    if root:
														
 
															+        p = Path(os.path.expanduser(root))
														
 
															+        rfp = _dir_fp(p, do_hash)
														
 
															+        fp["model_root"] = str(p)
														
 
															+        fp["model_root_fp"] = rfp
														
 
															+        if not rfp.get("exists"):
														
 
															+            status = "missing" if status == "ok" else status
														
 
															+    return {"status": status, "fingerprint": fp}
														
 
															+
														
 
															+
														
 
															+_COLLECTORS = {
														
 
															+    "local_file": fp_local_file,
														
 
															+    "hf": fp_hf,
														
 
															+    "daemon": fp_daemon,
														
 
															+    "mineru": fp_mineru,
														
 
															+}
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+# registry / lock 读写与比对
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+def load_registry(path: Path) -> dict:
														
 
															+    with path.open("r", encoding="utf-8") as f:
														
 
															+        reg = yaml.safe_load(f)
														
 
															+    reg.setdefault("defaults", {})
														
 
															+    reg.setdefault("models", [])
														
 
															+    return reg
														
 
															+
														
 
															+
														
 
															+def collect(reg: dict, online: bool, do_hash: bool) -> dict:
														
 
															+    defaults = dict(reg.get("defaults", {}))
														
 
															+    if online:
														
 
															+        defaults["online"] = True
														
 
															+    if do_hash:
														
 
															+        defaults["hash"] = True
														
 
															+
														
 
															+    snapshot = {}
														
 
															+    for entry in reg.get("models", []):
														
 
															+        name = entry["name"]
														
 
															+        if not entry.get("enabled", True):
														
 
															+            snapshot[name] = {"kind": entry.get("kind"), "status": "skipped", "fingerprint": {}}
														
 
															+            continue
														
 
															+        kind = entry.get("kind")
														
 
															+        collector = _COLLECTORS.get(kind)
														
 
															+        if collector is None:
														
 
															+            snapshot[name] = {"kind": kind, "status": "missing",
														
 
															+                              "fingerprint": {"error": f"未知 kind: {kind}"}}
														
 
															+            continue
														
 
															+        try:
														
 
															+            result = collector(entry, defaults)
														
 
															+        except Exception as e:
														
 
															+            result = {"status": "missing", "fingerprint": {"error": str(e)}}
														
 
															+        result["kind"] = kind
														
 
															+        result["used_by"] = entry.get("used_by", [])
														
 
															+        snapshot[name] = result
														
 
															+    return snapshot
														
 
															+
														
 
															+
														
 
															+def load_lock(path: Path) -> dict:
														
 
															+    if not path.exists():
														
 
															+        return {}
														
 
															+    with path.open("r", encoding="utf-8") as f:
														
 
															+        return json.load(f).get("models", {})
														
 
															+
														
 
															+
														
 
															+def save_lock(path: Path, snapshot: dict) -> None:
														
 
															+    payload = {
														
 
															+        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
														
 
															+        "models": snapshot,
														
 
															+    }
														
 
															+    with path.open("w", encoding="utf-8") as f:
														
 
															+        json.dump(payload, f, ensure_ascii=False, indent=2)
														
 
															+
														
 
															+
														
 
															+def diff_fp(old: dict, new: dict) -> list:
														
 
															+    """返回发生变化的字段路径（浅层 + 一层嵌套）。"""
														
 
															+    changes = []
														
 
															+    keys = set(old.keys()) | set(new.keys())
														
 
															+    for k in sorted(keys):
														
 
															+        ov, nv = old.get(k), new.get(k)
														
 
															+        if isinstance(ov, dict) and isinstance(nv, dict):
														
 
															+            for sk in sorted(set(ov) | set(nv)):
														
 
															+                if ov.get(sk) != nv.get(sk):
														
 
															+                    changes.append(f"{k}.{sk}: {ov.get(sk)} → {nv.get(sk)}")
														
 
															+        elif ov != nv:
														
 
															+            changes.append(f"{k}: {ov} → {nv}")
														
 
															+    return changes
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+# 子命令
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+def cmd_list(reg: dict) -> int:
														
 
															+    print(f"模型清单（{len(reg.get('models', []))} 条）：\n")
														
 
															+    for e in reg.get("models", []):
														
 
															+        flag = "  " if e.get("enabled", True) else "× "
														
 
															+        used = "；".join(e.get("used_by", []))
														
 
															+        print(f"{flag}[{e.get('kind'):<10}] {e['name']}")
														
 
															+        if used:
														
 
															+            print(f"                用于：{used}")
														
 
															+    print("\n（× 表示 enabled: false，体检时跳过）")
														
 
															+    return 0
														
 
															+
														
 
															+
														
 
															+def cmd_show(reg: dict, online: bool, do_hash: bool) -> int:
														
 
															+    snap = collect(reg, online, do_hash)
														
 
															+    print(json.dumps(snap, ensure_ascii=False, indent=2))
														
 
															+    return 0
														
 
															+
														
 
															+
														
 
															+def cmd_check(reg: dict, lock_path: Path, online: bool, do_hash: bool, strict: bool) -> int:
														
 
															+    new = collect(reg, online, do_hash)
														
 
															+    old = load_lock(lock_path)
														
 
															+
														
 
															+    has_baseline = bool(old)
														
 
															+    problems = 0      # missing / unreachable / remote_update
														
 
															+    changes = 0       # 指纹变化
														
 
															+    news = 0          # 新增（lock 中无记录）
														
 
															+
														
 
															+    print(f"模型体检报告  baseline={'有' if has_baseline else '无（首次，请先 update-lock）'}\n")
														
 
															+
														
 
															+    for name, cur in new.items():
														
 
															+        kind = cur.get("kind")
														
 
															+        status = cur.get("status")
														
 
															+
														
 
															+        if status == "skipped":
														
 
															+            print(f"{SYM['skipped']} {name:<28} [{kind}] 跳过（disabled）")
														
 
															+            continue
														
 
															+
														
 
															+        prev = old.get(name)
														
 
															+        if prev is None:
														
 
															+            news += 1
														
 
															+            print(f"{SYM['new']} {name:<28} [{kind}] 新增条目（基线中无记录）")
														
 
															+            continue
														
 
															+
														
 
															+        # 状态类问题优先
														
 
															+        if status in ("missing", "unreachable"):
														
 
															+            problems += 1
														
 
															+            detail = cur["fingerprint"].get("error", "")
														
 
															+            print(f"{SYM[status]} {name:<28} [{kind}] {status} {detail}")
														
 
															+            continue
														
 
															+        if status == "remote_update":
														
 
															+            problems += 1
														
 
															+            fpr = cur["fingerprint"]
														
 
															+            print(f"{SYM['remote_update']} {name:<28} [{kind}] HF 远端有更新："
														
 
															+                  f"{fpr.get('local_revision')} → {fpr.get('remote_revision')}")
														
 
															+            continue
														
 
															+
														
 
															+        # 指纹比对
														
 
															+        fp_changes = diff_fp(prev.get("fingerprint", {}), cur.get("fingerprint", {}))
														
 
															+        if fp_changes:
														
 
															+            changes += 1
														
 
															+            print(f"{SYM['changed']} {name:<28} [{kind}] 指纹变化：")
														
 
															+            for c in fp_changes:
														
 
															+                print(f"        - {c}")
														
 
															+        else:
														
 
															+            print(f"{SYM['ok']} {name:<28} [{kind}] 未变化")
														
 
															+
														
 
															+    # 基线中存在但 registry 已删除
														
 
															+    removed = [n for n in old if n not in new]
														
 
															+    for n in removed:
														
 
															+        print(f"{SYM['removed']} {n:<28} 基线中存在但 registry 已移除")
														
 
															+
														
 
															+    print("\n" + "-" * 60)
														
 
															+    print(f"问题(缺失/不可达/远端更新)={problems}  指纹变化={changes}  "
														
 
															+          f"新增={news}  移除={len(removed)}")
														
 
															+
														
 
															+    if not has_baseline:
														
 
															+        print("提示：尚无基线，运行 `update-lock` 生成。")
														
 
															+        return 1
														
 
															+
														
 
															+    fail = problems + (changes if strict else 0) + news
														
 
															+    if problems and not strict:
														
 
															+        # 即使非 strict，缺失/不可达也应非 0 退出
														
 
															+        return 1
														
 
															+    return 1 if fail else 0
														
 
															+
														
 
															+
														
 
															+def cmd_update_lock(reg: dict, lock_path: Path, online: bool, do_hash: bool) -> int:
														
 
															+    snap = collect(reg, online, do_hash)
														
 
															+    save_lock(lock_path, snap)
														
 
															+    enabled = [n for n, v in snap.items() if v.get("status") != "skipped"]
														
 
															+    print(f"已写入基线 {lock_path}（{len(enabled)} 条生效，"
														
 
															+          f"{len(snap) - len(enabled)} 条跳过）")
														
 
															+    return 0
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+# 入口
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+def main(argv=None) -> int:
														
 
															+    parser = argparse.ArgumentParser(description="模型变更巡检工具 model_doctor")
														
 
															+    parser.add_argument("command", choices=["list", "show", "check", "update-lock"])
														
 
															+    parser.add_argument("--registry", type=Path, default=DEFAULT_REGISTRY,
														
 
															+                        help=f"清单文件，默认 {DEFAULT_REGISTRY.name}")
														
 
															+    parser.add_argument("--lock", type=Path, default=DEFAULT_LOCK,
														
 
															+                        help=f"基线文件，默认 {DEFAULT_LOCK.name}")
														
 
															+    parser.add_argument("--online", action="store_true",
														
 
															+                        help="hf 条目额外查询远端最新 commit 进行比对")
														
 
															+    parser.add_argument("--hash", dest="do_hash", action="store_true",
														
 
															+                        help="本地文件/目录额外计算快速 sha256（更敏感但更慢）")
														
 
															+    parser.add_argument("--strict", action="store_true",
														
 
															+                        help="check 时把『指纹变化』也视为失败（非 0 退出）")
														
 
															+    args = parser.parse_args(argv)
														
 
															+
														
 
															+    reg = load_registry(args.registry)
														
 
															+
														
 
															+    if args.command == "list":
														
 
															+        return cmd_list(reg)
														
 
															+    if args.command == "show":
														
 
															+        return cmd_show(reg, args.online, args.do_hash)
														
 
															+    if args.command == "check":
														
 
															+        return cmd_check(reg, args.lock, args.online, args.do_hash, args.strict)
														
 
															+    if args.command == "update-lock":
														
 
															+        return cmd_update_lock(reg, args.lock, args.online, args.do_hash)
														
 
															+    return 2
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    sys.exit(main())
														
--- a/ocr_tools/model_doctor/model_registry.yaml
+++ b/ocr_tools/model_doctor/model_registry.yaml
@@ -0,0 +1,134 @@
 
															+# ============================================================
														
 
															+# model_doctor 模型清单（手工维护）
														
 
															+# ------------------------------------------------------------
														
 
															+# 覆盖四类模型来源：
														
 
															+#   hf          —— HuggingFace 仓库（自动下载，缓存在 defaults.hf_hub_dir）
														
 
															+#   local_file  —— 本地单个权重文件（.onnx/.gguf/.pth/...）或目录
														
 
															+#   daemon      —— 通过 HTTP 服务访问的模型（llama-server / vllm），可附带本地 GGUF 资产
														
 
															+#   mineru      —— MinerU 内置模型（校验包版本 + 可选模型根目录指纹）
														
 
															+#
														
 
															+# 指纹策略：
														
 
															+#   本地文件/目录默认用 size + mtime（快、可离线）；加 --hash 才额外计算快速 sha256。
														
 
															+#   hf 默认只读本地快照 revision；加 --online 才查远端最新 commit 比对。
														
 
															+#   daemon 默认探测 /v1/models 的返回中是否包含 served_model，并采集本地 assets 的指纹。
														
 
															+#
														
 
															+# 维护说明：新增/升级模型时在此增删条目，再运行 `model_doctor.py update-lock`
														
 
															+# 固化基线；日常用 `model_doctor.py check` 体检。
														
 
															+# ============================================================
														
 
															+
														
 
															+defaults:
														
 
															+  hf_hub_dir: "/Users/zhch158/models/hf_home/hub"  # HF 缓存 hub 根（= $HF_HOME/hub）
														
 
															+  hash: false        # 本地文件默认仅 size+mtime；true 则计算快速 sha256
														
 
															+  online: false      # hf 远端比对默认关闭（内网/离线场景）
														
 
															+  daemon_timeout: 3  # daemon 探测超时（秒）
														
 
															+
														
 
															+models:
														
 
															+  # ===== ① HF 仓库（自动下载，缓存在 hf_hub_dir） =====
														
 
															+  - name: docling-layout-old
														
 
															+    kind: hf
														
 
															+    repo_id: ds4sd/docling-layout-old
														
 
															+    used_by: ["layout/docling（bank_statement_* 默认布局）"]
														
 
															+    enabled: true
														
 
															+
														
 
															+  - name: pp-doclayoutv3
														
 
															+    kind: hf
														
 
															+    repo_id: PaddlePaddle/PP-DocLayoutV3_safetensors
														
 
															+    used_by: ["layout/paddle", "seal_supplement"]
														
 
															+    enabled: true
														
 
															+
														
 
															+  - name: paddleocr-vl-1.6-hf
														
 
															+    kind: hf
														
 
															+    repo_id: PaddlePaddle/PaddleOCR-VL-1.6
														
 
															+    used_by: ["PaddleOCR-VL transformers 权重（GGUF 转换源）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: mineru-2.5-pro-2604-1.2b-hf
														
 
															+    kind: hf
														
 
															+    repo_id: opendatalab/MinerU2.5-Pro-2604-1.2B
														
 
															+    used_by: ["MinerU2.5-Pro-2604-1.2B transformers 权重（GGUF 转换源）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: glm-ocr-hf
														
 
															+    kind: hf
														
 
															+    repo_id: zai-org/GLM-OCR
														
 
															+    used_by: ["GLM-OCR transformers 权重（GGUF 转换源）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: rtdetr-wired-cell-hf
														
 
															+    kind: hf
														
 
															+    repo_id: PaddlePaddle/RT-DETR-L_wired_table_cell_det
														
 
															+    used_by: ["table_recognition_wired/cell_fusion paddle格式 pdiparams"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: rtdetr-h-layout-17cls-hf
														
 
															+    kind: hf
														
 
															+    repo_id: PaddlePaddle/RT-DETR-H_layout_17cls
														
 
															+    used_by: ["layout/paddle paddle格式 pdiparams"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  # ===== ② 本地权重文件 =====
														
 
															+  - name: rtdetr-wired-cell
														
 
															+    kind: local_file
														
 
															+    path: /Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx
														
 
															+    used_by: ["table_recognition_wired/cell_fusion（有线表格单元格融合）"]
														
 
															+    enabled: true
														
 
															+
														
 
															+  - name: rtdetr-h-layout-17cls
														
 
															+    kind: local_file
														
 
															+    path: /Users/zhch158/models/pytorch_models/Layout/RT-DETR-H_layout_17cls.onnx
														
 
															+    used_by: ["layout/paddle（可选，默认走 HF 路径）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  # ===== ③ daemon 服务（HTTP + 关联本地 GGUF 资产） =====
														
 
															+  - name: paddleocr-vl-1.6-daemon
														
 
															+    kind: daemon
														
 
															+    server_url: http://localhost:8102
														
 
															+    served_model: PaddleOCR-VL-1.6      # 期望 /v1/models 返回包含此 id
														
 
															+    assets:
														
 
															+      - /Users/zhch158/models/PaddleOCR-VL-1.6-GGUF/PaddleOCR-VL-1.6-F16.gguf
														
 
															+      - /Users/zhch158/models/PaddleOCR-VL-1.6-GGUF/PaddleOCR-VL-1.6-F16-mmproj.gguf
														
 
															+    used_by: ["vl_recognition/paddle（bank_statement_paddle_vl_local）"]
														
 
															+    enabled: true
														
 
															+
														
 
															+  - name: glm-ocr-daemon-local
														
 
															+    kind: daemon
														
 
															+    server_url: http://localhost:8101
														
 
															+    served_model: glm-ocr
														
 
															+    assets:
														
 
															+      - /Users/zhch158/models/hf_home/hub/models--ggml-org--GLM-OCR-GGUF/snapshots/65a42de1148dbed2297e922b5dbc7d9b70c36578/GLM-OCR-Q8_0.gguf
														
 
															+      - /Users/zhch158/models/hf_home/hub/models--ggml-org--GLM-OCR-GGUF/snapshots/65a42de1148dbed2297e922b5dbc7d9b70c36578/mmproj-GLM-OCR-Q8_0.gguf
														
 
															+    used_by: ["vl_recognition/glmocr（本地）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: mineru-2.5-pro-daemon-local
														
 
															+    kind: daemon
														
 
															+    server_url: http://localhost:8103
														
 
															+    served_model: MinerU2.5-Pro-2604-1.2B
														
 
															+    assets:
														
 
															+      - /Users/zhch158/models/hf_home/hub/models--mradermacher--MinerU2.5-Pro-2604-1.2B-GGUF/snapshots/70429e9c728b6a5e904f358a9936c17bd3f5f4b8/MinerU2.5-Pro-2604-1.2B.Q8_0.gguf
														
 
															+    used_by: ["MinerU2.5 本地 VLM"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: mineru-vl-remote
														
 
															+    kind: daemon
														
 
															+    server_url: http://10.192.72.11:20006
														
 
															+    served_model: MinerU2.5
														
 
															+    used_by: ["vl_recognition/mineru_vl（远程 vllm）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  - name: paddleocr-vl-remote
														
 
															+    kind: daemon
														
 
															+    server_url: http://10.192.72.11:20016
														
 
															+    served_model: PaddleOCR-VL-0.9B
														
 
															+    used_by: ["vl_recognition/paddle（远程 vllm）"]
														
 
															+    enabled: false
														
 
															+
														
 
															+  # ===== ④ MinerU 内置模型（包版本 + 模型根目录指纹） =====
														
 
															+  - name: mineru-builtin
														
 
															+    kind: mineru
														
 
															+    package: mineru
														
 
															+    # MinerU pipeline 内置模型（layout/ocr/formula/table 等）下载根目录；
														
 
															+    # 留空（null）则仅校验包版本。常见位置：modelscope_cache 或 hf_home。
														
 
															+    model_root: /Users/zhch158/models/modelscope_cache
														
 
															+    used_by: ["preprocessor/mineru", "ocr_recognition/mineru", "table_classification/paddle"]
														
 
															+    enabled: true
														
--- a/ocr_tools/model_doctor/models.lock.json
+++ b/ocr_tools/model_doctor/models.lock.json
@@ -0,0 +1,125 @@
 
															+{
														
 
															+  "generated_at": "2026-05-29T16:06:36+08:00",
														
 
															+  "models": {
														
 
															+    "docling-layout-old": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "repo_id": "ds4sd/docling-layout-old",
														
 
															+        "local_revision": "b5b4bd59ad2b69aab715e9b1f1dfd74394c45fd4",
														
 
															+        "cached": true
														
 
															+      },
														
 
															+      "kind": "hf",
														
 
															+      "used_by": [
														
 
															+        "layout/docling（bank_statement_* 默认布局）"
														
 
															+      ]
														
 
															+    },
														
 
															+    "pp-doclayoutv3": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "repo_id": "PaddlePaddle/PP-DocLayoutV3_safetensors",
														
 
															+        "local_revision": "fc37bdafb4cb98df1750ad8d2e21e2655189b171",
														
 
															+        "cached": true
														
 
															+      },
														
 
															+      "kind": "hf",
														
 
															+      "used_by": [
														
 
															+        "layout/paddle",
														
 
															+        "seal_supplement"
														
 
															+      ]
														
 
															+    },
														
 
															+    "paddleocr-vl-1.6-hf": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "repo_id": "PaddlePaddle/PaddleOCR-VL-1.6",
														
 
															+        "local_revision": "bd1f9d64f127560f3fa49e69292486a5993782c6",
														
 
															+        "cached": true
														
 
															+      },
														
 
															+      "kind": "hf",
														
 
															+      "used_by": [
														
 
															+        "PaddleOCR-VL transformers 权重（GGUF 转换源）"
														
 
															+      ]
														
 
															+    },
														
 
															+    "rtdetr-wired-cell": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "exists": true,
														
 
															+        "size": 129353461,
														
 
															+        "mtime": 1769605421
														
 
															+      },
														
 
															+      "kind": "local_file",
														
 
															+      "used_by": [
														
 
															+        "table_recognition_wired/cell_fusion（有线表格单元格融合）"
														
 
															+      ]
														
 
															+    },
														
 
															+    "rtdetr-h-layout-17cls": {
														
 
															+      "kind": "local_file",
														
 
															+      "status": "skipped",
														
 
															+      "fingerprint": {}
														
 
															+    },
														
 
															+    "paddleocr-vl-1.6-daemon": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "server_url": "http://localhost:8102",
														
 
															+        "reachable": true,
														
 
															+        "served_models": [
														
 
															+          "PaddleOCR-VL-1.6"
														
 
															+        ],
														
 
															+        "assets": {
														
 
															+          "/Users/zhch158/models/PaddleOCR-VL-1.6-GGUF/PaddleOCR-VL-1.6-F16.gguf": {
														
 
															+            "exists": true,
														
 
															+            "size": 935768704,
														
 
															+            "mtime": 1780029330
														
 
															+          },
														
 
															+          "/Users/zhch158/models/PaddleOCR-VL-1.6-GGUF/PaddleOCR-VL-1.6-F16-mmproj.gguf": {
														
 
															+            "exists": true,
														
 
															+            "size": 880415808,
														
 
															+            "mtime": 1780029364
														
 
															+          }
														
 
															+        }
														
 
															+      },
														
 
															+      "kind": "daemon",
														
 
															+      "used_by": [
														
 
															+        "vl_recognition/paddle（bank_statement_paddle_vl_local）"
														
 
															+      ]
														
 
															+    },
														
 
															+    "glm-ocr-daemon-local": {
														
 
															+      "kind": "daemon",
														
 
															+      "status": "skipped",
														
 
															+      "fingerprint": {}
														
 
															+    },
														
 
															+    "mineru-2.5-pro-daemon-local": {
														
 
															+      "kind": "daemon",
														
 
															+      "status": "skipped",
														
 
															+      "fingerprint": {}
														
 
															+    },
														
 
															+    "mineru-vl-remote": {
														
 
															+      "kind": "daemon",
														
 
															+      "status": "skipped",
														
 
															+      "fingerprint": {}
														
 
															+    },
														
 
															+    "paddleocr-vl-remote": {
														
 
															+      "kind": "daemon",
														
 
															+      "status": "skipped",
														
 
															+      "fingerprint": {}
														
 
															+    },
														
 
															+    "mineru-builtin": {
														
 
															+      "status": "ok",
														
 
															+      "fingerprint": {
														
 
															+        "package": "mineru",
														
 
															+        "package_version": "3.1.13",
														
 
															+        "model_root": "/Users/zhch158/models/modelscope_cache",
														
 
															+        "model_root_fp": {
														
 
															+          "exists": true,
														
 
															+          "file_count": 10,
														
 
															+          "total_size": 122467480,
														
 
															+          "tree_sha256": "7d579059c6b11dca03920f2fa65553ac03fb37dc8b9075a81acb65bde93cc3d8"
														
 
															+        }
														
 
															+      },
														
 
															+      "kind": "mineru",
														
 
															+      "used_by": [
														
 
															+        "preprocessor/mineru",
														
 
															+        "ocr_recognition/mineru",
														
 
															+        "table_classification/paddle"
														
 
															+      ]
														
 
															+    }
														
 
															+  }
														
 
															+}