2 weeks ago · c4107d9e65
--- a/zhch/unified_pytorch_models/vendor/README.md
+++ b/zhch/unified_pytorch_models/vendor/README.md
@@ -1,51 +1,305 @@
 
															 # Vendor 依赖说明
														
 
															-本目录包含从 MinerU 提取的核心 OCR 模块。
														
 
															+本目录包含从 MinerU 提取并整合的核心 OCR 模块，提供完整的 PyTorch OCR 推理能力。
														
 
															 ## 📂 目录结构
														
 
															 ```
														
 
															 vendor/
														
 
															-├── __init__.py
														
 
															-├── ocr_utils.py           # OCR 工具函数
														
 
															-├── pytorchocr/            # PytorchOCR 核心（需要手动复制）
														
 
															+├── __init__.py                   # 模块初始化
														
 
															+├── device_utils.py               # 设备检测工具（CPU/CUDA/MPS）
														
 
															+├── ocr_utils.py                  # OCR 工具函数（图像处理、框合并等）
														
 
															+├── README.md                     # 本文档
														
 
															+├── infer/                        # 推理模块
														
 
															 │   ├── __init__.py
														
 
															-│   └── predict_system.py  # TextSystem 主类
														
 
															-└── README.md
														
 
															+│   ├── predict_det.py            # 文本检测推理
														
 
															+│   ├── predict_rec.py            # 文本识别推理
														
 
															+│   ├── predict_cls.py            # 方向分类推理
														
 
															+│   ├── predict_system.py         # OCR 系统（TextSystem）
														
 
															+│   └── pytorchocr_utility.py     # 推理工具函数
														
 
															+└── pytorchocr/                   # PytorchOCR 核心
														
 
															+    ├── base_ocr_v20.py           # OCR 基础类
														
 
															+    ├── data/                     # 数据处理
														
 
															+    │   └── imaug/
														
 
															+    │       ├── __init__.py
														
 
															+    │       └── operators.py      # 图像增强操作
														
 
															+    ├── modeling/                 # 模型架构
														
 
															+    │   ├── common.py             # 通用层
														
 
															+    │   ├── architectures/        # 模型架构定义
														
 
															+    │   │   └── base_model.py
														
 
															+    │   ├── backbones/            # 骨干网络
														
 
															+    │   │   ├── det_mobilenet_v3.py
														
 
															+    │   │   ├── rec_lcnetv3.py
														
 
															+    │   │   ├── rec_svtrnet.py
														
 
															+    │   │   ├── rec_hgnet.py
														
 
															+    │   │   └── ...
														
 
															+    │   ├── heads/                # 预测头
														
 
															+    │   │   ├── det_db_head.py
														
 
															+    │   │   ├── rec_ctc_head.py
														
 
															+    │   │   ├── rec_multi_head.py
														
 
															+    │   │   └── ...
														
 
															+    │   └── necks/                # 特征融合层
														
 
															+    │       ├── db_fpn.py
														
 
															+    │       └── rnn.py
														
 
															+    ├── postprocess/              # 后处理
														
 
															+    │   ├── db_postprocess.py     # 检测后处理
														
 
															+    │   ├── rec_postprocess.py    # 识别后处理
														
 
															+    │   └── cls_postprocess.py    # 分类后处理
														
 
															+    └── utils/                    # 工具与资源
														
 
															+        └── resources/            # 配置与字典
														
 
															+            ├── arch_config.yaml
														
 
															+            ├── models_config.yml
														
 
															+            └── dict/             # 多语言字典文件
														
 
															+                ├── ppocrv5_dict.txt          # 中文字典（18383 字符）
														
 
															+                ├── ppocrv5_en_dict.txt       # 英文字典
														
 
															+                ├── ppocrv5_korean_dict.txt   # 韩文字典
														
 
															+                ├── ppocrv5_arabic_dict.txt   # 阿拉伯语字典
														
 
															+                └── ...
														
 
															 ```
														
 
															-## 🔧 安装步骤
														
 
															+## 🚀 核心模块说明
														
 
															-由于 `pytorchocr` 模块较大，需要手动从 MinerU 复制：
														
 
															+### 1. `ocr_utils.py` - 工具函数库
														
 
															+
														
 
															+**核心函数**:
														
 
															+
														
 
															+| 函数名 | 功能 | 说明 |
														
 
															+|--------|------|------|
														
 
															+| `get_rotate_crop_image` | 裁剪并矫正倾斜文本框 | 透视变换 + 旋转竖排文字 |
														
 
															+| `sorted_boxes` | 排序检测框 | 从上到下、从左到右排序 |
														
 
															+| `merge_det_boxes` | 合并检测框 | 合并同行文本框 |
														
 
															+| `update_det_boxes` | 更新检测框 | 根据 MFD 结果调整框 |
														
 
															+| `preprocess_image` | 图像预处理 | Alpha 通道处理 + BGR 格式 |
														
 
															+| `check_img` | 检查图像格式 | 支持路径/bytes/ndarray |
														
 
															+
														
 
															+**使用示例**:
														
 
															+```python
														
 
															+from vendor.ocr_utils import get_rotate_crop_image, sorted_boxes
														
 
															+
														
 
															+# 矫正倾斜文本框
														
 
															+img_crop = get_rotate_crop_image(img, box_points)
														
 
															+
														
 
															+# 排序检测框
														
 
															+sorted_dt_boxes = sorted_boxes(dt_boxes)
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+### 2. `device_utils.py` - 设备检测
														
 
															+
														
 
															+**功能**: 自动检测可用的计算设备
														
 
															+
														
 
															+```python
														
 
															+from vendor.device_utils import select_device
														
 
															+
														
 
															+device = select_device('auto')  # 返回 'cuda', 'mps', 或 'cpu'
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+### 3. `infer/` - 推理模块
														
 
															+
														
 
															+#### `predict_system.py` - TextSystem
														
 
															+
														
 
															+**核心类**: `TextSystem`
														
 
															+- 整合检测器、识别器、分类器
														
 
															+- 提供完整的 OCR 流程
														
 
															+
														
 
															+#### `predict_det.py` - 文本检测
														
 
															+
														
 
															+**核心类**: `TextDetector`
														
 
															+- 基于 DBNet++ 的文本检测
														
 
															+- 支持可变形卷积
														
 
															+
														
 
															+#### `predict_rec.py` - 文本识别
														
 
															+
														
 
															+**核心类**: `TextRecognizer`
														
 
															+- 基于 SVTR-HGNet 的文本识别
														
 
															+- 支持 CTC 和 Attention 双解码
														
 
															+
														
 
															+#### `predict_cls.py` - 方向分类
														
 
															+
														
 
															+**核心类**: `TextClassifier`
														
 
															+- 文本行方向分类（0°/180°）
														
 
															+
														
 
															+---
														
 
															+
														
 
															+### 4. `pytorchocr/` - 核心模型库
														
 
															+
														
 
															+#### 支持的模型架构
														
 
															+
														
 
															+**检测模型**:
														
 
															+- ✅ DBNet / DBNet++
														
 
															+- ✅ MobileNetV3 backbone
														
 
															+
														
 
															+**识别模型**:
														
 
															+- ✅ SVTR-HGNet (推荐)
														
 
															+- ✅ PPOCRv5
														
 
															+- ✅ PPOCRv4
														
 
															+- ✅ LCNetV3
														
 
															+- ✅ Donut Swin Transformer
														
 
															+
														
 
															+**分类模型**:
														
 
															+- ✅ PP-LCNet
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 📝 配置文件说明
														
 
															+
														
 
															+### `models_config.yml`
														
 
															+
														
 
															+定义了所有支持的模型配置：
														
 
															+
														
 
															+```yaml
														
 
															+# 中文识别模型
														
 
															+ch_PP-OCRv5_rec:
														
 
															+  model_type: rec
														
 
															+  algorithm: SVTR_HGNet
														
 
															+  weights_path: OCR/Rec/ch_PP-OCRv5_rec_infer.pth
														
 
															+  character_dict_path: vendor/pytorchocr/utils/resources/dict/ppocrv5_dict.txt
														
 
															+
														
 
															+# 英文识别模型
														
 
															+en_PP-OCRv5_rec:
														
 
															+  model_type: rec
														
 
															+  algorithm: SVTR_HGNet
														
 
															+  weights_path: OCR/Rec/en_PP-OCRv5_rec_infer.pth
														
 
															+  character_dict_path: vendor/pytorchocr/utils/resources/dict/ppocrv5_en_dict.txt
														
 
															+```
														
 
															+
														
 
															+### `arch_config.yaml`
														
 
															+
														
 
															+定义了模型架构细节：
														
 
															+
														
 
															+```yaml
														
 
															+rec_SVTR_HGNet:
														
 
															+  Backbone:
														
 
															+    name: PPHGNet_small
														
 
															+    scale: 0.95
														
 
															+  Head:
														
 
															+    name: MultiHead
														
 
															+    head_list:
														
 
															+      - CTCHead:
														
 
															+          Neck:
														
 
															+            name: svtr
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 🔧 依赖安装
														
 
															+
														
 
															+### 核心依赖
														
 
															 ```bash
														
 
															-# 1. 复制核心模块
														
 
															-cp -r /Users/zhch158/workspace/repository.git/MinerU/mineru/model/utils/tools/infer/ \
														
 
															-      vendor/infer/
														
 
															+pip install torch torchvision opencv-python numpy pyyaml shapely pyclipper loguru
														
 
															+```
														
 
															+
														
 
															+### 可选依赖
														
 
															-# 2. 复制依赖的其他模块（如果需要）
														
 
															-cp -r /Users/zhch158/workspace/repository.git/MinerU/mineru/model/utils/pytorchocr/ \
														
 
															-      vendor/pytorchocr/
														
 
															+```bash
														
 
															+# ONNX Runtime（用于方向分类）
														
 
															+pip install onnxruntime
														
 
															+
														
 
															+# GPU 支持（与 onnxruntime 二选一，不要同时安装）
														
 
															+pip install onnxruntime-gpu  # NVIDIA GPU
														
 
															 ```
														
 
															-## ⚠️ 注意事项
														
 
															+---
														
 
															+
														
 
															+## 🐛 常见问题
														
 
															+
														
 
															+### 1. 找不到字典文件
														
 
															-1. **依赖管理**: vendor 的模块不会自动更新，需要手动同步 MinerU 的更新
														
 
															-2. **路径问题**: 如果遇到 import 错误，需要调整 `mineru_ocr_adapter.py` 中的导入路径
														
 
															-3. **模型文件**: 确保模型文件在 `~/.cache/modelscope/hub/models/OpenDataLab/` 下
														
 
															+**错误信息**:
														
 
															+```
														
 
															+FileNotFoundError: [Errno 2] No such file or directory: 'vendor/pytorchocr/utils/resources/dict/ppocrv5_dict.txt'
														
 
															+```
														
 
															+
														
 
															+**解决方案**:
														
 
															+```python
														
 
															+# 检查字典文件是否存在（假设当前脚本位于 vendor/ 目录下）
														
 
															+from pathlib import Path
														
 
															+dict_path = Path(__file__).parent / "pytorchocr/utils/resources/dict/ppocrv5_dict.txt"
														
 
															+assert dict_path.exists(), f"Dictionary not found: {dict_path}"
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+### 2. 识别结果为空
														
 
															+
														
 
															+**原因**: 字符集未加载到 `postprocess_op.character`
														
 
															+
														
 
															+**解决方案**:
														
 
															+```python
														
 
															+# 在 pytorch_paddle.py 的 __init__ 中验证
														
 
															+if hasattr(self.text_recognizer, 'postprocess_op'):
														
 
															+    char_count = len(self.text_recognizer.postprocess_op.character)
														
 
															+    print(f"Character set size: {char_count}")  # 应该 > 0
														
 
															+```
														
 
															-## 🔄 备选方案
														
 
															+---
														
 
															-如果 vendor 方式有问题，可以直接在运行时添加 MinerU 到 PYTHONPATH：
														
 
															+### 3. ImportError: No module named 'vendor'
														
 
															+**解决方案**:
														
 
															 ```python
														
 
															 import sys
														
 
															 from pathlib import Path
														
 
															-mineru_root = Path("/Users/zhch158/workspace/repository.git/MinerU").resolve()
														
 
															-sys.path.insert(0, str(mineru_root))
														
 
															+# 将项目根目录加入 sys.path（效果等同于设置 PYTHONPATH）
														
 
															+root_dir = Path(__file__).resolve().parent.parent
														
 
															+sys.path.insert(0, str(root_dir))
														
 
															 ```
														
 
															-```bash
														
 
															-pip install omegaconf
														
 
															-```
														
 
															+---
														
 
															+
														
 
															+## 📚 模型来源
														
 
															+
														
 
															+所有模型均来自 **MinerU** 和 **PaddleOCR**：
														
 
															+
														
 
															+- **检测模型**: [PaddleOCR Detection](https://github.com/PaddlePaddle/PaddleOCR/blob/main/doc/doc_ch/algorithm_det_db.md)
														
 
															+- **识别模型**: [PP-OCRv5](https://github.com/PaddlePaddle/PaddleOCR/blob/main/doc/doc_ch/PP-OCRv5_introduction.md)
														
 
															+- **字典文件**: [PaddleOCR Dictionaries](https://github.com/PaddlePaddle/PaddleOCR/tree/main/ppocr/utils/dict)
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 🔄 更新说明
														
 
															+
														
 
															+### 与 MinerU 的差异
														
 
															+
														
 
															+| 项目 | MinerU | Vendor (本项目) |
														
 
															+|------|--------|----------------|
														
 
															+| **方向分类** | 两阶段（文本检测 + 分类器） | 集成到 `PytorchPaddleOCR` |
														
 
															+| **可视化** | 无内置 | ✅ 内置 `visualize()` 方法 |
														
 
															+| **字符集验证** | 无 | ✅ 自动验证并修复 |
														
 
															+| **检测框合并** | 手动调用 | ✅ 可选自动合并 |
														
 
															+
														
 
															+### 版本历史
														
 
															+
														
 
															+- **v2.0** (2024-10-30): 
														
 
															+  - ✅ 集成方向分类到 OCR 流程
														
 
															+  - ✅ 新增可视化功能
														
 
															+  - ✅ 修复字符集加载问题
														
 
															+  - ✅ 优化检测框合并逻辑
														
 
															+
														
 
															+- **v1.0** (2024-10-01):
														
 
															+  - 初始版本，从 MinerU 提取核心模块
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 📄 许可证
														
 
															+
														
 
															+本模块基于 MinerU 和 PaddleOCR，遵循其原有许可证：
														
 
															+
														
 
															+- **MinerU**: Apache-2.0
														
 
															+- **PaddleOCR**: Apache-2.0
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 🙏 致谢
														
 
															+
														
 
															+- [MinerU](https://github.com/opendatalab/MinerU) - PDF 文档解析工具
														
 
															+- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - 百度 OCR 工具包
														
 
															+
														
 
															+---
														
 
															+
														
 
															+**最后更新**: 2024-10-30