2 kuukautta sitten · ad97851be8
--- a/zhch/study-notes.md
+++ b/zhch/study-notes.md
@@ -1,5 +1,7 @@
 
				 # [PaddleX本地安装教程](https://paddlepaddle.github.io/PaddleX/latest/installation/installation.html)
			
 
				+
			
 
				 # 环境选择
			
 
				+
			
 
				 ```
			
 
				 git clone https://gitee.com/zhch158_admin/PaddleX.git
			
 
				 
			
@@ -13,8 +15,12 @@ cd PaddleX
 
				 uv venv paddle_env --python 3.11
			
 
				 #激活环境
			
 
				 source paddle_env/bin/activate
			
 
				+
			
 
				+uv pip install pytest PyMuPDF
			
 
				 ```
			
 
				+
			
 
				 # 安装依赖 CPU
			
 
				+
			
 
				 ```
			
 
				 uv pip install paddlepaddle==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
			
 
				 uv pip install -e .
			
@@ -23,6 +29,7 @@ paddlex --install PaddleOCR PaddleDetection PaddleClas  # 例如PaddleOCR
 
				 ```
			
 
				 
			
 
				 # linux GPU - 更新版本以匹配NVML 570.169
			
 
				+
			
 
				 ```bash
			
 
				 # 首先检查NVIDIA驱动版本
			
 
				 nvidia-smi
			
@@ -45,7 +52,26 @@ paddlex --install PaddleOCR PaddleDetection PaddleClas
 
				 python -c "import paddle; print(f'CUDA available: {paddle.device.is_compiled_with_cuda()}'); print(f'GPU count: {paddle.device.cuda.device_count()}')"
			
 
				 ```
			
 
				 
			
 
				+# 模型下载源
			
 
				+PaddleX 支持通过环境变量 `PADDLE_PDX_MODEL_SOURCE` 指定模型下载源。根据 `official_models.py` 中的实现，支持以下下载源：
			
 
				+
			
 
				+**支持的下载源：**
			
 
				+- `huggingface` - HuggingFace（默认优先级）
			
 
				+- `modelscope` - ModelScope
			
 
				+- `bos` - 百度云存储
			
 
				+- `aistudio` - AI Studio
			
 
				+```bash
			
 
				+# 使用百度云存储源
			
 
				+export PADDLE_PDX_MODEL_SOURCE="bos"
			
 
				+python ppstructurev3_single_process.py --input_file "file.pdf" --output_dir "./output"
			
 
				+
			
 
				+# 使用 ModelScope 源
			
 
				+export PADDLE_PDX_MODEL_SOURCE="modelscope"
			
 
				+python ppstructurev3_single_process.py --input_file "file.pdf" --output_dir "./output"
			
 
				+```
			
 
				+
			
 
				 # 设置launch.json
			
 
				+
			
 
				 ```json
			
 
				 	"configurations": [
			
 
				 		{
			
@@ -85,6 +111,7 @@ python -c "import paddle; print(f'CUDA available: {paddle.device.is_compiled_wit
 
				 ```
			
 
				 
			
 
				 # 测试样例
			
 
				+
			
 
				 ```
			
 
				 paddleocr ocr -i zhch/sample_data/600916_中国黄金_2002年报_83_94_2.png --save_path zhch/sample_data/output --enable_mkldnn=False  --device=cpu
			
 
				 
			
@@ -107,6 +134,7 @@ paddlex --pipeline table_recognition_v2 --use_doc_orientation_classify=True --us
 
				 ```
			
 
				 
			
 
				 # macOS 查看显卡信息
			
 
				+
			
 
				 ```
			
 
				 # 使用 system_profiler 命令查看 GPU 信息
			
 
				 system_profiler SPDisplaysDataType
			
@@ -117,14 +145,15 @@ system_profiler SPDisplaysDataType
 
				 ## 问题分析
			
 
				 
			
 
				 1. **配置文件中印章识别是关闭的**：
			
 
				+
			
 
				    - 在 `PP-StructureV3.yaml` 中，`use_seal_recognition: False`
			
 
				    - 这意味着初始化时没有加载印章识别相关的模型
			
 
				-
			
 
				 2. **命令行参数冲突**：
			
 
				+
			
 
				    - 您使用了 `--use_seal_recognition True` 参数试图启用印章识别
			
 
				    - 但由于初始化时没有加载相关模型，导致运行时检查失败
			
 
				-
			
 
				 3. **模型检查失败**：
			
 
				+
			
 
				    - 在 `_LayoutParsingPipelineV2.check_model_settings_valid` 方法中检测到模型未初始化
			
 
				    - 错误消息："Set use_seal_recognition, but the models for seal recognition are not initialized."
			
 
				 
			
@@ -145,17 +174,18 @@ paddlex --pipeline seal_recognition \
 
				 ## 完整的解决步骤
			
 
				 
			
 
				 1. **获取 PP-StructureV3 配置文件**：
			
 
				+
			
 
				    ````bash
			
 
				    paddlex --get_pipeline_config PP-StructureV3 --save_path ./my_config
			
 
				    ````
			
 
				-
			
 
				 2. **修改配置文件**：
			
 
				+
			
 
				    ````yaml
			
 
				    # 在 my_config/PP-StructureV3.yaml 中修改：
			
 
				    use_seal_recognition: True  # 改为 True
			
 
				    ````
			
 
				-
			
 
				 3. **使用修改后的配置文件**：
			
 
				+
			
 
				    ````bash
			
 
				    paddlex --pipeline ./my_config/PP-StructureV3.yaml \
			
 
				        --input sample_data/300674-母公司现金流量表-扫描.png \
			
@@ -172,9 +202,16 @@ paddlex --pipeline seal_recognition \
 
				 这是 PaddleX 的设计机制，确保只有在配置文件中明确启用的功能才会被初始化和使用。
			
 
				 
			
 
				 # 运行批量识别OmniDocBench数据集
			
 
				-	- 只能单进程，
			
 
				+
			
 
				+- 只能单进程运行。
			
 
				+
			
 
				 ```bash
			
 
				 cd ~/zhch/PaddleX/zhch
			
 
				 conda activate paddle
			
 
				+# export PADDLE_PDX_MODEL_SOURCE="modelscope"
			
 
				+export PADDLE_PDX_MODEL_SOURCE="bos"
			
 
				+
			
 
				 python ppstructurev3_single_process.py
			
 
				-```
			
 
				+
			
 
				+python ppstructurev3_single_process.py --input_file "/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.pdf" --output_dir "./data_PPStructureV3_Results" --pipeline "./my_config/PP-StructureV3.yaml" --device "gpu:0"
			
 
				+```