| | há 1 mês atrás | |
|---|---|---|
| .. | ||
| sample_data | há 2 meses atrás | |
| utils | há 1 mês atrás | |
| Finance_report_extract.md | há 3 meses atrás | |
| README.md | há 1 mês atrás | |
| annual_report_kg_solution.md | há 3 meses atrás | |
| demo.py | há 2 meses atrás | |
| demo_zhch_v1.py | há 2 meses atrás | |
| download_models.py | há 11 meses atrás | |
| html_zhch.py | há 11 meses atrás | |
| magic-pdf-0.json | há 11 meses atrás | |
| magic-pdf.json | há 11 meses atrás | |
| magic_pdf_parse_main_zhch.py | há 11 meses atrás | |
| mineru.json | há 2 meses atrás | |
| mineru_pipeline.md | há 3 meses atrás | |
| mineru_pipeline_flow.md | há 3 meses atrás | |
| mineru_vllm_daemon.sh | há 1 mês atrás | |
git clone https://gitee.com/zhch158_admin/MinerU.git -c user.name=zhch158_admin -c user.email=zhch158@sina.com
cd MinerU
git config --local user.name "zhch158_admin"
git config --local user.email "zhch158@sina.com"
conda create -n mineru2 python=3.12
conda activate mineru2
pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# 查看PyTorch版本
python -c "import torch; print(torch.__version__)"
# 应该用不着安装
# uv pip install flash-attn==2.8.3 --find-links https://github.com/Dao-AILab/flash-attention/releases
uv pip install -U -e '.[all]'
# 从 ModelScope 下载模型权重文件,模型会保存在 $MODELSCOPE_CACHE 目录下
python -m mineru.cli.models_download
# .env中加入model配置
MINERU_TOOLS_CONFIG_JSON="/home/dev/zhch/src/MinerU/mineru.json"
# .env 文件内容:
NLTK_DATA="/home/ubuntu/nltk_data"
HF_HOME="/home/ubuntu/models/hf_home"
HF_ENDPOINT=https://hf-mirror.com
HF_HUB_OFFLINE=0
TORCH_HOME="/home/ubuntu/models/torch/"
MODELSCOPE_CACHE="/home/ubuntu/models/modelscope_cache"
USE_MODELSCOPE_HUB=1
CUDA_VISIBLE_DEVICES=0,1,2,3
CUDA_VISIBLE_DEVICES="0" mineru-vllm-server --port 8121 --gpu-memory-utilization 0.3
export MINERU_TOOLS_CONFIG_JSON="/home/ubuntu/zhch/MinerU/mineru.json"
python -m mineru.cli.client -p /home/ubuntu/zhch/OmniDocBench/OpenDataLab___OmniDocBench/images -o /home/ubuntu/zhch/MinerU/zhch/OmniDocBench_MinerU-pipeline-2.1.1_Results --lang ch --source modelscope
export MINERU_TOOLS_CONFIG_JSON="/home/ubuntu/zhch/MinerU/mineru.json"
python -m mineru.cli.client -p /home/ubuntu/zhch/data/流水分析/A用户_单元格扫描流水.img -o /home/ubuntu/zhch/data/流水分析/A用户_单元格扫描流水/mineru-pipleline-2.5.3_Results --lang ch --source modelscope
python -m mineru.cli.client -p /home/ubuntu/zhch/data/流水分析/B用户_扫描流水.img -o /home/ubuntu/zhch/data/流水分析/B用户_扫描流水/mineru-pipleline-2.5.3_Results --lang ch --source modelscope
CUDA_VISIBLE_DEVICES=3 python -m mineru.cli.client -p /home/ubuntu/zhch/data/流水分析/B用户_扫描流水.img -o /home/ubuntu/zhch/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results --backend vlm-vllm-engine --gpu-memory-utilization 0.3 --source modelscope
export MINERU_TOOLS_CONFIG_JSON="/home/ubuntu/zhch/MinerU/mineru.json"
python -m mineru.cli.client -p /home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.img -o /home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司/mineru-pipleline-2.5.3_Results --lang ch --source modelscope
export MINERU_TOOLS_CONFIG_JSON="/home/ubuntu/zhch/MinerU/mineru.json"
CUDA_VISIBLE_DEVICES=3 python -m mineru.cli.client -p /home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.img -o /home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司/mineru-vlm-2.5.3_Results --backend vlm-vllm-engine --gpu-memory-utilization 0.3 --source modelscope
# mac vlm-transformers
export MINERU_TOOLS_CONFIG_JSON="/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru.json"
python -m mineru.cli.client -p /Users/zhch158/workspace/data/流水分析/B用户_扫描流水.1/data_PPStructureV3_Results/B用户_扫描流水/B用户_扫描流水_page_002.png -o /Users/zhch158/workspace/data/流水分析/B用户_扫描流水.1/mineru-vlm-2.5.3_Results --backend vlm-transformers --source modelscope
# mac vlm-vllm-engine
export MINERU_TOOLS_CONFIG_JSON="/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru.json"
python -m mineru.cli.client -p /Users/zhch158/workspace/data/流水分析/B用户_扫描流水.1/data_PPStructureV3_Results/B用户_扫描流水/B用户_扫描流水_page_002.png -o /Users/zhch158/workspace/data/流水分析/B用户_扫描流水.1/mineru-vlm-2.5.3_Results --backend vlm-vllm-engine --max-num-batched-tokens 16384 --source modelscope