Переглянути джерело

refactor(model download script)

- Remove the download scripts (`download_models.py`, `download_models_hf.py`) from the `docs` directory and consolidate them in the `scripts` directory.
- Update the JSON template URL in the script to the master branch.
- Update the script download links in the documentation to point to the `scripts` directory.
- Update the table recognition model in the configuration template to `rapid_table`.
myhloli 1 рік тому
батько
коміт
9496c6c4cb

+ 0 - 59
docs/download_models.py

@@ -1,59 +0,0 @@
-import json
-import os
-
-import requests
-from modelscope import snapshot_download
-
-
-def download_json(url):
-    # 下载JSON文件
-    response = requests.get(url)
-    response.raise_for_status()  # 检查请求是否成功
-    return response.json()
-
-
-def download_and_modify_json(url, local_filename, modifications):
-    if os.path.exists(local_filename):
-        data = json.load(open(local_filename))
-        config_version = data.get('config_version', '0.0.0')
-        if config_version < '1.0.0':
-            data = download_json(url)
-    else:
-        data = download_json(url)
-
-    # 修改内容
-    for key, value in modifications.items():
-        data[key] = value
-
-    # 保存修改后的内容
-    with open(local_filename, 'w', encoding='utf-8') as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-if __name__ == '__main__':
-    mineru_patterns = [
-        "models/Layout/LayoutLMv3/*",
-        "models/Layout/YOLO/*",
-        "models/MFD/YOLO/*",
-        "models/MFR/unimernet_small/*",
-        "models/TabRec/TableMaster/*",
-        "models/TabRec/StructEqTable/*",
-    ]
-    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
-    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
-    model_dir = model_dir + '/models'
-    print(f'model_dir is: {model_dir}')
-    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
-
-    json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
-    config_file_name = 'magic-pdf.json'
-    home_dir = os.path.expanduser('~')
-    config_file = os.path.join(home_dir, config_file_name)
-
-    json_mods = {
-        'models-dir': model_dir,
-        'layoutreader-model-dir': layoutreader_model_dir,
-    }
-
-    download_and_modify_json(json_url, config_file, json_mods)
-    print(f'The configuration file has been configured successfully, the path is: {config_file}')

+ 0 - 66
docs/download_models_hf.py

@@ -1,66 +0,0 @@
-import json
-import os
-
-import requests
-from huggingface_hub import snapshot_download
-
-
-def download_json(url):
-    # 下载JSON文件
-    response = requests.get(url)
-    response.raise_for_status()  # 检查请求是否成功
-    return response.json()
-
-
-def download_and_modify_json(url, local_filename, modifications):
-    if os.path.exists(local_filename):
-        data = json.load(open(local_filename))
-        config_version = data.get('config_version', '0.0.0')
-        if config_version < '1.0.0':
-            data = download_json(url)
-    else:
-        data = download_json(url)
-
-    # 修改内容
-    for key, value in modifications.items():
-        data[key] = value
-
-    # 保存修改后的内容
-    with open(local_filename, 'w', encoding='utf-8') as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-if __name__ == '__main__':
-
-    mineru_patterns = [
-        "models/Layout/LayoutLMv3/*",
-        "models/Layout/YOLO/*",
-        "models/MFD/YOLO/*",
-        "models/MFR/unimernet_small/*",
-        "models/TabRec/TableMaster/*",
-        "models/TabRec/StructEqTable/*",
-    ]
-    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
-
-    layoutreader_pattern = [
-        "*.json",
-        "*.safetensors",
-    ]
-    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
-
-    model_dir = model_dir + '/models'
-    print(f'model_dir is: {model_dir}')
-    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
-
-    json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
-    config_file_name = 'magic-pdf.json'
-    home_dir = os.path.expanduser('~')
-    config_file = os.path.join(home_dir, config_file_name)
-
-    json_mods = {
-        'models-dir': model_dir,
-        'layoutreader-model-dir': layoutreader_model_dir,
-    }
-
-    download_and_modify_json(json_url, config_file, json_mods)
-    print(f'The configuration file has been configured successfully, the path is: {config_file}')

+ 1 - 1
docs/how_to_download_models_en.md

@@ -8,7 +8,7 @@ Use a Python Script to Download Model Files from Hugging Face
 
 ```bash
 pip install huggingface_hub
-wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
 python download_models_hf.py
 ```
 

+ 2 - 2
docs/how_to_download_models_zh_cn.md

@@ -8,7 +8,7 @@
   <summary>方法一:从 Hugging Face 下载模型</summary>
   <p>使用python脚本 从Hugging Face下载模型文件</p>
   <pre><code>pip install huggingface_hub
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
 python download_models_hf.py</code></pre>
 </details>
 
@@ -18,7 +18,7 @@ python download_models_hf.py</code></pre>
 
 ```bash
 pip install modelscope
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
 python download_models.py
 ```
 

+ 1 - 1
magic-pdf.template.json

@@ -15,7 +15,7 @@
         "enable": true
     },
     "table-config": {
-        "model": "tablemaster",
+        "model": "rapid_table",
         "enable": false,
         "max_time": 400
     },

+ 1 - 1
scripts/download_models.py

@@ -45,7 +45,7 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
     home_dir = os.path.expanduser('~')
     config_file = os.path.join(home_dir, config_file_name)

+ 1 - 1
scripts/download_models_hf.py

@@ -52,7 +52,7 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
+    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
     home_dir = os.path.expanduser('~')
     config_file = os.path.join(home_dir, config_file_name)