Procházet zdrojové kódy

feat(docs): automate model download and configuration

- Add scripts to download models and update configuration file
- Remove manual steps for modifying model paths
- Update documentation for both ModelScope and HuggingFace model downloads
- Improve user experience by automating the entire process
myhloli před 1 rokem
rodič
revize
cf38577943

+ 45 - 4
docs/download_models.py

@@ -1,5 +1,46 @@
-# use modelscope sdk download models
+import os
+import requests
+import json
 from modelscope import snapshot_download
-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
-print(f"model dir is: {model_dir}/models")
+
+
+def download_and_modify_json(url, local_filename, modifications):  # load local_filename if it exists, else fetch url; apply modifications; write back
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))  # NOTE(review): handle is never closed explicitly and no encoding is passed
+    else:
+        # Download the JSON file
+        response = requests.get(url)
+        response.raise_for_status()  # Check that the request succeeded
+
+        # Parse the JSON content
+        data = response.json()
+
+    # Apply the modifications (each key is set or overwritten at the top level)
+    for key, value in modifications.items():
+        data[key] = value
+
+    # Save the modified content to local_filename
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
+    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
+    model_dir = model_dir + "/models"  # the configured path is the "models" entry under the downloaded path
+    print(f"model_dir is: {model_dir}")
+    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
+
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'  # template, fetched only when no local config exists
+    config_file_name = "magic-pdf.json"
+    home_dir = os.path.expanduser("~")
+    config_file = os.path.join(home_dir, config_file_name)  # config is written to the user's home directory
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)  # create or update the config with both model paths
+    print(f"The configuration file has been configured successfully, the path is: {config_file}")
+

+ 45 - 3
docs/download_models_hf.py

@@ -1,4 +1,46 @@
+import os
+import requests
+import json
 from huggingface_hub import snapshot_download
-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-layoutreader_model_dir = snapshot_download('hantian/layoutreader')
-print(f"model dir is: {model_dir}/models")
+
+
+def download_and_modify_json(url, local_filename, modifications):  # load local_filename if it exists, else fetch url; apply modifications; write back
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))  # NOTE(review): handle is never closed explicitly and no encoding is passed
+    else:
+        # Download the JSON file
+        response = requests.get(url)
+        response.raise_for_status()  # Check that the request succeeded
+
+        # Parse the JSON content
+        data = response.json()
+
+    # Apply the modifications (each key is set or overwritten at the top level)
+    for key, value in modifications.items():
+        data[key] = value
+
+    # Save the modified content to local_filename
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
+    layoutreader_model_dir = snapshot_download('hantian/layoutreader')
+    model_dir = model_dir + "/models"  # the configured path is the "models" entry under the downloaded path
+    print(f"model_dir is: {model_dir}")
+    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
+
+    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'  # template, fetched only when no local config exists
+    config_file_name = "magic-pdf.json"
+    home_dir = os.path.expanduser("~")
+    config_file = os.path.join(home_dir, config_file_name)  # config is written to the user's home directory
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)  # create or update the config with both model paths
+    print(f"The configuration file has been configured successfully, the path is: {config_file}")
+

+ 1 - 4
docs/how_to_download_models_en.md

@@ -10,11 +10,8 @@ pip install huggingface_hub
 wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
 python download_models_hf.py
 ```
-After the Python script finishes executing, it will output the directory where the models are downloaded.
+The Python script will automatically download the model files and configure the model directory in the configuration file.
 
-### 2. To modify the model path address in the configuration file
-
-Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
 
 
 # How to update models previously downloaded

+ 3 - 5
docs/how_to_download_models_zh_cn.md

@@ -22,12 +22,10 @@ pip install modelscope
 wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
 python download_models.py
 ```
-python脚本执行完毕后,会输出模型下载目录
-
-
-## 下载完成后的操作:修改magic-pdf.json中的模型路径
-在`~/magic-pdf.json`里修改模型的目录指向上一步脚本输出的models目录的绝对路径,否则会报模型无法加载的错误。
+python脚本会自动下载模型文件并配置好配置文件中的模型目录
 
+配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
+> Windows的用户目录为 "C:\\Users\\用户名", Linux的用户目录为 "/home/用户名", macOS的用户目录为 "/Users/用户名"
 
 
 # 此前下载过模型,如何更新