zhch158_admin 11 mesiacov pred
rodič
commit
6df07a7baf

BIN
zhch/600916_中国黄金_2002年报_83_94.pdf


BIN
zhch/600916_合并现金流量表有错.png


Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 8 - 0
zhch/demo1.md


BIN
zhch/demo1.pdf


BIN
zhch/demo2.pdf


+ 29 - 0
zhch/demo_zhch.py

@@ -0,0 +1,29 @@
+import os
+
+from loguru import logger
+
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.pipe.UNIPipe import UNIPipe
+
+from dotenv import load_dotenv; load_dotenv()
+print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}") 
+print(f"os.environ['MINERU_TOOLS_CONFIG_JSON']: {os.environ['MINERU_TOOLS_CONFIG_JSON']}")
+
+try:
+    current_script_dir = os.path.dirname(os.path.abspath(__file__))
+    demo_name = '600916_中国黄金_2002年报_83_94'
+    pdf_path = os.path.join(current_script_dir, f'{demo_name}.pdf')
+    pdf_bytes = open(pdf_path, 'rb').read()
+    jso_useful_key = {'_pdf_type': '', 'model_list': []}
+    local_image_dir = os.path.join(current_script_dir, 'images')
+    image_dir = str(os.path.basename(local_image_dir))
+    image_writer = FileBasedDataWriter(local_image_dir)
+    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+    pipe.pipe_classify()
+    pipe.pipe_analyze()
+    pipe.pipe_parse()
+    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
+    with open(f'{demo_name}.md', 'w', encoding='utf-8') as f:
+        f.write(md_content)
+except Exception as e:
+    logger.exception(e)

+ 63 - 0
zhch/download_models.py

@@ -0,0 +1,63 @@
+import json
+import os
+
+import requests
+from modelscope import snapshot_download
+
+from dotenv import load_dotenv; load_dotenv()
+print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}") 
+print(f"os.environ['HF_ENDPOINT']: {os.environ['HF_ENDPOINT']}") 
+
+def download_json(url):
+    # 下载JSON文件
+    response = requests.get(url)
+    response.raise_for_status()  # 检查请求是否成功
+    return response.json()
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    if os.path.exists(local_filename):
+        data = json.load(open(local_filename))
+        config_version = data.get('config_version', '0.0.0')
+        if config_version < '1.0.0':
+            data = download_json(url)
+    else:
+        data = download_json(url)
+
+    # 修改内容
+    for key, value in modifications.items():
+        data[key] = value
+
+    # 保存修改后的内容
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    mineru_patterns = [
+        "models/Layout/LayoutLMv3/*",
+        "models/Layout/YOLO/*",
+        "models/MFD/YOLO/*",
+        "models/MFR/unimernet_small/*",
+        "models/TabRec/TableMaster/*",
+        "models/TabRec/StructEqTable/*",
+    ]
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_file_pattern=mineru_patterns)
+    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
+    model_dir = model_dir + '/models'
+    print(f'model_dir is: {model_dir}')
+    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
+    config_file_name = 'magic-pdf.json'
+    # home_dir = os.path.expanduser('~')
+    home_dir = os.path.dirname(os.path.abspath(__file__))
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f'The configuration file has been configured successfully, the path is: {config_file}')

+ 31 - 0
zhch/magic-pdf.json

@@ -0,0 +1,31 @@
+{
+    "bucket_info": {
+        "bucket-name-1": [
+            "ak",
+            "sk",
+            "endpoint"
+        ],
+        "bucket-name-2": [
+            "ak",
+            "sk",
+            "endpoint"
+        ]
+    },
+    "models-dir": "/home/dev/models/modelscope_cache/hub/opendatalab/PDF-Extract-Kit-1___0/models",
+    "layoutreader-model-dir": "/home/dev/models/modelscope_cache/hub/ppaanngggg/layoutreader",
+    "device-mode": "cuda",
+    "layout-config": {
+        "model": "layoutlmv3"
+    },
+    "formula-config": {
+        "mfd_model": "yolo_v8_mfd",
+        "mfr_model": "unimernet_small",
+        "enable": true
+    },
+    "table-config": {
+        "model": "rapid_table",
+        "enable": true,
+        "max_time": 400
+    },
+    "config_version": "1.0.0"
+}

BIN
zhch/small_ocr.pdf


+ 25 - 6
zhch/study-notes.md

@@ -10,16 +10,35 @@ git config --local user.email "zhch158@sina.com"
 
 ## 1.2 python 安装依赖包
 ```
-conda activate py311
-pip install pyMuPDF
+conda create -n MinerU python=3.10
+conda activate MinerU
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+
+conda install pytorch torchvision torchaudio cudatoolkit=11.7 -c pytorch -c nvidia
+
+pip install torch==2.3.1
+pip install torchtext==0.18.0
+pip install torchvision==0.18
+
+pip install -U -e .
+
+# pip install pyMuPDF
+# pip install doclayout_yolo
+# pip install detectron2
 
 ```
 
-## 1.3 huggiingface 下载模型
+## 1.3 modelscope 下载模型
 ```
-huggingface-cli login
-cd ~/zhch/models
-ln -s /home/dev/models/hf_home/hub/models--stepfun-ai--GOT-OCR2_0/snapshots/cf6b7386bc89a54f09785612ba74cb12de6fa17c stepfun-ai--GOT-OCR2_0
+# 从modelscope下载模型权重文件
+cd zhch
+set -a
+source .env
+python download_models.py
+
+# .env中加入model配置
+MINERU_TOOLS_CONFIG_JSON="/home/dev/zhch/src/MinerU/zhch/magic-pdf.json"
+
 ```
 
 .env

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov