quyuan 1 年之前
父節點
當前提交
2e79da594f
共有 4 個文件被更改,包括 31 次插入和 7 次删除
  1. 0 2
      .github/workflows/benchmark.yml
  2. 1 1
      tests/benchmark/benchmark.py
  3. 0 4
      tests/benchmark/env.sh
  4. 30 0
      tests/test_cli/test_cli.py

+ 0 - 2
.github/workflows/benchmark.yml

@@ -37,6 +37,4 @@ jobs:
         echo "start test"
         cd $GITHUB_WORKSPACE/tests/benchmark/ 
         tree
-        sh env.sh
-        python benchmark.py
   

+ 1 - 1
tests/benchmark/benchmark.py

@@ -18,7 +18,7 @@ def test_cli():
     rm_cmd = f"rm -rf {pdf_res_path}"
     os.system(rm_cmd)
     os.makedirs(pdf_res_path)
-    cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")} --inside_model true'
+    cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
     os.system(cmd)
     for root, dirs, files in os.walk(pdf_res_path):
          for magic_file in files:

+ 0 - 4
tests/benchmark/env.sh

@@ -1,8 +1,4 @@
 conda create -n MinerU python=3.10
 conda activate MinerU
 pip install magic-pdf
-pip install magic-pdf[full-cpu]
-pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
-git lfs install
-git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
 #cp magic-pdf.template.json ~/magic-pdf.json

+ 30 - 0
tests/test_cli/test_cli.py

@@ -4,6 +4,13 @@ from conf import conf
 import subprocess
 from lib import common
 import logging
+import os
+import json
+
+from loguru import logger
+
+from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 pdf_res_path = conf.conf["pdf_res_path"]
 code_path = conf.conf["code_path"]
 pdf_dev_path = conf.conf["pdf_dev_path"]
@@ -18,6 +25,29 @@ class TestCli:
         common.check_shell(cmd)
         #common.count_folders_and_check_contents(pdf_res_path)      
    
+    def test_pdf_sdk(self):
+        """
+        Parse every PDF found in ``pdf_dev_path`` through the SDK (UNIPipe).
+
+        For each ``<name>.pdf`` the sibling ``<name>.json`` is loaded and
+        passed to the pipeline as ``model_list``; the generated markdown is
+        written to ``<name>.md`` in the current working directory.
+        """
+        # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); a plain
+        # split('.')[0] would truncate it and point at the wrong files.
+        demo_names = [
+            os.path.splitext(pdf_file)[0]
+            for pdf_file in os.listdir(pdf_dev_path)
+            if pdf_file.endswith('.pdf')
+        ]
+        for demo_name in demo_names:
+            model_path = os.path.join(pdf_dev_path, f"{demo_name}.json")
+            pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
+            # Context managers close both handles even if reading/parsing fails.
+            with open(pdf_path, "rb") as pdf_fp:
+                pdf_bytes = pdf_fp.read()
+            with open(model_path, "r", encoding="utf-8") as model_fp:
+                model_json = json.load(model_fp)
+            image_writer = DiskReaderWriter(pdf_dev_path)
+            image_dir = str(os.path.basename(pdf_dev_path))
+            # "_pdf_type" is left empty here; pipe_classify() is called before
+            # parsing (presumably to fill it in -- confirm against UNIPipe).
+            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+            pipe.pipe_classify()
+            pipe.pipe_parse()
+            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+            with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
+                f.write(md_content)
+
     # def test_pdf_specify_jsonl(self):
     #     """
     #     输入jsonl, 默认方式解析