|
|
@@ -4,6 +4,13 @@ from conf import conf
|
|
|
import subprocess
|
|
|
from lib import common
|
|
|
import logging
|
|
|
+import os
|
|
|
+import json
|
|
|
+
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
+from magic_pdf.pipe.UNIPipe import UNIPipe
|
|
|
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
|
|
# Paths used by the tests below, looked up once from the project config
# mapping (conf.conf) at import time.
pdf_res_path, code_path, pdf_dev_path = (
    conf.conf["pdf_res_path"],
    conf.conf["code_path"],
    conf.conf["pdf_dev_path"],
)
|
|
|
@@ -18,6 +25,29 @@ class TestCli:
|
|
|
common.check_shell(cmd)
|
|
|
#common.count_folders_and_check_contents(pdf_res_path)
|
|
|
|
|
|
def test_pdf_sdk(self):
    """Parse every PDF in ``pdf_dev_path`` through the SDK (UNIPipe) API.

    For each ``<name>.pdf`` found directly in ``pdf_dev_path``, the sibling
    ``<name>.json`` is loaded and passed to the pipe as ``model_list``.
    The pipe is then run (classify -> parse -> make markdown) and the
    returned markdown is written to ``<name>.md`` in the current working
    directory.
    """
    pdf_suffix = '.pdf'
    # Strip only the trailing ".pdf". Splitting on the first dot would
    # truncate names such as "report.v2.pdf" to "report", and the
    # .pdf/.json lookups below would then point at files that do not exist.
    demo_names = [
        pdf_file[:-len(pdf_suffix)]
        for pdf_file in os.listdir(pdf_dev_path)
        if pdf_file.endswith(pdf_suffix)
    ]
    # Loop-invariant: the image directory name passed to the markdown step.
    image_dir = str(os.path.basename(pdf_dev_path))
    for demo_name in demo_names:
        model_path = os.path.join(pdf_dev_path, f"{demo_name}.json")
        pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
        # Context managers close both inputs even if reading/parsing fails.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        with open(model_path, "r", encoding="utf-8") as model_file:
            model_json = json.load(model_file)
        image_writer = DiskReaderWriter(pdf_dev_path)
        jso_useful_key = {"_pdf_type": "", "model_list": model_json}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
        with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
            f.write(md_content)
|
|
|
+
|
|
|
# def test_pdf_specify_jsonl(self):
|
|
|
# """
|
|
|
# 输入jsonl, 默认方式解析
|