| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- """test cli and sdk."""
- import logging
- import os
- import pytest
- from conf import conf
- from lib import common
- import time
- import magic_pdf.model as model_config
- from magic_pdf.data.read_api import read_local_images
- from magic_pdf.data.read_api import read_local_office
- from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
- from magic_pdf.config.make_content_config import DropMode, MakeMode
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
- from magic_pdf.data.dataset import PymuDocDataset
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
- from magic_pdf.config.enums import SupportedPdfParseMethod
# All locations come from the shared test configuration mapping.
_settings = conf.conf
# Output directory used by the CLI tests that parse the research report.
pdf_res_path = _settings['pdf_res_path']
# Not referenced by the tests visible in this module.
code_path = _settings['code_path']
# Root directory holding the fixture sub-folders (pdf, ppt, images, doc).
pdf_dev_path = _settings['pdf_dev_path']
# Config file handed to common.update_config_file() before CLI/SDK runs.
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli:
    """End-to-end smoke tests for the magic-pdf SDK pipeline and the CLI.

    Fixture documents are read from sub-folders of ``pdf_dev_path``; results
    are written below ``pdf_dev_path/mineru`` (SDK and per-mode CLI tests) or
    ``pdf_res_path`` (config-driven CLI tests).
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run before every test: clear GPU memory and set device-mode to cuda."""
        common.clear_gpu_memory()
        common.update_config_file(magic_pdf_config, "device-mode", "cuda")
        # Any extra pre-test preparation can be added here.
        yield

    # ------------------------------------------------------------------ #
    # private helpers (not collected by pytest: names do not start with
    # ``test``)
    # ------------------------------------------------------------------ #
    @staticmethod
    def _list_demo_names(folder, suffix):
        """Return sorted names of files in *folder* ending with *suffix*.

        Only the trailing *suffix* is removed, so a file such as
        ``report.v2.pdf`` yields ``report.v2`` (splitting on the first dot
        would yield ``report`` and the file could not be found again).

        Raises:
            OSError: if *folder* cannot be listed.
        """
        return sorted(
            entry[:-len(suffix)]
            for entry in os.listdir(folder)
            if entry.endswith(suffix)
        )

    @staticmethod
    def _run_cli(*args):
        """Log and run a shell command built from *args*; return its status.

        Every argument is shell-quoted so paths containing spaces or shell
        metacharacters are passed through intact. A non-zero status is only
        logged, not raised: the calling tests verify the produced output
        themselves.
        """
        import shlex  # local import: keeps this block self-contained

        cmd = ' '.join(shlex.quote(str(arg)) for arg in args)
        logging.info(cmd)
        status = os.system(cmd)
        if status != 0:
            logging.warning('command exited with status %s: %s', status, cmd)
        return status

    @staticmethod
    def _rapid_table_config(enable):
        """Return the ``table-config`` value for rapid_table with *enable*."""
        return {
            "model": "rapid_table",
            "enable": enable,
            "sub_model": "slanet_plus",
            "max_time": 400,
        }

    def _check_cli_mode(self, mode):
        """Parse every PDF fixture with ``magic-pdf -m <mode>`` and check output."""
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in self._list_demo_names(pdf_dir, '.pdf'):
            # Start from a clean result directory for each document.
            common.delete_file(res_path)
            self._run_cli(
                'magic-pdf',
                '-p', os.path.join(pdf_dir, f'{demo_name}.pdf'),
                '-o', res_path,
                '-m', mode,
            )
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, mode))

    def _run_dev_jsonl(self, method):
        """Run ``magic-pdf-dev --jsonl`` on ``line1.jsonl`` with *method*."""
        time.sleep(2)
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        self._run_cli('magic-pdf-dev', '--jsonl', jsonl_path,
                      '--method', method)

    def _run_dev_pdf_json(self, method):
        """Run ``magic-pdf-dev --pdf ... --json ...`` with *method*."""
        time.sleep(2)
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        self._run_cli('magic-pdf-dev', '--pdf', pdf_path,
                      '--json', json_path, '--method', method)

    def _parse_report_with_cli(self):
        """Parse the research-report fixture into a clean ``pdf_res_path``.

        Returns:
            The directory ``<pdf_res_path>/test_rearch_report/auto`` that the
            calling tests inspect.
        """
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        self._run_cli('magic-pdf', '-p', pdf_path, '-o', pdf_res_path)
        return os.path.join(pdf_res_path, "test_rearch_report", "auto")

    # ------------------------------------------------------------------ #
    # SDK tests
    # ------------------------------------------------------------------ #
    @pytest.mark.P0
    def test_pdf_local_sdk(self):
        """Run the SDK pipeline on every PDF fixture and dump all artefacts."""
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        for demo_name in self._list_demo_names(pdf_dir, '.pdf'):
            pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
            # NOTE(review): images go to <pdf_dev_path>/pdf/images here while
            # the other SDK tests use <pdf_dev_path>/mineru/images - confirm
            # this is intended.
            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
            image_dir = str(os.path.basename(local_image_dir))
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(dir_path)
            pdf_bytes = FileBasedDataReader("").read(pdf_path)
            ds = PymuDocDataset(pdf_bytes)

            # Inference + pipeline, chosen by the dataset's own classification.
            if ds.classify() == SupportedPdfParseMethod.OCR:
                infer_result = ds.apply(doc_analyze, ocr=True)
                pipe_result = infer_result.pipe_ocr_mode(image_writer)
            else:
                infer_result = ds.apply(doc_analyze, ocr=False)
                pipe_result = infer_result.pipe_txt_mode(image_writer)
            common.delete_file(dir_path)

            # Draw the model result on each page.
            infer_result.draw_model(
                os.path.join(dir_path, f"{demo_name}_model.pdf"))
            # The getters below are called only to exercise the API; their
            # return values are not inspected.
            infer_result.get_infer_res()
            # Draw layout and span results on each page.
            pipe_result.draw_layout(
                os.path.join(dir_path, f"{demo_name}_layout.pdf"))
            pipe_result.draw_span(
                os.path.join(dir_path, f"{demo_name}_spans.pdf"))
            # Markdown.
            pipe_result.get_markdown(image_dir)
            pipe_result.dump_md(md_writer, f"{demo_name}.md", image_dir)
            # Content list.
            pipe_result.get_content_list(image_dir)
            pipe_result.dump_content_list(
                md_writer, f"{demo_name}_content_list.json", image_dir)
            # Middle JSON.
            pipe_result.get_middle_json()
            pipe_result.dump_middle_json(md_writer, f'{demo_name}_middle.json')
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_pdf_s3_sdk(self):
        """Placeholder for the S3 SDK test; only enumerates the PDF fixtures.

        TODO: implement the S3 read/write round trip.
        """
        self._list_demo_names(os.path.join(pdf_dev_path, 'pdf'), '.pdf')

    @pytest.mark.P0
    def test_pdf_local_ppt(self):
        """Convert every .pptx fixture to markdown through the SDK."""
        ppt_dir = os.path.join(pdf_dev_path, 'ppt')
        for demo_name in self._list_demo_names(ppt_dir, '.pptx'):
            pptx_path = os.path.join(ppt_dir, f'{demo_name}.pptx')
            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
            image_dir = str(os.path.basename(local_image_dir))
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(dir_path)
            ds = read_local_office(pptx_path)[0]
            common.delete_file(dir_path)

            ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
                md_writer, f"{demo_name}.md", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_pdf_local_image(self):
        """Convert every .jpg fixture to markdown through the SDK (OCR mode)."""
        img_dir = os.path.join(pdf_dev_path, 'images')
        for demo_name in self._list_demo_names(img_dir, '.jpg'):
            img_path = os.path.join(img_dir, f'{demo_name}.jpg')
            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
            image_dir = str(os.path.basename(local_image_dir))
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            common.delete_file(dir_path)
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(dir_path)
            ds = read_local_images(img_path)[0]
            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
                md_writer, f"{demo_name}.md", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_local_image_dir(self):
        """Convert a whole directory of .png/.jpg images to numbered markdown."""
        img_dir = os.path.join(pdf_dev_path, 'images')
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(dir_path)
        common.delete_file(dir_path)
        dss = read_local_images(img_dir, suffixes=['.png', '.jpg'])
        # Output files are named 0.md, 1.md, ... in dataset order.
        for count, ds in enumerate(dss):
            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
                md_writer, f"{count}.md", image_dir)
        common.sdk_count_folders_and_check_contents(dir_path)

    def test_local_doc_parse(self):
        """Convert every .docx fixture to markdown through the SDK."""
        doc_dir = os.path.join(pdf_dev_path, 'doc')
        for demo_name in self._list_demo_names(doc_dir, '.docx'):
            docx_path = os.path.join(doc_dir, f'{demo_name}.docx')
            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
            image_dir = str(os.path.basename(local_image_dir))
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(dir_path)
            ds = read_local_office(docx_path)[0]
            common.delete_file(dir_path)

            ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
                md_writer, f"{demo_name}.md", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    # ------------------------------------------------------------------ #
    # magic-pdf CLI, one test per parse method
    # ------------------------------------------------------------------ #
    @pytest.mark.P0
    def test_pdf_cli_auto(self):
        """magic_pdf cli test auto."""
        self._check_cli_mode('auto')

    @pytest.mark.P0
    def test_pdf_cli_txt(self):
        """magic_pdf cli test txt."""
        self._check_cli_mode('txt')

    @pytest.mark.P0
    def test_pdf_cli_ocr(self):
        """magic_pdf cli test ocr."""
        self._check_cli_mode('ocr')

    # ------------------------------------------------------------------ #
    # magic-pdf-dev CLI
    # ------------------------------------------------------------------ #
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_txt(self):
        """magic_pdf_dev cli local txt."""
        self._run_dev_jsonl('txt')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_ocr(self):
        """magic_pdf_dev cli local ocr."""
        self._run_dev_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_auto(self):
        """magic_pdf_dev cli local auto."""
        self._run_dev_jsonl('auto')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_txt(self):
        """magic_pdf_dev cli s3 txt."""
        self._run_dev_jsonl('txt')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_ocr(self):
        """magic_pdf_dev cli s3 ocr."""
        self._run_dev_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_auto(self):
        """magic_pdf_dev cli s3 auto."""
        self._run_dev_jsonl('auto')

    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_auto(self):
        """magic_pdf_dev cli pdf+json auto."""
        self._run_dev_pdf_json('auto')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_ocr(self):
        """magic_pdf_dev cli pdf+json ocr."""
        # Pass 'ocr' to match the test name (previously 'auto' by copy-paste).
        self._run_dev_pdf_json('ocr')

    # ------------------------------------------------------------------ #
    # magic-pdf CLI driven by config-file switches
    # ------------------------------------------------------------------ #
    @pytest.mark.P1
    def test_local_magic_pdf_open_rapidai_table(self):
        """magic pdf cli with the rapid_table model enabled."""
        time.sleep(2)
        common.update_config_file(
            magic_pdf_config, "table-config", self._rapid_table_config(True))
        out_dir = self._parse_report_with_cli()
        res = common.check_html_table_exists(
            os.path.join(out_dir, "test_rearch_report.md"))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_doclayout_yolo(self):
        """magic pdf cli with the doclayout_yolo layout model."""
        time.sleep(2)
        common.update_config_file(
            magic_pdf_config, "layout-config", {"model": "doclayout_yolo"})
        out_dir = self._parse_report_with_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    # Skip reason below reads "layoutlmv3 deprecated".
    @pytest.mark.skip(reason="layoutlmv3废弃")
    @pytest.mark.P1
    def test_local_magic_pdf_layoutlmv3_yolo(self):
        """magic pdf cli with the layoutlmv3 layout model."""
        time.sleep(2)
        common.update_config_file(
            magic_pdf_config, "layout-config", {"model": "layoutlmv3"})
        out_dir = self._parse_report_with_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    @pytest.mark.P1
    def test_magic_pdf_cpu(self):
        """magic pdf cli cpu mode."""
        time.sleep(2)
        common.update_config_file(
            magic_pdf_config, "table-config", self._rapid_table_config(True))
        common.update_config_file(magic_pdf_config, "device-mode", "cpu")
        out_dir = self._parse_report_with_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    @pytest.mark.P1
    def test_local_magic_pdf_close_html_table(self):
        """magic pdf cli with table recognition disabled."""
        time.sleep(2)
        common.update_config_file(
            magic_pdf_config, "table-config", self._rapid_table_config(False))
        out_dir = self._parse_report_with_cli()
        res = common.check_close_tables(
            os.path.join(out_dir, "test_rearch_report.md"))
        assert res is True
-
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so that running
    # this file as a script reports failures to the shell instead of always
    # exiting with status 0.
    raise SystemExit(pytest.main())
|