| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395 |
- """test cli and sdk."""
- import logging
- import os
- import pytest
- from conf import conf
- from lib import common
- import time
- import magic_pdf.model as model_config
- import os
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter
- from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
- from magic_pdf.config.make_content_config import DropMode, MakeMode
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
- from magic_pdf.data.dataset import PymuDocDataset
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
- from magic_pdf.config.enums import SupportedPdfParseMethod
# Flag set on magic_pdf.model before any test runs; presumably switches the
# library to its built-in models (TODO confirm against magic_pdf.model).
model_config.__use_inside_model__ = True

# Locations read from the shared test configuration (conf.conf).
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']

# NOTE(review): machine-specific absolute path to the magic-pdf config file
# that the tests rewrite through common.update_config_file.
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli:
    """End-to-end tests for the magic-pdf SDK and the magic-pdf command-line tools.

    Every test runs after the autouse ``setup`` fixture. The private helpers
    (names starting with ``_``) hold the steps that several tests share; they
    are not collected by pytest.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Prepare each test.

        Calls ``common.clear_gpu_memory()`` and sets ``device-mode`` back to
        ``cuda`` in the magic-pdf config file, so a test that switched the
        device mode does not affect the tests that follow it.
        """
        common.clear_gpu_memory()
        common.update_config_file(magic_pdf_config, "device-mode", "cuda")
        # Any additional per-test preparation can be added here.
        yield

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _list_demo_names(pdf_dir):
        """Return the names, without the ``.pdf`` suffix, of the PDFs in *pdf_dir*.

        ``os.path.splitext`` removes only the final extension, so a file such
        as ``report.v2.pdf`` yields ``report.v2`` (splitting on the first dot
        would yield ``report`` and point later steps at a missing file).
        Names are returned in ``os.listdir`` order.
        """
        return [
            os.path.splitext(file_name)[0]
            for file_name in os.listdir(pdf_dir)
            if file_name.endswith('.pdf')
        ]

    @staticmethod
    def _run_shell(cmd):
        """Log *cmd*, run it with ``os.system`` and return its exit status.

        A non-zero status is logged as an error instead of being dropped
        silently; it does not fail the test by itself, the callers' own
        output checks decide that.
        """
        logging.info(cmd)
        status = os.system(cmd)
        if status != 0:
            logging.error('command exited with status %s: %s', status, cmd)
        return status

    @staticmethod
    def _analyze_pdf(pdf_bytes, image_writer):
        """Run inference and the matching pipeline on *pdf_bytes*.

        The dataset is classified first; the OCR pipeline is used when the
        classification is ``SupportedPdfParseMethod.OCR``, the text pipeline
        otherwise.

        Returns:
            tuple: ``(infer_result, pipe_result)``.
        """
        ds = PymuDocDataset(pdf_bytes)
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer_result = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            pipe_result = infer_result.pipe_txt_mode(image_writer)
        return infer_result, pipe_result

    def _check_cli_mode(self, mode):
        """Run ``magic-pdf -m <mode>`` on every demo PDF and check its output folder."""
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in self._list_demo_names(pdf_dir):
            # Clear the previous results before each run.
            common.delete_file(res_path)
            cmd = 'magic-pdf -p %s -o %s -m %s' % (
                os.path.join(pdf_dir, f'{demo_name}.pdf'), res_path, mode)
            self._run_shell(cmd)
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, mode))

    def _run_dev_cli_jsonl(self, method):
        """Run ``magic-pdf-dev`` on ``line1.jsonl`` with the given parse *method*."""
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        self._run_shell(
            'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, method))

    def _run_dev_cli_pdf_json(self, method):
        """Run ``magic-pdf-dev`` on the sample report plus its model JSON."""
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        self._run_shell('magic-pdf-dev --pdf %s --json %s --method %s'
                        % (pdf_path, json_path, method))

    def _run_report_cli(self):
        """Run ``magic-pdf`` (default method) on the sample report.

        ``pdf_res_path`` is cleared with ``common.delete_file`` first.

        Returns:
            str: the ``auto`` output directory of the sample report.
        """
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        common.delete_file(pdf_res_path)
        self._run_shell('magic-pdf -p %s -o %s' % (pdf_path, pdf_res_path))
        return os.path.join(pdf_res_path, 'test_rearch_report', 'auto')

    @staticmethod
    def _set_table_config(model, enable):
        """Write the ``table-config`` section of the magic-pdf config file."""
        value = {
            'model': model,
            'enable': enable,
            'max_time': 400,
        }
        common.update_config_file(magic_pdf_config, 'table-config', value)

    @staticmethod
    def _set_layout_model(model):
        """Write the ``layout-config`` section of the magic-pdf config file."""
        common.update_config_file(magic_pdf_config, 'layout-config',
                                  {'model': model})

    # ------------------------------------------------------------------
    # SDK / CLI over every demo PDF
    # ------------------------------------------------------------------
    @pytest.mark.P0
    def test_pdf_auto_sdk(self):
        """pdf sdk auto test."""
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        for demo_name in self._list_demo_names(pdf_dir):
            demo_pdf = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
            image_dir = str(os.path.basename(local_image_dir))
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(dir_path)
            pdf_bytes = FileBasedDataReader("").read(demo_pdf)

            infer_result, pipe_result = self._analyze_pdf(pdf_bytes, image_writer)

            # Clear earlier results before dumping this document's outputs.
            common.delete_file(dir_path)
            infer_result.draw_model(os.path.join(dir_path, f"{demo_name}_model.pdf"))
            pipe_result.draw_layout(os.path.join(dir_path, f"{demo_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(dir_path, f"{demo_name}_spans.pdf"))
            pipe_result.dump_md(md_writer, f"{demo_name}.md", image_dir)
            pipe_result.dump_content_list(
                md_writer, f"{demo_name}_content_list.json", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_pdf_cli_auto(self):
        """magic_pdf cli test auto."""
        time.sleep(2)
        self._check_cli_mode('auto')

    @pytest.mark.P0
    def test_pdf_cli_txt(self):
        """magic_pdf cli test txt."""
        time.sleep(2)
        self._check_cli_mode('txt')

    @pytest.mark.P0
    def test_pdf_cli_ocr(self):
        """magic_pdf cli test ocr."""
        time.sleep(2)
        self._check_cli_mode('ocr')

    # ------------------------------------------------------------------
    # magic-pdf-dev CLI
    # ------------------------------------------------------------------
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_txt(self):
        """magic_pdf_dev cli local txt."""
        time.sleep(2)
        self._run_dev_cli_jsonl("txt")

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_ocr(self):
        """magic_pdf_dev cli local ocr."""
        time.sleep(2)
        self._run_dev_cli_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_auto(self):
        """magic_pdf_dev cli local auto."""
        time.sleep(2)
        self._run_dev_cli_jsonl('auto')

    # NOTE(review): the three "s3" variants below run on the same local
    # line1.jsonl as the "local" variants above; confirm whether a jsonl
    # with S3 paths was intended.
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_txt(self):
        """magic_pdf_dev cli s3 txt."""
        time.sleep(2)
        self._run_dev_cli_jsonl("txt")

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_ocr(self):
        """magic_pdf_dev cli s3 ocr."""
        time.sleep(2)
        self._run_dev_cli_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_auto(self):
        """magic_pdf_dev cli s3 auto."""
        time.sleep(2)
        self._run_dev_cli_jsonl('auto')

    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_auto(self):
        """magic_pdf_dev cli pdf+json auto."""
        time.sleep(2)
        self._run_dev_cli_pdf_json('auto')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_ocr(self):
        """magic_pdf_dev cli pdf+json ocr."""
        time.sleep(2)
        # The method must be 'ocr' here; it was previously 'auto', which
        # duplicated test_pdf_dev_cli_pdf_json_auto.
        self._run_dev_cli_pdf_json('ocr')

    # ------------------------------------------------------------------
    # S3 SDK
    # ------------------------------------------------------------------
    @pytest.mark.P1
    def test_s3_sdk_auto(self):
        """test s3 sdk auto.

        Credentials come from the ``pdf_ak``, ``pdf_sk``, ``bucket`` and
        ``pdf_endpoint`` environment variables. They are deliberately not
        printed or logged.
        """
        time.sleep(2)
        pdf_ak = os.getenv('pdf_ak')
        pdf_sk = os.environ.get('pdf_sk', "")
        pdf_bucket = os.environ.get('bucket', "")
        pdf_endpoint = os.environ.get('pdf_endpoint', "")
        s3_pdf_path = conf.conf["s3_pdf_path"]
        image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
        prefix = "mineru/test/output"
        reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
        writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
        image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
        local_dir = "output"
        # Remove only the final extension so dotted names stay intact.
        name_without_suff = os.path.splitext(os.path.basename(s3_pdf_path))[0]

        pdf_bytes = reader.read(s3_pdf_path)
        infer_result, pipe_result = self._analyze_pdf(pdf_bytes, image_writer)

        # The three drawings are written to the local directory.
        infer_result.draw_model(
            os.path.join(local_dir, f'{name_without_suff}_model.pdf'))
        pipe_result.draw_layout(
            os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))
        pipe_result.draw_span(
            os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))
        # Markdown and content list go through the S3 writer.
        pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")
        pipe_result.dump_content_list(
            writer, f"{name_without_suff}_content_list.json", image_dir)

    # ------------------------------------------------------------------
    # CLI on the sample report with different config settings
    # ------------------------------------------------------------------
    @pytest.mark.P1
    def test_local_magic_pdf_open_st_table(self):
        """magic pdf cli open st table."""
        time.sleep(2)
        self._set_table_config('struct_eqtable', True)
        out_dir = self._run_report_cli()
        res = common.check_html_table_exists(
            os.path.join(out_dir, 'test_rearch_report.md'))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_open_tablemaster_cuda(self):
        """magic pdf cli open table master html table cuda mode."""
        time.sleep(2)
        self._set_table_config('tablemaster', True)
        out_dir = self._run_report_cli()
        res = common.check_html_table_exists(
            os.path.join(out_dir, 'test_rearch_report.md'))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_open_rapidai_table(self):
        """magic pdf cli open rapid ai table."""
        time.sleep(2)
        self._set_table_config('rapid_table', True)
        out_dir = self._run_report_cli()
        res = common.check_html_table_exists(
            os.path.join(out_dir, 'test_rearch_report.md'))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_doclayout_yolo(self):
        """magic pdf cli open doclayout yolo."""
        time.sleep(2)
        self._set_layout_model('doclayout_yolo')
        out_dir = self._run_report_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    @pytest.mark.P1
    def test_local_magic_pdf_layoutlmv3_yolo(self):
        """magic pdf cli open layoutlmv3."""
        time.sleep(2)
        self._set_layout_model('layoutlmv3')
        out_dir = self._run_report_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    @pytest.mark.P1
    def test_magic_pdf_cpu(self):
        """magic pdf cli cpu mode."""
        time.sleep(2)
        self._set_table_config('tablemaster', False)
        # The autouse setup fixture sets device-mode back to cuda before the
        # next test.
        common.update_config_file(magic_pdf_config, "device-mode", "cpu")
        out_dir = self._run_report_cli()
        common.cli_count_folders_and_check_contents(out_dir)

    @pytest.mark.P1
    def test_local_magic_pdf_close_html_table(self):
        """magic pdf cli close table."""
        time.sleep(2)
        self._set_table_config('tablemaster', False)
        out_dir = self._run_report_cli()
        res = common.check_close_tables(
            os.path.join(out_dir, 'test_rearch_report.md'))
        assert res is True
-
-
# Allow running this module directly; pytest.main() is called without arguments.
if __name__ == '__main__':
    pytest.main()
|