| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- """test cli and sdk."""
- import logging
- import os
- import pytest
- from conf import conf
- from lib import common
- import magic_pdf.model as model_config
- from magic_pdf.pipe.UNIPipe import UNIPipe
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
# Turn on magic_pdf's inside-model flag; the SDK tests in this module call
# pipe_analyze() only when it is set.
model_config.__use_inside_model__ = True

# Locations read from the shared test configuration.
pdf_res_path, code_path, pdf_dev_path = (
    conf.conf[key] for key in ('pdf_res_path', 'code_path', 'pdf_dev_path')
)
class TestCli:
    """End-to-end tests for the magic-pdf SDK pipeline and CLI tools.

    Every test reads its fixtures from ``pdf_dev_path``: the sample PDFs are
    listed from ``<pdf_dev_path>/pdf`` and the SDK / ``magic-pdf`` results are
    written to ``<pdf_dev_path>/mineru``.
    """

    # PDF paired with test_model.json by the ``magic-pdf-dev --pdf/--json`` tests.
    _DEV_PDF_NAME = 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf'

    @staticmethod
    def _demo_names(pdf_dir):
        """Return the sorted base names of every ``*.pdf`` file in ``pdf_dir``.

        Only the final extension is stripped, so ``a.b.pdf`` yields ``a.b``
        and can be mapped back to the file it came from.
        """
        return sorted(
            os.path.splitext(entry)[0]
            for entry in os.listdir(pdf_dir)
            if entry.endswith('.pdf')
        )

    @staticmethod
    def _run_command(cmd):
        """Log ``cmd``, run it through ``os.system`` and return its status.

        A non-zero status is logged as a warning rather than raised, so the
        pass/fail outcome of the calling test is left to its own checks.
        """
        logging.info(cmd)
        status = os.system(cmd)
        if status != 0:
            logging.warning(
                'command returned non-zero status %s: %s', status, cmd)
        return status

    def _run_sdk_pipeline(self, pdf_type):
        """Run the UNIPipe SDK flow over every demo PDF.

        For each PDF the pipe is classified, analyzed, parsed and rendered to
        markdown; the markdown is written to ``<pdf_dev_path>/mineru`` and
        handed to ``common.sdk_count_folders_and_check_contents``.

        Args:
            pdf_type: value stored under ``'_pdf_type'`` in the dict given to
                ``UNIPipe`` (``''``, ``'ocr'`` or ``'txt'`` in this class).
        """
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        local_image_dir = os.path.join(pdf_dir, 'images')
        image_dir = os.path.basename(local_image_dir)
        out_dir = os.path.join(pdf_dev_path, 'mineru')

        for demo_name in self._demo_names(pdf_dir):
            pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
            print(pdf_path)
            # Context manager so the handle is closed before the pipeline runs.
            with open(pdf_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

            image_writer = DiskReaderWriter(local_image_dir)
            model_json = []
            jso_useful_key = {'_pdf_type': pdf_type, 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            pipe.pipe_classify()
            if not model_json:
                if not model_config.__use_inside_model__:
                    # Report a proper test failure instead of calling exit().
                    pytest.fail(
                        'model_list is empty and '
                        'model_config.__use_inside_model__ is disabled')
                pipe.pipe_analyze()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')

            os.makedirs(out_dir, exist_ok=True)
            res_path = os.path.join(out_dir, f'{demo_name}.md')
            common.delete_file(res_path)
            with open(res_path, 'w+', encoding='utf-8') as f:
                f.write(md_content)
            common.sdk_count_folders_and_check_contents(res_path)

    def _run_cli(self, method):
        """Run ``magic-pdf -m <method>`` on every demo PDF and check the output.

        The output directory is removed via ``common.delete_file`` before each
        invocation, and ``<output>/<demo_name>/<method>`` is then passed to
        ``common.cli_count_folders_and_check_contents``.

        Args:
            method: value for the ``-m`` option (``'auto'``, ``'txt'`` or
                ``'ocr'`` in this class).
        """
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in self._demo_names(pdf_dir):
            common.delete_file(res_path)
            source_pdf = os.path.join(pdf_dir, f'{demo_name}.pdf')
            self._run_command(
                f'magic-pdf -p {source_pdf} -o {res_path} -m {method}')
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, method))

    def _run_dev_cli_jsonl(self, method):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method <method>``."""
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        self._run_command(
            f'magic-pdf-dev --jsonl {jsonl_path} --method {method}')

    def _run_dev_cli_pdf_json(self, method):
        """Run ``magic-pdf-dev`` with the sample PDF plus ``test_model.json``."""
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', self._DEV_PDF_NAME)
        self._run_command(
            f'magic-pdf-dev --pdf {pdf_path} --json {json_path} '
            f'--method {method}')

    @pytest.mark.P0
    def test_pdf_auto_sdk(self):
        """pdf sdk auto test."""
        self._run_sdk_pipeline('')

    @pytest.mark.P0
    def test_pdf_ocr_sdk(self):
        """pdf sdk ocr test."""
        self._run_sdk_pipeline('ocr')

    @pytest.mark.P0
    def test_pdf_txt_sdk(self):
        """pdf sdk txt test."""
        self._run_sdk_pipeline('txt')

    @pytest.mark.P0
    def test_pdf_cli_auto(self):
        """magic_pdf cli test auto."""
        self._run_cli('auto')

    @pytest.mark.P0
    def test_pdf_clit_txt(self):
        """magic_pdf cli test txt."""
        self._run_cli('txt')

    @pytest.mark.P0
    def test_pdf_clit_ocr(self):
        """magic_pdf cli test ocr."""
        self._run_cli('ocr')

    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_txt(self):
        """magic_pdf_dev cli local txt."""
        self._run_dev_cli_jsonl('txt')

    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_ocr(self):
        """magic_pdf_dev cli local ocr."""
        self._run_dev_cli_jsonl('ocr')

    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_auto(self):
        """magic_pdf_dev cli local auto."""
        self._run_dev_cli_jsonl('auto')

    # NOTE(review): the three "s3" tests below use the same line1.jsonl as the
    # "local" tests above -- confirm whether a separate S3 fixture was intended.
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_txt(self):
        """magic_pdf_dev cli s3 txt."""
        self._run_dev_cli_jsonl('txt')

    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_ocr(self):
        """magic_pdf_dev cli s3 ocr."""
        self._run_dev_cli_jsonl('ocr')

    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_auto(self):
        """magic_pdf_dev cli s3 auto."""
        self._run_dev_cli_jsonl('auto')

    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_auto(self):
        """magic_pdf_dev cli pdf+json auto."""
        self._run_dev_cli_pdf_json('auto')

    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_ocr(self):
        """magic_pdf_dev cli pdf+json ocr."""
        # Previously passed 'auto', which duplicated the test above.
        self._run_dev_cli_pdf_json('ocr')
if __name__ == '__main__':
    # Running this file directly hands control to pytest's entry point.
    pytest.main()
|