| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- import os
- import shutil
- import tempfile
- import pytest
- import magic_pdf.model as model_config
- from magic_pdf.tools.common import do_parse
@pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
def test_common_do_parse(method):
    """Run ``do_parse`` on a sample PDF and sanity-check the files it writes.

    The test is parametrized over the parse ``method`` ('auto', 'txt', 'ocr').
    For each method it parses ``cli_test_01.pdf`` into a fresh temporary
    directory and verifies that every expected artifact exists under
    ``<tmp>/<filename>/<method>`` and exceeds a minimum size in bytes, and
    that an ``images`` directory was produced.

    The temporary directory is removed even when an assertion fails or
    ``do_parse`` raises, so failed runs do not accumulate output under /tmp.
    """
    # setup
    model_config.__use_inside_model__ = True
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'fake'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        with open('tests/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
            bits = f.read()
        # NOTE(review): the empty list is the fourth positional argument of
        # do_parse; presumably a precomputed model result -- confirm against
        # the do_parse signature.
        do_parse(temp_output_dir,
                 filename,
                 bits, [],
                 method,
                 f_dump_content_list=True)

        # check
        base_output_dir = os.path.join(temp_output_dir, filename, method)

        # (file suffix, size in bytes the file must exceed)
        expected_min_sizes = (
            ('_content_list.json', 5000),
            ('.md', 7000),
            ('_middle.json', 200000),
            ('_model.json', 15000),
            ('_origin.pdf', 500000),
            ('_layout.pdf', 500000),
            ('_spans.pdf', 500000),
        )
        for suffix, min_size in expected_min_sizes:
            artifact = os.path.join(base_output_dir, f'{filename}{suffix}')
            # os.stat raises FileNotFoundError if the artifact is missing,
            # which fails the test just like the size assertion would.
            size = os.stat(artifact).st_size
            assert size > min_size, (
                f'{artifact} is {size} bytes, expected more than {min_size}')

        # isdir() is False for a missing path, so this covers existence too.
        images_dir = os.path.join(base_output_dir, 'images')
        assert os.path.isdir(images_dir), f'{images_dir} is not a directory'
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
|