test_common.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. import os
  2. import shutil
  3. import tempfile
  4. import pytest
  5. import magic_pdf.model as model_config
  6. from magic_pdf.tools.common import do_parse
  7. @pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
  8. def test_common_do_parse(method):
  9. # setup
  10. model_config.__use_inside_model__ = True
  11. unitest_dir = '/tmp/magic_pdf/unittest/tools'
  12. filename = 'fake'
  13. os.makedirs(unitest_dir, exist_ok=True)
  14. temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')
  15. os.makedirs(temp_output_dir, exist_ok=True)
  16. # run
  17. with open('tests/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
  18. bits = f.read()
  19. do_parse(temp_output_dir,
  20. filename,
  21. bits, [],
  22. method,
  23. f_dump_content_list=True)
  24. # check
  25. base_output_dir = os.path.join(temp_output_dir, f'fake/{method}')
  26. r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
  27. assert r.st_size > 5000
  28. r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
  29. assert r.st_size > 7000
  30. r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
  31. assert r.st_size > 200000
  32. r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
  33. assert r.st_size > 15000
  34. r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
  35. assert r.st_size > 500000
  36. r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
  37. assert r.st_size > 500000
  38. r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
  39. assert r.st_size > 500000
  40. os.path.exists(os.path.join(base_output_dir, 'images'))
  41. os.path.isdir(os.path.join(base_output_dir, 'images'))
  42. # teardown
  43. shutil.rmtree(temp_output_dir)