test_common.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import tempfile
  2. import os
  3. import shutil
  4. import pytest
  5. from magic_pdf.tools.common import do_parse
  6. @pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
  7. def test_common_do_parse(method):
  8. # setup
  9. unitest_dir = "/tmp/magic_pdf/unittest/tools"
  10. filename = "fake"
  11. os.makedirs(unitest_dir, exist_ok=True)
  12. temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
  13. # run
  14. with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
  15. bits = f.read()
  16. do_parse(temp_output_dir,
  17. filename,
  18. bits, [],
  19. method,
  20. False,
  21. f_dump_content_list=True)
  22. # check
  23. base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")
  24. r = os.stat(os.path.join(base_output_dir, "content_list.json"))
  25. assert r.st_size > 5000
  26. r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
  27. assert r.st_size > 7000
  28. r = os.stat(os.path.join(base_output_dir, "middle.json"))
  29. assert r.st_size > 200000
  30. r = os.stat(os.path.join(base_output_dir, "model.json"))
  31. assert r.st_size > 15000
  32. r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
  33. assert r.st_size > 500000
  34. r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
  35. assert r.st_size > 500000
  36. r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
  37. assert r.st_size > 500000
  38. os.path.exists(os.path.join(base_output_dir, "images"))
  39. os.path.isdir(os.path.join(base_output_dir, "images"))
  40. # teardown
  41. shutil.rmtree(temp_output_dir)