test_cli_dev.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import os
  2. import shutil
  3. import tempfile
  4. from click.testing import CliRunner
  5. from magic_pdf.tools import cli_dev
  6. def test_cli_pdf():
  7. # setup
  8. unitest_dir = '/tmp/magic_pdf/unittest/tools'
  9. filename = 'cli_test_01'
  10. os.makedirs(unitest_dir, exist_ok=True)
  11. temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')
  12. os.makedirs(temp_output_dir, exist_ok=True)
  13. # run
  14. runner = CliRunner()
  15. result = runner.invoke(
  16. cli_dev.cli,
  17. [
  18. 'pdf',
  19. '-p',
  20. 'tests/test_tools/assets/cli/pdf/cli_test_01.pdf',
  21. '-j',
  22. 'tests/test_tools/assets/cli_dev/cli_test_01.model.json',
  23. '-o',
  24. temp_output_dir,
  25. ],
  26. )
  27. # check
  28. assert result.exit_code == 0
  29. base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')
  30. r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
  31. assert r.st_size > 5000
  32. r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
  33. assert r.st_size > 7000
  34. r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
  35. assert r.st_size > 200000
  36. r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
  37. assert r.st_size > 15000
  38. r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
  39. assert r.st_size > 500000
  40. r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
  41. assert r.st_size > 500000
  42. r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
  43. assert r.st_size > 500000
  44. assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
  45. assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
  46. # teardown
  47. shutil.rmtree(temp_output_dir)
  48. def test_cli_jsonl():
  49. # setup
  50. unitest_dir = '/tmp/magic_pdf/unittest/tools'
  51. filename = 'cli_test_01'
  52. os.makedirs(unitest_dir, exist_ok=True)
  53. temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')
  54. os.makedirs(temp_output_dir, exist_ok=True)
  55. def mock_read_s3_path(s3path):
  56. with open(s3path, 'rb') as f:
  57. return f.read()
  58. cli_dev.read_s3_path = mock_read_s3_path # mock
  59. # run
  60. runner = CliRunner()
  61. result = runner.invoke(
  62. cli_dev.cli,
  63. [
  64. 'jsonl',
  65. '-j',
  66. 'tests/test_tools/assets/cli_dev/cli_test_01.jsonl',
  67. '-o',
  68. temp_output_dir,
  69. ],
  70. )
  71. # check
  72. assert result.exit_code == 0
  73. base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')
  74. r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
  75. assert r.st_size > 5000
  76. r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
  77. assert r.st_size > 7000
  78. r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
  79. assert r.st_size > 200000
  80. r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
  81. assert r.st_size > 15000
  82. r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
  83. assert r.st_size > 500000
  84. r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
  85. assert r.st_size > 500000
  86. r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
  87. assert r.st_size > 500000
  88. assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
  89. assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
  90. # teardown
  91. shutil.rmtree(temp_output_dir)