test_cli.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. import pytest
  2. import os
  3. from conf import conf
  4. import subprocess
  5. from lib import common
  6. import logging
  # Paths read from the project configuration mapping ``conf.conf``:
  #   pdf_res_path - directory handed to common.count_folders_and_check_contents
  #   code_path    - directory the tests ``cd`` into before invoking the CLI
  #   pdf_dev_path - directory searched for ``*.pdf`` input files
  pdf_res_path, code_path, pdf_dev_path = (
      conf.conf[key] for key in ("pdf_res_path", "code_path", "pdf_dev_path")
  )
  class TestCli:
      """End-to-end tests for the ``magic_pdf/cli/magicpdf.py`` command line.

      Every test builds one shell command, logs it, runs it with
      ``common.check_shell`` and then calls
      ``common.count_folders_and_check_contents`` on ``pdf_res_path``.
      """

      # JSONL input (an S3 object plus a byte range) shared by the three
      # ``json-command`` tests.
      _JSONL_URI = (
          "s3://llm-process-pperf/ebook_index_textbook_40k/"
          "中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972"
      )

      @staticmethod
      def _build_json_command(code_dir, method=None):
          """Return the shell command that runs ``json-command`` from *code_dir*.

          Args:
              code_dir: Directory to ``cd`` into before invoking the CLI.
              method: Optional value for ``--method`` (the tests use ``"txt"``
                  and ``"ocr"``); when ``None`` the flag is omitted.

          Returns:
              The fully formatted command string.
          """
          # The template's ``%s`` placeholders must actually be filled in;
          # otherwise the shell is asked to run a literal ``cd %s``.
          cmd = (
              "cd %s && export PYTHONPATH=. && "
              "python magic_pdf/cli/magicpdf.py json-command --json '%s'"
              % (code_dir, TestCli._JSONL_URI)
          )
          if method is not None:
              cmd += " --method %s" % method
          return cmd

      def _run_and_check(self, cmd):
          """Log *cmd*, run it through ``common.check_shell`` and check the results.

          NOTE(review): ``check_shell`` and ``count_folders_and_check_contents``
          are project helpers; presumably they fail the test on a non-zero exit
          status or unexpected output layout -- confirm in ``lib.common``.
          """
          logging.info(cmd)
          common.check_shell(cmd)
          common.count_folders_and_check_contents(pdf_res_path)

      def test_pdf_specify_dir(self):
          """Run ``pdf-command`` on every ``*.pdf`` file found under ``pdf_dev_path``."""
          cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
          self._run_and_check(cmd)

      def test_pdf_specify_jsonl(self):
          """Run ``json-command`` on the JSONL input without a ``--method`` flag."""
          self._run_and_check(self._build_json_command(code_path))

      def test_pdf_specify_jsonl_txt(self):
          """Run ``json-command`` on the JSONL input with ``--method txt``."""
          self._run_and_check(self._build_json_command(code_path, "txt"))

      def test_pdf_specify_jsonl_ocr(self):
          """Run ``json-command`` on the JSONL input with ``--method ocr``."""
          self._run_and_check(self._build_json_command(code_path, "ocr"))
  if __name__ == "__main__":
      # Executing this file directly hands control to pytest's own entry
      # point, called here with no explicit arguments.
      pytest.main()