test_cli.py 3.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import pytest
  2. import os
  3. from conf import conf
  4. import subprocess
  5. from lib import common
  6. import logging
  7. import os
  8. import json
  9. from loguru import logger
  10. from magic_pdf.pipe.UNIPipe import UNIPipe
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. pdf_res_path = conf.conf["pdf_res_path"]
  13. code_path = conf.conf["code_path"]
  14. pdf_dev_path = conf.conf["pdf_dev_path"]
  15. class TestCli:
  16. def test_pdf_specify_dir(self):
  17. """
  18. 输入pdf和指定目录的模型结果
  19. """
  20. cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
  21. logging.info(cmd)
  22. common.check_shell(cmd)
  23. #common.count_folders_and_check_contents(pdf_res_path)
  24. def test_pdf_sdk(self):
  25. """
  26. pdf sdk 方式解析
  27. """
  28. demo_names = list()
  29. for pdf_file in os.listdir(pdf_dev_path):
  30. if pdf_file.endswith('.pdf'):
  31. demo_names.append(pdf_file.split('.')[0])
  32. for demo_name in demo_names:
  33. model_path = os.path.join(pdf_dev_path, f"{demo_name}.json")
  34. pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
  35. pdf_bytes = open(pdf_path, "rb").read()
  36. model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
  37. image_writer = DiskReaderWriter(pdf_dev_path)
  38. image_dir = str(os.path.basename(pdf_dev_path))
  39. jso_useful_key = {"_pdf_type": "", "model_list": model_json}
  40. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  41. pipe.pipe_classify()
  42. pipe.pipe_parse()
  43. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
  44. with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
  45. f.write(md_content)
  46. # def test_pdf_specify_jsonl(self):
  47. # """
  48. # 输入jsonl, 默认方式解析
  49. # """
  50. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
  51. # logging.info(cmd)
  52. # common.check_shell(cmd)
  53. # #common.count_folders_and_check_contents(pdf_res_path)
  54. # def test_pdf_specify_jsonl_txt(self):
  55. # """
  56. # 输入jsonl, txt方式解析
  57. # """
  58. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
  59. # logging.info(cmd)
  60. # common.check_shell(cmd)
  61. # #common.count_folders_and_check_contents(pdf_res_path)
  62. #
  63. # def test_pdf_specify_jsonl_ocr(self):
  64. # """
  65. # 输入jsonl, ocr方式解析
  66. # """
  67. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
  68. # logging.info(cmd)
  69. # common.check_shell(cmd)
  70. # #common.count_folders_and_check_contents(pdf_res_path)
  71. if __name__ == "__main__":
  72. pytest.main()