test_cli.py 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. import pytest
  2. import os
  3. from conf import conf
  4. import subprocess
  5. from lib import common
  6. import logging
  7. import os
  8. import json
  9. from loguru import logger
  10. from magic_pdf.pipe.UNIPipe import UNIPipe
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. pdf_res_path = conf.conf["pdf_res_path"]
  13. code_path = conf.conf["code_path"]
  14. pdf_dev_path = conf.conf["pdf_dev_path"]
  15. class TestCli:
  16. """
  17. test cli
  18. """
  19. def test_pdf_sdk(self):
  20. """
  21. pdf sdk 方式解析
  22. """
  23. demo_names = list()
  24. pdf_path = os.path.join(pdf_dev_path, "pdf")
  25. for pdf_file in os.listdir(pdf_path):
  26. if pdf_file.endswith('.pdf'):
  27. demo_names.append(pdf_file.split('.')[0])
  28. for demo_name in demo_names:
  29. model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
  30. pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
  31. pdf_bytes = open(pdf_path, "rb").read()
  32. model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
  33. image_writer = DiskReaderWriter(pdf_dev_path)
  34. image_dir = str(os.path.basename(pdf_dev_path))
  35. jso_useful_key = {"_pdf_type": "", "model_list": model_json}
  36. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  37. pipe.pipe_classify()
  38. pipe.pipe_parse()
  39. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
  40. res_path = os.path.join(pdf_dev_path, "miner", f"{demo_name}.md")
  41. with open(res_path, "w", encoding="utf-8") as f:
  42. f.write(md_content)
  43. # def test_pdf_specify_jsonl(self):
  44. # """
  45. # 输入jsonl, 默认方式解析
  46. # """
  47. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
  48. # logging.info(cmd)
  49. # common.check_shell(cmd)
  50. # #common.count_folders_and_check_contents(pdf_res_path)
  51. # def test_pdf_specify_jsonl_txt(self):
  52. # """
  53. # 输入jsonl, txt方式解析
  54. # """
  55. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
  56. # logging.info(cmd)
  57. # common.check_shell(cmd)
  58. # #common.count_folders_and_check_contents(pdf_res_path)
  59. #
  60. # def test_pdf_specify_jsonl_ocr(self):
  61. # """
  62. # 输入jsonl, ocr方式解析
  63. # """
  64. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
  65. # logging.info(cmd)
  66. # common.check_shell(cmd)
  67. # #common.count_folders_and_check_contents(pdf_res_path)
  68. if __name__ == "__main__":
  69. pytest.main()