test_cli.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. import pytest
  2. import os
  3. from conf import conf
  4. import os
  5. import json
  6. from magic_pdf.pipe.UNIPipe import UNIPipe
  7. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  8. pdf_res_path = conf.conf["pdf_res_path"]
  9. code_path = conf.conf["code_path"]
  10. pdf_dev_path = conf.conf["pdf_dev_path"]
  11. class TestCli:
  12. """
  13. test cli
  14. """
  15. def test_pdf_sdk(self):
  16. """
  17. pdf sdk 方式解析
  18. """
  19. demo_names = list()
  20. pdf_path = os.path.join(pdf_dev_path, "pdf")
  21. for pdf_file in os.listdir(pdf_path):
  22. if pdf_file.endswith('.pdf'):
  23. demo_names.append(pdf_file.split('.')[0])
  24. for demo_name in demo_names:
  25. model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
  26. pdf_path = os.path.join(pdf_dev_path, "pdf", f"{demo_name}.pdf")
  27. pdf_bytes = open(pdf_path, "rb").read()
  28. model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
  29. image_writer = DiskReaderWriter(pdf_dev_path)
  30. image_dir = str(os.path.basename(pdf_dev_path))
  31. jso_useful_key = {"_pdf_type": "", "model_list": model_json}
  32. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  33. pipe.pipe_classify()
  34. pipe.pipe_parse()
  35. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
  36. dir_path = os.path.join(pdf_dev_path, "mineru")
  37. if not os.path.exists(dir_path):
  38. os.makedirs(dir_path, exist_ok=True)
  39. res_path = os.path.join(dir_path, f"{demo_name}.md")
  40. with open(res_path, "w+", encoding="utf-8") as f:
  41. f.write(md_content)
  42. # def test_pdf_specify_jsonl(self):
  43. # """
  44. # 输入jsonl, 默认方式解析
  45. # """
  46. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
  47. # logging.info(cmd)
  48. # common.check_shell(cmd)
  49. # #common.count_folders_and_check_contents(pdf_res_path)
  50. # def test_pdf_specify_jsonl_txt(self):
  51. # """
  52. # 输入jsonl, txt方式解析
  53. # """
  54. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
  55. # logging.info(cmd)
  56. # common.check_shell(cmd)
  57. # #common.count_folders_and_check_contents(pdf_res_path)
  58. #
  59. # def test_pdf_specify_jsonl_ocr(self):
  60. # """
  61. # 输入jsonl, ocr方式解析
  62. # """
  63. # cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
  64. # logging.info(cmd)
  65. # common.check_shell(cmd)
  66. # #common.count_folders_and_check_contents(pdf_res_path)
  67. if __name__ == "__main__":
  68. pytest.main()