common.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. import os
  2. import json as json_parse
  3. import copy
  4. import click
  5. from loguru import logger
  6. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  7. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
  8. from magic_pdf.pipe.UNIPipe import UNIPipe
  9. from magic_pdf.pipe.OCRPipe import OCRPipe
  10. from magic_pdf.pipe.TXTPipe import TXTPipe
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  13. import magic_pdf.model as model_config
  14. def prepare_env(output_dir, pdf_file_name, method):
  15. local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
  16. local_image_dir = os.path.join(str(local_parent_dir), "images")
  17. local_md_dir = local_parent_dir
  18. os.makedirs(local_image_dir, exist_ok=True)
  19. os.makedirs(local_md_dir, exist_ok=True)
  20. return local_image_dir, local_md_dir
  21. def do_parse(
  22. output_dir,
  23. pdf_file_name,
  24. pdf_bytes,
  25. model_list,
  26. parse_method,
  27. f_draw_span_bbox=True,
  28. f_draw_layout_bbox=True,
  29. f_dump_md=True,
  30. f_dump_middle_json=True,
  31. f_dump_model_json=True,
  32. f_dump_orig_pdf=True,
  33. f_dump_content_list=False,
  34. f_make_md_mode=MakeMode.MM_MD,
  35. ):
  36. orig_model_list = copy.deepcopy(model_list)
  37. local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
  38. image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  39. local_md_dir
  40. )
  41. image_dir = str(os.path.basename(local_image_dir))
  42. if parse_method == "auto":
  43. jso_useful_key = {"_pdf_type": "", "model_list": model_list}
  44. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  45. elif parse_method == "txt":
  46. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  47. elif parse_method == "ocr":
  48. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  49. else:
  50. logger.error("unknown parse method")
  51. exit(1)
  52. pipe.pipe_classify()
  53. if len(model_list) == 0:
  54. if model_config.__use_inside_model__:
  55. pipe.pipe_analyze()
  56. orig_model_list = copy.deepcopy(pipe.model_list)
  57. else:
  58. logger.error("need model list input")
  59. exit(2)
  60. pipe.pipe_parse()
  61. pdf_info = pipe.pdf_mid_data["pdf_info"]
  62. if f_draw_layout_bbox:
  63. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  64. if f_draw_span_bbox:
  65. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  66. md_content = pipe.pipe_mk_markdown(
  67. image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
  68. )
  69. if f_dump_md:
  70. md_writer.write(
  71. content=md_content,
  72. path=f"{pdf_file_name}.md",
  73. mode=AbsReaderWriter.MODE_TXT,
  74. )
  75. if f_dump_middle_json:
  76. md_writer.write(
  77. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  78. path="middle.json",
  79. mode=AbsReaderWriter.MODE_TXT,
  80. )
  81. if f_dump_model_json:
  82. md_writer.write(
  83. content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
  84. path="model.json",
  85. mode=AbsReaderWriter.MODE_TXT,
  86. )
  87. if f_dump_orig_pdf:
  88. md_writer.write(
  89. content=pdf_bytes,
  90. path="origin.pdf",
  91. mode=AbsReaderWriter.MODE_BIN,
  92. )
  93. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  94. if f_dump_content_list:
  95. md_writer.write(
  96. content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
  97. path="content_list.json",
  98. mode=AbsReaderWriter.MODE_TXT,
  99. )
  100. logger.info(f"local output dir is {local_md_dir}")
  101. parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])