common.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. import os
  2. import json as json_parse
  3. import copy
  4. import click
  5. from loguru import logger
  6. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  7. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
  8. from magic_pdf.pipe.UNIPipe import UNIPipe
  9. from magic_pdf.pipe.OCRPipe import OCRPipe
  10. from magic_pdf.pipe.TXTPipe import TXTPipe
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  13. import magic_pdf.model as model_config
  14. def prepare_env(output_dir, pdf_file_name, method):
  15. local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
  16. local_image_dir = os.path.join(str(local_parent_dir), "images")
  17. local_md_dir = local_parent_dir
  18. os.makedirs(local_image_dir, exist_ok=True)
  19. os.makedirs(local_md_dir, exist_ok=True)
  20. return local_image_dir, local_md_dir
  21. def do_parse(
  22. output_dir,
  23. pdf_file_name,
  24. pdf_bytes,
  25. model_list,
  26. parse_method,
  27. f_draw_span_bbox=True,
  28. f_draw_layout_bbox=True,
  29. f_dump_md=True,
  30. f_dump_middle_json=True,
  31. f_dump_model_json=True,
  32. f_dump_orig_pdf=True,
  33. f_dump_content_list=False,
  34. f_make_md_mode=MakeMode.MM_MD,
  35. f_draw_model_bbox=False,
  36. ):
  37. orig_model_list = copy.deepcopy(model_list)
  38. local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
  39. image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  40. local_md_dir
  41. )
  42. image_dir = str(os.path.basename(local_image_dir))
  43. if parse_method == "auto":
  44. jso_useful_key = {"_pdf_type": "", "model_list": model_list}
  45. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  46. elif parse_method == "txt":
  47. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  48. elif parse_method == "ocr":
  49. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  50. else:
  51. logger.error("unknown parse method")
  52. exit(1)
  53. pipe.pipe_classify()
  54. if len(model_list) == 0:
  55. if model_config.__use_inside_model__:
  56. pipe.pipe_analyze()
  57. orig_model_list = copy.deepcopy(pipe.model_list)
  58. else:
  59. logger.error("need model list input")
  60. exit(2)
  61. pipe.pipe_parse()
  62. pdf_info = pipe.pdf_mid_data["pdf_info"]
  63. if f_draw_layout_bbox:
  64. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  65. if f_draw_span_bbox:
  66. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  67. if f_draw_model_bbox:
  68. drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
  69. md_content = pipe.pipe_mk_markdown(
  70. image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
  71. )
  72. if f_dump_md:
  73. md_writer.write(
  74. content=md_content,
  75. path=f"{pdf_file_name}.md",
  76. mode=AbsReaderWriter.MODE_TXT,
  77. )
  78. if f_dump_middle_json:
  79. md_writer.write(
  80. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  81. path="middle.json",
  82. mode=AbsReaderWriter.MODE_TXT,
  83. )
  84. if f_dump_model_json:
  85. md_writer.write(
  86. content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
  87. path="model.json",
  88. mode=AbsReaderWriter.MODE_TXT,
  89. )
  90. if f_dump_orig_pdf:
  91. md_writer.write(
  92. content=pdf_bytes,
  93. path="origin.pdf",
  94. mode=AbsReaderWriter.MODE_BIN,
  95. )
  96. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  97. if f_dump_content_list:
  98. md_writer.write(
  99. content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
  100. path="content_list.json",
  101. mode=AbsReaderWriter.MODE_TXT,
  102. )
  103. logger.info(f"local output dir is {local_md_dir}")
  104. parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])