common.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. import os
  2. import json as json_parse
  3. import copy
  4. import click
  5. from loguru import logger
  6. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  7. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
  8. from magic_pdf.pipe.UNIPipe import UNIPipe
  9. from magic_pdf.pipe.OCRPipe import OCRPipe
  10. from magic_pdf.pipe.TXTPipe import TXTPipe
  11. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  12. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  13. import magic_pdf.model as model_config
  14. def prepare_env(output_dir, pdf_file_name, method):
  15. local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
  16. local_image_dir = os.path.join(str(local_parent_dir), "images")
  17. local_md_dir = local_parent_dir
  18. os.makedirs(local_image_dir, exist_ok=True)
  19. os.makedirs(local_md_dir, exist_ok=True)
  20. return local_image_dir, local_md_dir
  21. def do_parse(
  22. output_dir,
  23. pdf_file_name,
  24. pdf_bytes,
  25. model_list,
  26. parse_method,
  27. f_draw_span_bbox=True,
  28. f_draw_layout_bbox=True,
  29. f_dump_md=True,
  30. f_dump_middle_json=True,
  31. f_dump_model_json=True,
  32. f_dump_orig_pdf=True,
  33. f_dump_content_list=False,
  34. f_make_md_mode=MakeMode.MM_MD,
  35. ):
  36. orig_model_list = copy.deepcopy(model_list)
  37. local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
  38. logger.info(f"local output dir is {local_md_dir}")
  39. image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
  40. local_md_dir
  41. )
  42. image_dir = str(os.path.basename(local_image_dir))
  43. if parse_method == "auto":
  44. jso_useful_key = {"_pdf_type": "", "model_list": model_list}
  45. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  46. elif parse_method == "txt":
  47. pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  48. elif parse_method == "ocr":
  49. pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
  50. else:
  51. logger.error("unknown parse method")
  52. exit(1)
  53. pipe.pipe_classify()
  54. if len(model_list) == 0:
  55. if model_config.__use_inside_model__:
  56. pipe.pipe_analyze()
  57. orig_model_list = copy.deepcopy(pipe.model_list)
  58. else:
  59. logger.error("need model list input")
  60. exit(2)
  61. pipe.pipe_parse()
  62. pdf_info = pipe.pdf_mid_data["pdf_info"]
  63. if f_draw_layout_bbox:
  64. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
  65. if f_draw_span_bbox:
  66. draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
  67. md_content = pipe.pipe_mk_markdown(
  68. image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
  69. )
  70. if f_dump_md:
  71. md_writer.write(
  72. content=md_content,
  73. path=f"{pdf_file_name}.md",
  74. mode=AbsReaderWriter.MODE_TXT,
  75. )
  76. if f_dump_middle_json:
  77. md_writer.write(
  78. content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  79. path="middle.json",
  80. mode=AbsReaderWriter.MODE_TXT,
  81. )
  82. if f_dump_model_json:
  83. md_writer.write(
  84. content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
  85. path="model.json",
  86. mode=AbsReaderWriter.MODE_TXT,
  87. )
  88. if f_dump_orig_pdf:
  89. md_writer.write(
  90. content=pdf_bytes,
  91. path="origin.pdf",
  92. mode=AbsReaderWriter.MODE_BIN,
  93. )
  94. content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
  95. if f_dump_content_list:
  96. md_writer.write(
  97. content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
  98. path="content_list.json",
  99. mode=AbsReaderWriter.MODE_TXT,
  100. )
# click parameter type that restricts the parse method to "ocr", "txt" or
# "auto" -- the same three values that do_parse dispatches on.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])