common.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. import copy
  2. import json as json_parse
  3. import os
  4. import click
  5. import fitz
  6. from loguru import logger
  7. import magic_pdf.model as model_config
  8. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  9. from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
  10. draw_model_bbox, draw_span_bbox)
  11. from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  12. from magic_pdf.pipe.OCRPipe import OCRPipe
  13. from magic_pdf.pipe.TXTPipe import TXTPipe
  14. from magic_pdf.pipe.UNIPipe import UNIPipe
  15. # from io import BytesIO
  16. # from pypdf import PdfReader, PdfWriter
  def prepare_env(output_dir, pdf_file_name, method):
      """Create the output directories for one PDF and return their paths.

      The parent directory is ``<output_dir>/<pdf_file_name>/<method>`` and
      the image directory is an ``images`` folder directly beneath it.
      Directories that already exist are left untouched.

      Returns:
          A ``(local_image_dir, local_md_dir)`` tuple, where ``local_md_dir``
          is the parent directory described above.
      """
      md_dir = os.path.join(output_dir, pdf_file_name, method)
      image_dir = os.path.join(str(md_dir), 'images')
      for folder in (image_dir, md_dir):
          os.makedirs(folder, exist_ok=True)
      return image_dir, md_dir
  24. # def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
  25. # # Wrap the byte data in a BytesIO object
  26. # pdf_file = BytesIO(pdf_bytes)
  27. # # Read the PDF from the byte data
  28. # reader = PdfReader(pdf_file)
  29. # # Create a new PDF writer
  30. # writer = PdfWriter()
  31. # # Add the pages in the requested range to the new PDF writer
  32. # end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
  33. # if end_page_id > len(reader.pages) - 1:
  34. # logger.warning("end_page_id is out of range, use pdf_docs length")
  35. # end_page_id = len(reader.pages) - 1
  36. # for i, page in enumerate(reader.pages):
  37. # if start_page_id <= i <= end_page_id:
  38. # writer.add_page(page)
  39. # # Create a byte buffer to hold the output PDF data
  40. # output_buffer = BytesIO()
  41. # # Write the PDF into the byte buffer
  42. # writer.write(output_buffer)
  43. # # Get the contents of the byte buffer
  44. # converted_pdf_bytes = output_buffer.getvalue()
  45. # return converted_pdf_bytes
  def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
      """Return a new PDF, as bytes, containing only the selected page range.

      Args:
          pdf_bytes: Raw bytes of the source PDF.
          start_page_id: Zero-based index of the first page to keep.
          end_page_id: Zero-based index of the last page to keep. ``None`` or
              a negative value selects through the last page of the source; a
              value beyond the last page is clamped to it and a warning is
              logged.

      Returns:
          The serialized bytes of the trimmed PDF.
      """
      # Open both documents as context managers so they are closed even when
      # insert_pdf() or tobytes() raises; previously neither was ever closed.
      with fitz.open('pdf', pdf_bytes) as document, fitz.open() as output_document:
          last_page_id = len(document) - 1
          if end_page_id is None or end_page_id < 0:
              end_page_id = last_page_id
          elif end_page_id > last_page_id:
              logger.warning('end_page_id is out of range, use pdf_docs length')
              end_page_id = last_page_id
          output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
          return output_document.tobytes()
  def do_parse(
      output_dir,
      pdf_file_name,
      pdf_bytes,
      model_list,
      parse_method,
      debug_able,
      f_draw_span_bbox=True,
      f_draw_layout_bbox=True,
      f_dump_md=True,
      f_dump_middle_json=True,
      f_dump_model_json=True,
      f_dump_orig_pdf=True,
      f_dump_content_list=True,
      f_make_md_mode=MakeMode.MM_MD,
      f_draw_model_bbox=False,
      f_draw_line_sort_bbox=False,
      start_page_id=0,
      end_page_id=None,
      lang=None,
      layout_model=None,
      formula_enable=None,
      table_enable=None,
  ):
      """Parse one PDF with the selected pipe and write the requested outputs.

      All outputs are written under ``<output_dir>/<pdf_file_name>/<parse_method>``
      (images go to its ``images`` sub-directory).

      Args:
          output_dir: Root directory for the outputs.
          pdf_file_name: Name used for the output sub-directory and as the
              prefix of every dumped file.
          pdf_bytes: Raw bytes of the source PDF. It is trimmed to
              ``start_page_id``..``end_page_id`` before any other processing.
          model_list: Model output handed to the pipe. When it is empty and
              ``model_config.__use_inside_model__`` is truthy, the pipe's own
              ``pipe_analyze()`` is run and its ``model_list`` is used for the
              model dump/drawing instead.
          parse_method: ``'auto'`` (UNIPipe), ``'txt'`` (TXTPipe) or ``'ocr'``
              (OCRPipe).
          debug_able: When truthy, forces ``f_draw_model_bbox`` and
              ``f_draw_line_sort_bbox`` on.
          f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
          f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
          f_dump_md: Write ``<pdf_file_name>.md``.
          f_dump_middle_json: Write ``<pdf_file_name>_middle.json``.
          f_dump_model_json: Write ``<pdf_file_name>_model.json``.
          f_dump_orig_pdf: Write ``<pdf_file_name>_origin.pdf``; note this is
              the page-trimmed PDF, not the untouched input bytes.
          f_dump_content_list: Write ``<pdf_file_name>_content_list.json``.
          f_make_md_mode: Passed to ``pipe_mk_markdown`` as ``md_make_mode``.
          f_draw_model_bbox: Call ``draw_model_bbox`` on the model output.
          f_draw_line_sort_bbox: Call ``draw_line_sort_bbox`` on the parse
              result.
          start_page_id: First page to keep (see
              ``convert_pdf_bytes_to_bytes_by_pymupdf``).
          end_page_id: Last page to keep (see
              ``convert_pdf_bytes_to_bytes_by_pymupdf``).
          lang: Forwarded unchanged to the pipe constructor.
          layout_model: Forwarded unchanged to the pipe constructor.
          formula_enable: Forwarded unchanged to the pipe constructor.
          table_enable: Forwarded unchanged to the pipe constructor.

      Raises:
          SystemExit: With code 1 for an unknown ``parse_method``; with code 2
              when ``model_list`` is empty and the inside model is disabled.
      """
      if debug_able:
          logger.warning('debug mode is on')
          f_draw_model_bbox = True
          f_draw_line_sort_bbox = True

      pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)

      # Keep an independent copy for the model dump/drawing.
      # NOTE(review): presumably the pipe may mutate model_list in place -- confirm.
      orig_model_list = copy.deepcopy(model_list)
      local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
      image_writer = FileBasedDataWriter(local_image_dir)
      md_writer = FileBasedDataWriter(local_md_dir)
      image_dir = str(os.path.basename(local_image_dir))

      # Keyword arguments shared by every pipe. start_page_id/end_page_id are
      # deliberately not forwarded: the range was already applied to pdf_bytes.
      pipe_kwargs = {
          'is_debug': True,
          'lang': lang,
          'layout_model': layout_model,
          'formula_enable': formula_enable,
          'table_enable': table_enable,
      }
      if parse_method == 'auto':
          jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
          pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, **pipe_kwargs)
      elif parse_method == 'txt':
          pipe = TXTPipe(pdf_bytes, model_list, image_writer, **pipe_kwargs)
      elif parse_method == 'ocr':
          pipe = OCRPipe(pdf_bytes, model_list, image_writer, **pipe_kwargs)
      else:
          logger.error('unknown parse method')
          # The exit() builtin is injected by the site module for interactive
          # use and may be missing; raising SystemExit is always available.
          raise SystemExit(1)

      pipe.pipe_classify()

      if len(model_list) == 0:
          if not model_config.__use_inside_model__:
              logger.error('need model list input')
              raise SystemExit(2)
          pipe.pipe_analyze()
          orig_model_list = copy.deepcopy(pipe.model_list)

      pipe.pipe_parse()

      pdf_info = pipe.pdf_mid_data['pdf_info']

      if f_draw_layout_bbox:
          draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
      if f_draw_span_bbox:
          draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
      if f_draw_model_bbox:
          # Pass a copy so the list dumped to <name>_model.json below is unaffected.
          draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
      if f_draw_line_sort_bbox:
          draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

      md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
      if f_dump_md:
          md_writer.write_string(f'{pdf_file_name}.md', md_content)

      if f_dump_middle_json:
          md_writer.write_string(
              f'{pdf_file_name}_middle.json',
              json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
          )

      if f_dump_model_json:
          md_writer.write_string(
              f'{pdf_file_name}_model.json',
              json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
          )

      if f_dump_orig_pdf:
          md_writer.write(f'{pdf_file_name}_origin.pdf', pdf_bytes)

      content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
      if f_dump_content_list:
          md_writer.write_string(
              f'{pdf_file_name}_content_list.json',
              json_parse.dumps(content_list, ensure_ascii=False, indent=4),
          )

      logger.info(f'local output dir is {local_md_dir}')
  # click.Choice over the three parse_method values that do_parse() handles.
  # NOTE(review): presumably used as the type of a CLI option -- confirm at the call sites.
  parse_pdf_methods = click.Choice(
      choices=['ocr', 'txt', 'auto'],
  )