pdf_parse_by_ocr.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. import os
  2. import time
  3. from loguru import logger
  4. from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
  5. from magic_pdf.libs.safe_filename import sanitize_filename
  6. from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
  7. from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
  8. from magic_pdf.pre_proc.detect_header import parse_headers
  9. from magic_pdf.pre_proc.detect_page_number import parse_pageNos
  10. from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
  11. from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
  12. from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
  13. def construct_page_component(page_id, blocks, layout_bboxes):
  14. return_dict = {
  15. 'preproc_blocks': blocks,
  16. 'page_idx': page_id,
  17. 'layout_bboxes': layout_bboxes,
  18. }
  19. return return_dict
  20. def parse_pdf_by_ocr(
  21. pdf_path,
  22. s3_pdf_profile,
  23. pdf_model_output,
  24. book_name,
  25. pdf_model_profile=None,
  26. image_s3_config=None,
  27. start_page_id=0,
  28. end_page_id=None,
  29. debug_mode=False,
  30. ):
  31. pdf_bytes = read_file(pdf_path, s3_pdf_profile)
  32. save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
  33. book_name = sanitize_filename(book_name)
  34. md_bookname_save_path = ""
  35. if debug_mode:
  36. save_path = join_path(save_tmp_path, "md")
  37. pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
  38. if not os.path.exists(os.path.dirname(pdf_local_path)):
  39. # 如果目录不存在,创建它
  40. os.makedirs(os.path.dirname(pdf_local_path))
  41. md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
  42. if not os.path.exists(md_bookname_save_path):
  43. # 如果目录不存在,创建它
  44. os.makedirs(md_bookname_save_path)
  45. with open(pdf_local_path + ".pdf", "wb") as pdf_file:
  46. pdf_file.write(pdf_bytes)
  47. pdf_docs = fitz.open("pdf", pdf_bytes)
  48. # 初始化空的pdf_info_dict
  49. pdf_info_dict = {}
  50. img_s3_client = get_img_s3_client(save_path, image_s3_config)
  51. start_time = time.time()
  52. remove_bboxes = []
  53. end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
  54. for page_id in range(start_page_id, end_page_id + 1):
  55. # 获取当前页的page对象
  56. page = pdf_docs[page_id]
  57. if debug_mode:
  58. time_now = time.time()
  59. logger.info(f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}")
  60. start_time = time_now
  61. # 获取当前页的模型数据
  62. ocr_page_info = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
  63. """从json中获取每页的页码、页眉、页脚的bbox"""
  64. page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
  65. header_bboxes = parse_headers(page_id, page, ocr_page_info)
  66. footer_bboxes = parse_footers(page_id, page, ocr_page_info)
  67. footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode)
  68. # 构建需要remove的bbox列表
  69. need_remove_spans_bboxes = []
  70. need_remove_spans_bboxes.extend(page_no_bboxes)
  71. need_remove_spans_bboxes.extend(header_bboxes)
  72. need_remove_spans_bboxes.extend(footer_bboxes)
  73. need_remove_spans_bboxes.extend(footnote_bboxes)
  74. remove_bboxes.append(need_remove_spans_bboxes)
  75. layout_dets = ocr_page_info['layout_dets']
  76. spans = []
  77. # 将模型坐标转换成pymu格式下的未缩放坐标
  78. DPI = 72 # use this resolution
  79. pix = page.get_pixmap(dpi=DPI)
  80. pageL = 0
  81. pageR = int(pix.w)
  82. pageU = 0
  83. pageD = int(pix.h)
  84. width_from_json = ocr_page_info['page_info']['width']
  85. height_from_json = ocr_page_info['page_info']['height']
  86. LR_scaleRatio = width_from_json / (pageR - pageL)
  87. UD_scaleRatio = height_from_json / (pageD - pageU)
  88. for layout_det in layout_dets:
  89. category_id = layout_det['category_id']
  90. allow_category_id_list = [1, 7, 13, 14, 15]
  91. if category_id in allow_category_id_list:
  92. x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
  93. x0 = x0 / LR_scaleRatio
  94. y0 = y0 / UD_scaleRatio
  95. x1 = x1 / LR_scaleRatio
  96. y1 = y1 / UD_scaleRatio
  97. bbox = [int(x0), int(y0), int(x1), int(y1)]
  98. '''要删除的'''
  99. # 3: 'header', # 页眉
  100. # 4: 'page number', # 页码
  101. # 5: 'footnote', # 脚注
  102. # 6: 'footer', # 页脚
  103. '''当成span拼接的'''
  104. # 1: 'image', # 图片
  105. # 7: 'table', # 表格
  106. # 13: 'inline_equation', # 行内公式
  107. # 14: 'displayed_equation', # 行间公式
  108. # 15: 'text', # ocr识别文本
  109. '''layout信息'''
  110. # 11: 'full column', # 单栏
  111. # 12: 'sub column', # 多栏
  112. span = {
  113. 'bbox': bbox,
  114. }
  115. if category_id == 1:
  116. span['type'] = 'image'
  117. elif category_id == 7:
  118. span['type'] = 'table'
  119. elif category_id == 13:
  120. span['content'] = layout_det['latex']
  121. span['type'] = 'inline_equation'
  122. elif category_id == 14:
  123. span['content'] = layout_det['latex']
  124. span['type'] = 'displayed_equation'
  125. elif category_id == 15:
  126. span['content'] = layout_det['text']
  127. span['type'] = 'text'
  128. # print(span)
  129. spans.append(span)
  130. else:
  131. continue
  132. # 删除重叠spans中较小的那些
  133. spans = remove_overlaps_min_spans(spans)
  134. # 删除remove_span_block_bboxes中的bbox
  135. spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
  136. # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
  137. # 将spans合并成line(从上到下,从左到右)
  138. lines = merge_spans_to_line(spans)
  139. # logger.info(lines)
  140. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  141. blocks = []
  142. for line in lines:
  143. blocks.append({
  144. "bbox": line['bbox'],
  145. "lines": [line],
  146. })
  147. # 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
  148. layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
  149. # 构造pdf_info_dict
  150. page_info = construct_page_component(page_id, blocks, layout_bboxes)
  151. pdf_info_dict[f"page_{page_id}"] = page_info
  152. # logger.info(remove_bboxes)
  153. return pdf_info_dict