pdf_parse_by_ocr.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. import json
  2. import os
  3. import time
  4. from loguru import logger
  5. from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_text_bbox
  6. from magic_pdf.libs.commons import (
  7. read_file,
  8. join_path,
  9. fitz,
  10. get_img_s3_client,
  11. get_delta_time,
  12. get_docx_model_output,
  13. )
  14. from magic_pdf.libs.coordinate_transform import get_scale_ratio
  15. from magic_pdf.libs.drop_tag import DropTag
  16. from magic_pdf.libs.ocr_content_type import ContentType
  17. from magic_pdf.libs.safe_filename import sanitize_filename
  18. from magic_pdf.para.para_split import para_split
  19. from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
  20. from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
  21. from magic_pdf.pre_proc.detect_header import parse_headers
  22. from magic_pdf.pre_proc.detect_page_number import parse_pageNos
  23. from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
  24. from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
  25. from magic_pdf.pre_proc.ocr_dict_merge import (
  26. merge_spans_to_line_by_layout, merge_lines_to_block,
  27. )
  28. from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
  29. adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
  30. remove_spans_by_bboxes_dict
  31. from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
  32. def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
  33. images, tables, interline_equations, inline_equations,
  34. dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
  35. need_remove_spans_bboxes_dict):
  36. return_dict = {
  37. 'preproc_blocks': blocks,
  38. 'layout_bboxes': layout_bboxes,
  39. 'page_idx': page_id,
  40. 'page_size': [page_w, page_h],
  41. '_layout_tree': layout_tree,
  42. 'images': images,
  43. 'tables': tables,
  44. 'interline_equations': interline_equations,
  45. 'inline_equations': inline_equations,
  46. 'droped_text_block': dropped_text_block,
  47. 'droped_image_block': dropped_image_block,
  48. 'droped_table_block': dropped_table_block,
  49. 'dropped_equation_block': dropped_equation_block,
  50. 'droped_bboxes': need_remove_spans_bboxes_dict,
  51. }
  52. return return_dict
  53. def parse_pdf_by_ocr(
  54. pdf_path,
  55. s3_pdf_profile,
  56. pdf_model_output,
  57. save_path,
  58. book_name,
  59. pdf_model_profile=None,
  60. image_s3_config=None,
  61. start_page_id=0,
  62. end_page_id=None,
  63. debug_mode=False,
  64. ):
  65. pdf_bytes = read_file(pdf_path, s3_pdf_profile)
  66. save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
  67. book_name = sanitize_filename(book_name)
  68. md_bookname_save_path = ""
  69. if debug_mode:
  70. save_path = join_path(save_tmp_path, "md")
  71. pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
  72. if not os.path.exists(os.path.dirname(pdf_local_path)):
  73. # 如果目录不存在,创建它
  74. os.makedirs(os.path.dirname(pdf_local_path))
  75. md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
  76. if not os.path.exists(md_bookname_save_path):
  77. # 如果目录不存在,创建它
  78. os.makedirs(md_bookname_save_path)
  79. with open(pdf_local_path + ".pdf", "wb") as pdf_file:
  80. pdf_file.write(pdf_bytes)
  81. pdf_docs = fitz.open("pdf", pdf_bytes)
  82. # 初始化空的pdf_info_dict
  83. pdf_info_dict = {}
  84. img_s3_client = get_img_s3_client(save_path, image_s3_config)
  85. start_time = time.time()
  86. end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
  87. for page_id in range(start_page_id, end_page_id + 1):
  88. # 获取当前页的page对象
  89. page = pdf_docs[page_id]
  90. # 获取当前页的宽高
  91. page_w = page.rect.width
  92. page_h = page.rect.height
  93. if debug_mode:
  94. time_now = time.time()
  95. logger.info(
  96. f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
  97. )
  98. start_time = time_now
  99. # 获取当前页的模型数据
  100. ocr_page_info = get_docx_model_output(
  101. pdf_model_output, pdf_model_profile, page_id
  102. )
  103. """从json中获取每页的页码、页眉、页脚的bbox"""
  104. page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
  105. header_bboxes = parse_headers(page_id, page, ocr_page_info)
  106. footer_bboxes = parse_footers(page_id, page, ocr_page_info)
  107. footnote_bboxes = parse_footnotes_by_model(
  108. page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
  109. )
  110. # 构建需要remove的bbox字典
  111. need_remove_spans_bboxes_dict = {
  112. DropTag.PAGE_NUMBER: page_no_bboxes,
  113. DropTag.HEADER: header_bboxes,
  114. DropTag.FOOTER: footer_bboxes,
  115. DropTag.FOOTNOTE: footnote_bboxes,
  116. }
  117. layout_dets = ocr_page_info["layout_dets"]
  118. spans = []
  119. # 计算模型坐标和pymu坐标的缩放比例
  120. horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
  121. ocr_page_info, page
  122. )
  123. for layout_det in layout_dets:
  124. category_id = layout_det["category_id"]
  125. allow_category_id_list = [1, 7, 13, 14, 15]
  126. if category_id in allow_category_id_list:
  127. x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
  128. bbox = [
  129. int(x0 / horizontal_scale_ratio),
  130. int(y0 / vertical_scale_ratio),
  131. int(x1 / horizontal_scale_ratio),
  132. int(y1 / vertical_scale_ratio),
  133. ]
  134. # 删除高度或者宽度为0的spans
  135. if bbox[2] - bbox[0] == 0 or bbox[3] - bbox[1] == 0:
  136. continue
  137. """要删除的"""
  138. # 3: 'header', # 页眉
  139. # 4: 'page number', # 页码
  140. # 5: 'footnote', # 脚注
  141. # 6: 'footer', # 页脚
  142. """当成span拼接的"""
  143. # 1: 'image', # 图片
  144. # 7: 'table', # 表格
  145. # 13: 'inline_equation', # 行内公式
  146. # 14: 'interline_equation', # 行间公式
  147. # 15: 'text', # ocr识别文本
  148. """layout信息"""
  149. # 11: 'full column', # 单栏
  150. # 12: 'sub column', # 多栏
  151. span = {
  152. "bbox": bbox,
  153. }
  154. if category_id == 1:
  155. span["type"] = ContentType.Image
  156. elif category_id == 7:
  157. span["type"] = ContentType.Table
  158. elif category_id == 13:
  159. span["content"] = layout_det["latex"]
  160. span["type"] = ContentType.InlineEquation
  161. elif category_id == 14:
  162. span["content"] = layout_det["latex"]
  163. span["type"] = ContentType.InterlineEquation
  164. elif category_id == 15:
  165. span["content"] = layout_det["text"]
  166. span["type"] = ContentType.Text
  167. # print(span)
  168. spans.append(span)
  169. else:
  170. continue
  171. '''删除重叠spans中较小的那些'''
  172. spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
  173. '''
  174. 删除remove_span_block_bboxes中的bbox
  175. 并增加drop相关数据
  176. '''
  177. spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
  178. '''对image和table截图'''
  179. spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
  180. '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
  181. displayed_list = []
  182. text_inline_lines = []
  183. modify_y_axis(spans, displayed_list, text_inline_lines)
  184. '''模型识别错误的行间公式, type类型转换成行内公式'''
  185. spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
  186. '''bbox去除粘连'''
  187. spans = remove_overlap_between_bbox(spans)
  188. '''
  189. 对tpye=["interline_equation", "image", "table"]进行额外处理,
  190. 如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  191. '''
  192. spans = adjust_bbox_for_standalone_block(spans)
  193. '''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
  194. layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
  195. '''将spans合并成line(在layout内,从上到下,从左到右)'''
  196. lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes)
  197. '''将lines合并成block'''
  198. blocks = merge_lines_to_block(lines)
  199. '''获取QA需要外置的list'''
  200. images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
  201. '''drop的span_list合并'''
  202. dropped_spans = []
  203. dropped_spans.extend(dropped_spans_by_span_overlap)
  204. dropped_spans.extend(dropped_spans_by_removed_bboxes)
  205. dropped_spans.extend(dropped_spans_by_layout)
  206. dropped_text_block = []
  207. dropped_image_block = []
  208. dropped_table_block = []
  209. dropped_equation_block = []
  210. for span in dropped_spans:
  211. # drop出的spans进行分类
  212. if span['type'] == ContentType.Text:
  213. dropped_text_block.append(span)
  214. elif span['type'] == ContentType.Image:
  215. dropped_image_block.append(span)
  216. elif span['type'] == ContentType.Table:
  217. dropped_table_block.append(span)
  218. elif span['type'] in [ContentType.InlineEquation, ContentType.InterlineEquation]:
  219. dropped_equation_block.append(span)
  220. '''构造pdf_info_dict'''
  221. page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
  222. images, tables, interline_equations, inline_equations,
  223. dropped_text_block, dropped_image_block, dropped_table_block,
  224. dropped_equation_block,
  225. need_remove_spans_bboxes_dict)
  226. pdf_info_dict[f"page_{page_id}"] = page_info
  227. """分段"""
  228. para_split(pdf_info_dict, debug_mode=debug_mode)
  229. '''在测试时,保存调试信息'''
  230. if debug_mode:
  231. params_file_save_path = join_path(
  232. save_tmp_path, "md", book_name, "preproc_out.json"
  233. )
  234. with open(params_file_save_path, "w", encoding="utf-8") as f:
  235. json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
  236. # drow_bbox
  237. draw_layout_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
  238. draw_text_bbox(pdf_info_dict, pdf_bytes, md_bookname_save_path)
  239. return pdf_info_dict