pdf_parse_by_ocr.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
  2. from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
  3. def construct_page_component(page_id, blocks, layout_bboxes):
  4. return_dict = {
  5. 'preproc_blocks': blocks,
  6. 'page_idx': page_id,
  7. 'layout_bboxes': layout_bboxes,
  8. }
  9. return return_dict
  10. def parse_pdf_by_ocr(
  11. ocr_pdf_info,
  12. start_page_id=0,
  13. end_page_id=None,
  14. ):
  15. pdf_info_dict = {}
  16. end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
  17. for page_id in range(start_page_id, end_page_id + 1):
  18. ocr_page_info = ocr_pdf_info[page_id]
  19. layout_dets = ocr_page_info['layout_dets']
  20. spans = []
  21. for layout_det in layout_dets:
  22. category_id = layout_det['category_id']
  23. allow_category_id_list = [1, 7, 13, 14, 15]
  24. if category_id in allow_category_id_list:
  25. x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
  26. bbox = [int(x0), int(y0), int(x1), int(y1)]
  27. '''要删除的'''
  28. # 3: 'header', # 页眉
  29. # 4: 'page number', # 页码
  30. # 5: 'footnote', # 脚注
  31. # 6: 'footer', # 页脚
  32. '''当成span拼接的'''
  33. # 1: 'image', # 图片
  34. # 7: 'table', # 表格
  35. # 13: 'inline_equation', # 行内公式
  36. # 14: 'displayed_equation', # 行间公式
  37. # 15: 'text', # ocr识别文本
  38. '''layout信息'''
  39. # 11: 'full column', # 单栏
  40. # 12: 'sub column', # 多栏
  41. span = {
  42. 'bbox': bbox,
  43. }
  44. if category_id == 1:
  45. span['type'] = 'image'
  46. elif category_id == 7:
  47. span['type'] = 'table'
  48. elif category_id == 13:
  49. span['content'] = layout_det['latex']
  50. span['type'] = 'inline_equation'
  51. elif category_id == 14:
  52. span['content'] = layout_det['latex']
  53. span['type'] = 'displayed_equation'
  54. elif category_id == 15:
  55. span['content'] = layout_det['text']
  56. span['type'] = 'text'
  57. # print(span)
  58. spans.append(span)
  59. else:
  60. continue
  61. # 删除重叠spans中较小的那些
  62. spans = remove_overlaps_min_spans(spans)
  63. # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
  64. # 将spans合并成line(从上到下,从左到右)
  65. lines = merge_spans_to_line(spans)
  66. # logger.info(lines)
  67. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  68. blocks = []
  69. for line in lines:
  70. blocks.append({
  71. "bbox": line['bbox'],
  72. "lines": [line],
  73. })
  74. # 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
  75. layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
  76. # 构造pdf_info_dict
  77. page_info = construct_page_component(page_id, blocks, layout_bboxes)
  78. pdf_info_dict[f"page_{page_id}"] = page_info
  79. return pdf_info_dict