# pdf_parse_by_ocr.py (2.7 KB)
  1. from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
  2. from magic_pdf.libs.ocr_dict_merge import merge_spans
  3. def construct_page_component(page_id, text_blocks_preproc):
  4. return_dict = {
  5. 'preproc_blocks': text_blocks_preproc,
  6. 'page_idx': page_id
  7. }
  8. return return_dict
  9. def parse_pdf_by_ocr(
  10. ocr_pdf_info,
  11. start_page_id=0,
  12. end_page_id=None,
  13. ):
  14. pdf_info_dict = {}
  15. end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
  16. for page_id in range(start_page_id, end_page_id + 1):
  17. ocr_page_info = ocr_pdf_info[page_id]
  18. layout_dets = ocr_page_info['layout_dets']
  19. spans = []
  20. for layout_det in layout_dets:
  21. category_id = layout_det['category_id']
  22. allow_category_id_list = [13, 14, 15]
  23. if category_id in allow_category_id_list:
  24. x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
  25. bbox = [int(x0), int(y0), int(x1), int(y1)]
  26. # 13: 'embedding', # 嵌入公式
  27. # 14: 'isolated', # 单行公式
  28. # 15: 'ocr_text', # ocr识别文本
  29. span = {
  30. 'bbox': bbox,
  31. }
  32. if category_id == 13:
  33. span['content'] = layout_det['latex']
  34. span['type'] = 'inline_equation'
  35. elif category_id == 14:
  36. span['content'] = layout_det['latex']
  37. span['type'] = 'displayed_equation'
  38. elif category_id == 15:
  39. span['content'] = layout_det['text']
  40. span['type'] = 'text'
  41. # print(span)
  42. spans.append(span)
  43. else:
  44. continue
  45. # 合并重叠的spans
  46. for span1 in spans.copy():
  47. for span2 in spans.copy():
  48. if span1 != span2:
  49. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
  50. if overlap_box is not None:
  51. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  52. if bbox_to_remove is not None:
  53. spans.remove(bbox_to_remove)
  54. # 将spans合并成line
  55. lines = merge_spans(spans)
  56. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  57. blocks = []
  58. for line in lines:
  59. blocks.append({
  60. "bbox": line['bbox'],
  61. "lines": [line],
  62. })
  63. # 构造pdf_info_dict
  64. page_info = construct_page_component(page_id, blocks)
  65. pdf_info_dict[f"page_{page_id}"] = page_info
  66. return pdf_info_dict