model_json_to_middle_json.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from mineru.utils.block_pre_proc import prepare_block_bboxes
  3. from mineru.utils.pipeline_magic_model import MagicModel
  4. from mineru.version import __version__
  5. from mineru.utils.hash_utils import str_md5
  6. def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, lang=None, ocr=False):
  7. scale = image_dict["scale"]
  8. page_pil_img = image_dict["img_pil"]
  9. page_img_md5 = str_md5(image_dict["img_base64"])
  10. page_w, page_h = map(int, page.get_size())
  11. magic_model = MagicModel(page_model_info, scale)
  12. """从magic_model对象中获取后面会用到的区块信息"""
  13. img_groups = magic_model.get_imgs()
  14. table_groups = magic_model.get_tables()
  15. """对image和table的区块分组"""
  16. img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
  17. img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
  18. )
  19. table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
  20. table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
  21. )
  22. discarded_blocks = magic_model.get_discarded()
  23. text_blocks = magic_model.get_text_blocks()
  24. title_blocks = magic_model.get_title_blocks()
  25. inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations()
  26. """将所有区块的bbox整理到一起"""
  27. interline_equation_blocks = []
  28. if len(interline_equation_blocks) > 0:
  29. all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
  30. img_body_blocks, img_caption_blocks, img_footnote_blocks,
  31. table_body_blocks, table_caption_blocks, table_footnote_blocks,
  32. discarded_blocks,
  33. text_blocks,
  34. title_blocks,
  35. interline_equation_blocks,
  36. page_w,
  37. page_h,
  38. )
  39. else:
  40. all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
  41. img_body_blocks, img_caption_blocks, img_footnote_blocks,
  42. table_body_blocks, table_caption_blocks, table_footnote_blocks,
  43. discarded_blocks,
  44. text_blocks,
  45. title_blocks,
  46. interline_equations,
  47. page_w,
  48. page_h,
  49. )
  50. def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
  51. middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
  52. for page_index, page_model_info in enumerate(model_list):
  53. page = pdf_doc[page_index]
  54. image_dict = images_list[page_index]
  55. page_info = page_model_info_to_page_info(
  56. page_model_info, image_dict, page, image_writer, page_index, lang=lang, ocr=ocr
  57. )
  58. middle_json["pdf_info"].append(page_info)
  59. return middle_json
  60. def process_groups(groups, body_key, caption_key, footnote_key):
  61. body_blocks = []
  62. caption_blocks = []
  63. footnote_blocks = []
  64. for i, group in enumerate(groups):
  65. group[body_key]['group_id'] = i
  66. body_blocks.append(group[body_key])
  67. for caption_block in group[caption_key]:
  68. caption_block['group_id'] = i
  69. caption_blocks.append(caption_block)
  70. for footnote_block in group[footnote_key]:
  71. footnote_block['group_id'] = i
  72. footnote_blocks.append(footnote_block)
  73. return body_blocks, caption_blocks, footnote_blocks