| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384 |
- # Copyright (c) Opendatalab. All rights reserved.
- from mineru.utils.block_pre_proc import prepare_block_bboxes
- from mineru.utils.pipeline_magic_model import MagicModel
- from mineru.version import __version__
- from mineru.utils.hash_utils import str_md5
def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, lang=None, ocr=False):
    """Collect the layout blocks of one page and merge their bounding boxes.

    Builds a ``MagicModel`` from ``page_model_info`` and the image scale,
    pulls the image/table groups and the discarded, text, title and equation
    blocks out of it, and passes them to ``prepare_block_bboxes`` together
    with the integer page size.

    Args:
        page_model_info: Model output for this page; handed to ``MagicModel``.
        image_dict: Mapping read for the keys ``"scale"``, ``"img_pil"`` and
            ``"img_base64"``.
        page: Object whose ``get_size()`` result unpacks into two values that
            are converted to ``int`` and used as page width and height.
        image_writer: Not used by the code visible in this function.
        page_index: Not used by the code visible in this function.
        lang: Not used by the code visible in this function.
        ocr: Not used by the code visible in this function.

    Returns:
        None as written.
        NOTE(review): the function ends right after ``prepare_block_bboxes``
        without a ``return``, while ``result_to_middle_json`` appends the
        result to ``pdf_info`` -- confirm whether the remainder of the
        implementation is missing.
    """
    scale = image_dict["scale"]
    # Nothing below reads these two values yet; the lookups are kept so that
    # a missing key still fails at this point exactly as before.
    page_pil_img = image_dict["img_pil"]
    page_img_md5 = str_md5(image_dict["img_base64"])
    page_w, page_h = map(int, page.get_size())

    magic_model = MagicModel(page_model_info, scale)

    # Fetch from the magic_model object the block information used later.
    img_groups = magic_model.get_imgs()
    table_groups = magic_model.get_tables()

    # Group the image and table blocks.
    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
    )
    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
    )

    discarded_blocks = magic_model.get_discarded()
    text_blocks = magic_model.get_text_blocks()
    title_blocks = magic_model.get_title_blocks()

    # get_equations() is unpacked into three values. The third one (the
    # interline equation blocks) is intentionally dropped: the previous code
    # overwrote it with an empty list before testing its length, so the
    # branch that used it could never run and the layout was always built
    # from interline_equations. That dead branch has been removed.
    inline_equations, interline_equations, _ = magic_model.get_equations()

    # Gather the bboxes of all blocks together.
    all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks,
        discarded_blocks,
        text_blocks,
        title_blocks,
        interline_equations,
        page_w,
        page_h,
    )
def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
    """Assemble the middle-JSON dictionary for a whole document.

    For every entry of ``model_list`` the page object and the image
    dictionary at the same index are looked up in ``pdf_doc`` and
    ``images_list`` and passed to ``page_model_info_to_page_info``; the
    results are collected, in order, under the ``"pdf_info"`` key.

    Args:
        model_list: Per-page model outputs; its length drives the loop.
        images_list: Per-page image dictionaries, indexed by page number.
        pdf_doc: Document object indexed by page number.
        image_writer: Forwarded unchanged to the per-page conversion.
        lang: Forwarded unchanged to the per-page conversion.
        ocr: Forwarded unchanged to the per-page conversion.

    Returns:
        A dict with the keys ``"pdf_info"`` (list of per-page results),
        ``"_backend"`` and ``"_version_name"``.
    """
    pdf_info = []
    middle_json = {"pdf_info": pdf_info, "_backend": "vlm", "_version_name": __version__}

    for idx, model_info in enumerate(model_list):
        # Same lookup order as before: the page first, then its image data.
        pdf_page = pdf_doc[idx]
        page_images = images_list[idx]
        pdf_info.append(
            page_model_info_to_page_info(
                model_info, page_images, pdf_page, image_writer, idx, lang=lang, ocr=ocr
            )
        )

    return middle_json
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped blocks into separate body, caption and footnote lists.

    Each group's position in ``groups`` is stored under ``'group_id'`` on its
    body block and on every one of its caption and footnote blocks. The
    blocks are modified in place and the same objects are returned.

    Args:
        groups: Iterable of mappings, one per group.
        body_key: Key of the single body block inside a group.
        caption_key: Key of the iterable of caption blocks inside a group.
        footnote_key: Key of the iterable of footnote blocks inside a group.

    Returns:
        A tuple ``(body_blocks, caption_blocks, footnote_blocks)`` of lists,
        each in group order.
    """
    bodies, captions, footnotes = [], [], []

    for group_id, group in enumerate(groups):
        body = group[body_key]
        body['group_id'] = group_id
        bodies.append(body)

        # Captions are handled before footnotes, as in the original loops.
        for key, bucket in ((caption_key, captions), (footnote_key, footnotes)):
            for block in group[key]:
                block['group_id'] = group_id
                bucket.append(block)

    return bodies, captions, footnotes
|