# Copyright (c) Opendatalab. All rights reserved.
from mineru.utils.block_pre_proc import prepare_block_bboxes
from mineru.utils.pipeline_magic_model import MagicModel
from mineru.version import __version__
from mineru.utils.hash_utils import str_md5


def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, lang=None, ocr=False):
    """Collect the layout blocks of one page and merge their bounding boxes.

    Args:
        page_model_info: Model output for the page; handed to ``MagicModel``.
        image_dict: Mapping that must provide the keys ``"scale"``,
            ``"img_pil"`` and ``"img_base64"``.
        page: Page object exposing ``get_size()``, which must yield two
            values (width, height); both are truncated to ``int``.
        image_writer: Not used by the code in this function.
        page_index: Not used by the code in this function.
        lang: Not used by the code in this function.
        ocr: Not used by the code in this function.

    Returns:
        None. NOTE(review): there is no ``return`` statement, so the results
        of ``prepare_block_bboxes`` are discarded and callers receive
        ``None`` -- confirm whether later processing steps are missing here.
    """
    scale = image_dict["scale"]
    # NOTE(review): the next two values are computed but never read below.
    page_pil_img = image_dict["img_pil"]
    page_img_md5 = str_md5(image_dict["img_base64"])
    page_w, page_h = map(int, page.get_size())
    magic_model = MagicModel(page_model_info, scale)

    # Fetch from the magic_model object the block information used later on.
    img_groups = magic_model.get_imgs()
    table_groups = magic_model.get_tables()

    # Group the image and table blocks.
    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
    )
    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
    )

    discarded_blocks = magic_model.get_discarded()
    text_blocks = magic_model.get_text_blocks()
    title_blocks = magic_model.get_title_blocks()
    # get_equations() must return exactly three items; only the interline
    # equations are consumed here.
    _inline_equations, interline_equations, _interline_equation_blocks = magic_model.get_equations()

    # Gather the bboxes of all blocks together. The equation slot is filled
    # with interline_equations, not with the interline equation *blocks*
    # that get_equations() also returns.
    all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
        img_body_blocks,
        img_caption_blocks,
        img_footnote_blocks,
        table_body_blocks,
        table_caption_blocks,
        table_footnote_blocks,
        discarded_blocks,
        text_blocks,
        title_blocks,
        interline_equations,
        page_w,
        page_h,
    )


def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
    """Build the middle-JSON dict by processing every page of a document.

    Args:
        model_list: Sequence of per-page model outputs; its length drives
            the iteration.
        images_list: Indexable collection of per-page image dicts, looked up
            by the same index as ``model_list``.
        pdf_doc: Indexable collection of page objects, looked up by the same
            index as ``model_list``.
        image_writer: Forwarded unchanged to ``page_model_info_to_page_info``.
        lang: Forwarded unchanged to ``page_model_info_to_page_info``.
        ocr: Forwarded unchanged to ``page_model_info_to_page_info``.

    Returns:
        A dict with the keys ``"pdf_info"`` (one entry per page, in order),
        ``"_backend"`` and ``"_version_name"``.
    """
    # NOTE(review): the backend tag is "vlm" although this module builds on
    # pipeline_magic_model -- confirm this is the intended value.
    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
    for page_index, page_model_info in enumerate(model_list):
        page = pdf_doc[page_index]
        image_dict = images_list[page_index]
        page_info = page_model_info_to_page_info(
            page_model_info, image_dict, page, image_writer, page_index, lang=lang, ocr=ocr
        )
        middle_json["pdf_info"].append(page_info)
    return middle_json


def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped blocks into body, caption and footnote lists.

    Every block is tagged in place with a ``'group_id'`` equal to the index
    of the group it came from, so the association survives the flattening.

    Args:
        groups: Iterable of mappings, one per group.
        body_key: Key under which each group stores its single body block.
        caption_key: Key under which each group stores an iterable of
            caption blocks.
        footnote_key: Key under which each group stores an iterable of
            footnote blocks.

    Returns:
        A ``(body_blocks, caption_blocks, footnote_blocks)`` tuple of lists.
        The lists hold the same block objects found in ``groups`` (mutated,
        not copied).
    """
    body_blocks = []
    caption_blocks = []
    footnote_blocks = []
    for i, group in enumerate(groups):
        group[body_key]['group_id'] = i
        body_blocks.append(group[body_key])
        for caption_block in group[caption_key]:
            caption_block['group_id'] = i
            caption_blocks.append(caption_block)
        for footnote_block in group[footnote_key]:
            footnote_block['group_id'] = i
            footnote_blocks.append(footnote_block)
    return body_blocks, caption_blocks, footnote_blocks