| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- # Copyright (c) Opendatalab. All rights reserved.
- from mineru.utils.block_pre_proc import prepare_block_bboxes, process_groups
- from mineru.utils.block_sort import sort_blocks_by_bbox
- from mineru.utils.cut_image import cut_image_and_table
- from mineru.utils.pipeline_magic_model import MagicModel
- from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
- from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
- remove_overlaps_min_spans, txt_spans_extract
- from mineru.version import __version__
- from mineru.utils.hash_utils import str_md5
- def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, ocr=False):
- scale = image_dict["scale"]
- page_pil_img = image_dict["img_pil"]
- page_img_md5 = str_md5(image_dict["img_base64"])
- page_w, page_h = map(int, page.get_size())
- magic_model = MagicModel(page_model_info, scale)
- """从magic_model对象中获取后面会用到的区块信息"""
- img_groups = magic_model.get_imgs()
- table_groups = magic_model.get_tables()
- """对image和table的区块分组"""
- img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
- img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
- )
- table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
- table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
- )
- discarded_blocks = magic_model.get_discarded()
- text_blocks = magic_model.get_text_blocks()
- title_blocks = magic_model.get_title_blocks()
- inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations()
- """将所有区块的bbox整理到一起"""
- interline_equation_blocks = []
- if len(interline_equation_blocks) > 0:
- all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
- img_body_blocks, img_caption_blocks, img_footnote_blocks,
- table_body_blocks, table_caption_blocks, table_footnote_blocks,
- discarded_blocks,
- text_blocks,
- title_blocks,
- interline_equation_blocks,
- page_w,
- page_h,
- )
- else:
- all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
- img_body_blocks, img_caption_blocks, img_footnote_blocks,
- table_body_blocks, table_caption_blocks, table_footnote_blocks,
- discarded_blocks,
- text_blocks,
- title_blocks,
- interline_equations,
- page_w,
- page_h,
- )
- """获取所有的spans信息"""
- spans = magic_model.get_all_spans()
- """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
- """顺便删除大水印并保留abandon的span"""
- spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
- """删除重叠spans中置信度较低的那些"""
- spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
- """删除重叠spans中较小的那些"""
- spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
- """根据parse_mode,构造spans,主要是文本类的字符填充"""
- if ocr:
- pass
- else:
- """使用新版本的混合ocr方案."""
- spans = txt_spans_extract(page, spans, page_pil_img, scale)
- """先处理不需要排版的discarded_blocks"""
- discarded_block_with_spans, spans = fill_spans_in_blocks(
- all_discarded_blocks, spans, 0.4
- )
- fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
- """如果当前页面没有有效的bbox则跳过"""
- if len(all_bboxes) == 0:
- return None
- """对image和table截图"""
- for span in spans:
- if span['type'] in ['image', 'table']:
- span = cut_image_and_table(
- span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale
- )
- """span填充进block"""
- block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
- """对block进行fix操作"""
- fix_blocks = fix_block_spans(block_with_spans)
- """同一行被断开的titile合并"""
- # merge_title_blocks(fix_blocks)
- """对block进行排序"""
- sorted_blocks = sort_blocks_by_bbox(fix_blocks, page_w, page_h, footnote_blocks)
- """构造page_info"""
- page_info = make_page_info_dict(sorted_blocks, page_index, page_w, page_h, fix_discarded_blocks)
- return page_info
- def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
- middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
- for page_index, page_model_info in enumerate(model_list):
- page = pdf_doc[page_index]
- image_dict = images_list[page_index]
- page_info = page_model_info_to_page_info(
- page_model_info, image_dict, page, image_writer, page_index, ocr=ocr
- )
- if page_info is None:
- page_w, page_h = map(int, page.get_size())
- page_info = make_page_info_dict([], page_index, page_w, page_h, [])
- middle_json["pdf_info"].append(page_info)
- return middle_json
- def make_page_info_dict(blocks, page_id, page_w, page_h, discarded_blocks):
- return_dict = {
- 'preproc_blocks': blocks,
- 'page_idx': page_id,
- 'page_size': [page_w, page_h],
- 'discarded_blocks': discarded_blocks,
- }
- return return_dict
|