zhengchun
/
MinerU


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
							
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
                                    calculate_iou,
                                    calculate_overlap_area_in_bbox1_area_ratio,
                                    get_minbox_if_overlap_by_ratio)


def remove_overlaps_low_confidence_spans(spans):
    dropped_spans = []
    #  删除重叠spans中置信度低的的那些
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # span1 或 span2 任何一个都不应该在 dropped_spans 中
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                        if span1['score'] < span2['score']:
                            span_need_remove = span1
                        else:
                            span_need_remove = span2
                        if (
                            span_need_remove is not None
                            and span_need_remove not in dropped_spans
                        ):
                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def remove_overlaps_min_spans(spans):
    dropped_spans = []
    #  删除重叠spans中较小的那些
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # span1 或 span2 任何一个都不应该在 dropped_spans 中
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                    if overlap_box is not None:
                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if span_need_remove is not None and span_need_remove not in dropped_spans:
                            dropped_spans.append(span_need_remove)
    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    # 遍历spans, 判断是否在removed_span_block_bboxes中
    # 如果是, 则删除该span 否则, 保留该span
    need_remove_spans = []
    for span in spans:
        for removed_bbox in need_remove_spans_bboxes:
            if (
                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
                > 0.5
            ):
                if span not in need_remove_spans:
                    need_remove_spans.append(span)
                    break

    if len(need_remove_spans) > 0:
        for span in need_remove_spans:
            spans.remove(span)

    return spans


def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    dropped_spans = []
    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
        # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
        need_remove_spans = []
        for span in spans:
            # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
            for removed_bbox in removed_bboxes:
                if (
                    calculate_overlap_area_in_bbox1_area_ratio(
                        span['bbox'], removed_bbox
                    )
                    > 0.5
                ):
                    need_remove_spans.append(span)
                    break
                # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方，如果是,则删除该span
                elif (
                    drop_tag == DropTag.FOOTNOTE
                    and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
                    and removed_bbox[0]
                    < (span['bbox'][0] + span['bbox'][2]) / 2
                    < removed_bbox[2]
                ):
                    need_remove_spans.append(span)
                    break

        for span in need_remove_spans:
            spans.remove(span)
            span['tag'] = drop_tag
            dropped_spans.append(span)

    return spans, dropped_spans


def adjust_bbox_for_standalone_block(spans):
    # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
    for sb_span in spans:
        if sb_span['type'] in [
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        ]:
            for text_span in spans:
                if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
                    # 判断span2的纵向高度是否被span所覆盖
                    if (
                        sb_span['bbox'][1] < text_span['bbox'][1]
                        and sb_span['bbox'][3] > text_span['bbox'][3]
                    ):
                        # 判断span2是否在span左边
                        if text_span['bbox'][0] < sb_span['bbox'][0]:
                            # 调整span的y0和span2的y0一致
                            sb_span['bbox'][1] = text_span['bbox'][1]
    return spans


def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    # displayed_list = []
    # 如果spans为空,则不处理
    if len(spans) == 0:
        pass
    else:
        spans.sort(key=lambda span: span['bbox'][1])

        lines = []
        current_line = [spans[0]]
        if spans[0]['type'] in [
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        ]:
            displayed_list.append(spans[0])

        line_first_y0 = spans[0]['bbox'][1]
        line_first_y = spans[0]['bbox'][3]
        # 用于给行间公式搜索
        # text_inline_lines = []
        for span in spans[1:]:
            # if span.get("content","") == "78.":
            #     print("debug")
            # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
            # image和table类型，同上
            if span['type'] in [
                ContentType.InterlineEquation,
                ContentType.Image,
                ContentType.Table,
            ] or any(
                s['type']
                in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
                for s in current_line
            ):
                # 传入
                if span['type'] in [
                    ContentType.InterlineEquation,
                    ContentType.Image,
                    ContentType.Table,
                ]:
                    displayed_list.append(span)
                # 则开始新行
                lines.append(current_line)
                if len(current_line) > 1 or current_line[0]['type'] in [
                    ContentType.Text,
                    ContentType.InlineEquation,
                ]:
                    text_inline_lines.append(
                        (current_line, (line_first_y0, line_first_y))
                    )
                current_line = [span]
                line_first_y0 = span['bbox'][1]
                line_first_y = span['bbox'][3]
                continue

            # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
            if __is_overlaps_y_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox']
            ):
                if span['type'] == 'text':
                    line_first_y0 = span['bbox'][1]
                    line_first_y = span['bbox'][3]
                current_line.append(span)

            else:
                # 否则，开始新行
                lines.append(current_line)
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
                current_line = [span]
                line_first_y0 = span['bbox'][1]
                line_first_y = span['bbox'][3]

            # 添加最后一行
        if current_line:
            lines.append(current_line)
            if len(current_line) > 1 or current_line[0]['type'] in [
                ContentType.Text,
                ContentType.InlineEquation,
            ]:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
        for line in text_inline_lines:
            # 按照x0坐标排序
            current_line = line[0]
            current_line.sort(key=lambda span: span['bbox'][0])

        # 调整每一个文字行内bbox统一
        for line in text_inline_lines:
            current_line, (line_first_y0, line_first_y) = line
            for span in current_line:
                span['bbox'][1] = line_first_y0
                span['bbox'][3] = line_first_y

        # return spans, displayed_list, text_inline_lines


def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    # 错误行间公式转行内公式
    j = 0
    for i in range(len(displayed_list)):
        # if i == 8:
        #     print("debug")
        span = displayed_list[i]
        span_y0, span_y = span['bbox'][1], span['bbox'][3]

        while j < len(text_inline_lines):
            text_line = text_inline_lines[j]
            y0, y1 = text_line[1]
            if (
                span_y0 < y0 < span_y
                or span_y0 < y1 < span_y
                or span_y0 < y0
                and span_y > y1
            ) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                # 调整公式类型
                if span['type'] == ContentType.InterlineEquation:
                    # 最后一行是行间公式
                    if j + 1 >= len(text_inline_lines):
                        span['type'] = ContentType.InlineEquation
                        span['bbox'][1] = y0
                        span['bbox'][3] = y1
                    else:
                        # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        if (
                            not __is_overlaps_y_exceeds_threshold(
                                span['bbox'], (0, y0_next, 0, y1_next)
                            )
                            and 3 * (y1 - y0) > span_y - span_y0
                        ):
                            span['type'] = ContentType.InlineEquation
                            span['bbox'][1] = y0
                            span['bbox'][3] = y1
                break
            elif (
                span_y < y0
                or span_y0 < y0 < span_y
                and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
            ):
                break
            else:
                j += 1

    return spans


def get_qa_need_list(blocks):
    # 创建 images, tables, interline_equations, inline_equations 的副本
    images = []
    tables = []
    interline_equations = []
    inline_equations = []

    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] == ContentType.Image:
                    images.append(span)
                elif span['type'] == ContentType.Table:
                    tables.append(span)
                elif span['type'] == ContentType.InlineEquation:
                    inline_equations.append(span)
                elif span['type'] == ContentType.InterlineEquation:
                    interline_equations.append(span)
                else:
                    continue
    return images, tables, interline_equations, inline_equations


def get_qa_need_list_v2(blocks):
    # 创建 images, tables, interline_equations, inline_equations 的副本
    images = []
    tables = []
    interline_equations = []

    for block in blocks:
        if block['type'] == BlockType.Image:
            images.append(block)
        elif block['type'] == BlockType.Table:
            tables.append(block)
        elif block['type'] == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations