# zhengchun / MinerU
import math
import sys
from collections import Counter
from collections import defaultdict

from magic_pdf.para.commons import *

# Switch stdout to UTF-8 at import time. The stream is only reconfigured when it
# actually offers ``reconfigure`` (added to ``io.TextIOWrapper`` in Python 3.7);
# replacement streams such as ``io.StringIO`` do not, and must not break the import.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class HeaderFooterProcessor:
    """Flag blocks whose bounding box recurs near the top or bottom of the pages."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _iter_blocks(result_dict):
        """Yield every block stored under a ``block_*`` key of a ``page_*`` entry."""
        for page_id, blocks in result_dict.items():
            if page_id.startswith("page_"):
                for block_key, block in blocks.items():
                    if block_key.startswith("block_"):
                        yield block

    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
        """
        Return the bboxes that repeat most often inside the top or bottom band of the page.

        Parameters
        ----------
        bboxes : list
            Bounding boxes. Index 1 is compared against the top band and index 3
            against the bottom band (presumably an ``(x0, y0, x1, y1)`` layout).
        page_height : float
            Height of the page.
        position : str, optional
            ``"top"`` selects the top band; any other value selects the bottom
            band. By default "top".
        threshold : float, optional
            Fraction of ``page_height`` that makes up the band, by default 0.25.
        num_bboxes : int, optional
            Maximum number of bboxes to return, by default 3.
        min_frequency : int, optional
            Minimum number of occurrences a bbox needs to be returned, by default 2.

        Returns
        -------
        common_bboxes : list
            Bboxes as tuples, most frequent first; ties keep first-seen order.
        """
        # Keep only the boxes that start in the top band or end in the bottom band.
        if position == "top":
            band_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
        else:
            band_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]

        # Boxes are counted by exact coordinates; tuple() makes list boxes hashable.
        bbox_count = Counter(tuple(bbox) for bbox in band_bboxes)

        # most_common() sorts by descending count, so the slice keeps the top entries.
        return [bbox for bbox, count in bbox_count.most_common() if count >= min_frequency][:num_bboxes]

    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
        """
        Mark the blocks of the document that look like page headers or footers.

        A block is a header (footer) when its bbox matches, within a tolerance of
        one unit per coordinate, one of the most frequent bboxes found in the top
        (bottom) band of the page. The page height is estimated as the largest
        ``bbox[3]`` over all blocks. Detection is skipped, leaving the blocks
        untouched, when there are no blocks or when more than half of them are
        single-line blocks.

        Parameters
        ----------
        result_dict : dict
            Result dictionary. Only ``page_*`` entries and, inside them,
            ``block_*`` entries are inspected. Each block must provide ``bbox``,
            ``X0``, ``X1``, ``avg_char_height`` and ``avg_char_width``.
        similarity_threshold : float, optional
            Currently unused; kept so existing callers keep working.

        Returns
        -------
        result_dict : dict
            The same dictionary; when detection runs, every block gains integer
            ``is_header`` and ``is_footer`` flags (0 or 1).
        """

        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
            # True when every coordinate of bbox is within tolerance of some reference box.
            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

        def is_single_line_block(block):
            # Width comes from X0/X1, height from the bbox.
            block_width = block["X1"] - block["X0"]
            block_height = block["bbox"][3] - block["bbox"][1]

            # At most three average character heights tall and wider than three average characters.
            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

        all_blocks = list(self._iter_blocks(result_dict))

        # If there are no blocks, skip the header and footer detection
        if not all_blocks:
            print("No blocks found. Skipping header/footer detection.")
            return result_dict

        # If most of the blocks are single-line, skip the header and footer detection
        single_line_blocks = sum(1 for block in all_blocks if is_single_line_block(block))
        if single_line_blocks / len(all_blocks) > 0.5:
            return result_dict

        all_bboxes = [block["bbox"] for block in all_blocks]

        # The page height is not given, so use the lowest block edge seen anywhere.
        page_height = max(bbox[3] for bbox in all_bboxes)

        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

        # Detect and mark headers and footers
        for block in all_blocks:
            bbox = block["bbox"]
            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

        return result_dict


class NonHorizontalTextProcessor:
    """Flag rotated blocks that repeat across pages as watermarks or vertical margin notes."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _block_signature(block_data):
        """Return a hashable ``(bbox, text)`` key identifying a block by position and content."""
        # tuple() keeps the key hashable when the bbox is stored as a list.
        return (tuple(block_data["bbox"]), block_data["text"])

    def detect_non_horizontal_texts(self, result_dict):
        """
        This function detects watermarks and vertical margin notes in the document.

        Every block that carries a ``dir`` entry is classified by the absolute angle
        of that vector (presumably the ``(x, y)`` writing direction - confirm with
        the producer of ``dir``):

        * strictly between 5 and 85 degrees: candidate watermark;
        * strictly between 85 and 105 degrees: candidate vertical margin note.

        Candidates are counted per ``(bbox, text)`` pair. A pair becomes a watermark
        (or margin note) when its count is greater than half the number of
        ``page_*`` entries, using integer division. Every block whose
        ``(bbox, text)`` equals such a pair is then flagged, whether or not the
        block itself has a ``dir`` entry.

        Parameters
        ----------
        result_dict : dict
            The result dictionary. Only ``page_*`` entries and, inside them,
            ``block_*`` entries are inspected; each block must provide ``bbox``
            and ``text``.

        Returns
        -------
        result_dict : dict
            The same dictionary; every block gains integer ``is_watermark`` and
            ``is_vertical_margin_note`` flags (0 or 1).
        """
        potential_watermarks = Counter()
        potential_margin_notes = Counter()
        page_count = 0

        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            page_count += 1

            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_") or "dir" not in block_data:
                    continue

                signature = self._block_signature(block_data)
                angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

                if 5 < angle < 85:  # Slanted text: candidate watermark
                    potential_watermarks[signature] += 1

                if 85 < angle < 105:  # Roughly vertical text: candidate margin note
                    potential_margin_notes[signature] += 1

        # A candidate must appear more often than half the number of pages. Only
        # page entries count, so other top-level keys do not raise the bar.
        occurrence_threshold = page_count // 2
        watermarks = {key for key, count in potential_watermarks.items() if count > occurrence_threshold}
        margin_notes = {key for key, count in potential_margin_notes.items() if count > occurrence_threshold}

        # Add watermark and margin-note information to the result dictionary
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                # Same key filter as the counting pass; other page entries are left alone.
                if not block_id.startswith("block_"):
                    continue
                signature = self._block_signature(block_data)
                block_data["is_watermark"] = int(signature in watermarks)
                block_data["is_vertical_margin_note"] = int(signature in margin_notes)

        return result_dict


class NoiseRemover:
    """Drop the blocks that earlier stages flagged as noise."""

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict):
        """
        Build a copy of the page/block structure without the blocks flagged as noise.

        A block is noise when any of its ``is_overlap``, ``is_header``,
        ``is_footer``, ``is_watermark``, ``is_vertical_margin_note`` or
        ``is_block_title`` entries is truthy; missing flags count as unset.
        Only ``page_*`` entries and their ``block_*`` entries are carried over,
        and a page left without blocks is omitted. The kept block objects are
        the originals, not copies.
        """
        noise_flags = (
            "is_overlap",
            "is_header",
            "is_footer",
            "is_watermark",
            "is_vertical_margin_note",
            "is_block_title",
        )

        def is_noise(candidate):
            # One truthy flag is enough to discard the block.
            return any(candidate.get(flag, 0) for flag in noise_flags)

        cleaned = {}
        for page_key, page_blocks in result_dict.items():
            if not page_key.startswith("page_"):
                continue

            kept = {
                name: content
                for name, content in page_blocks.items()
                if name.startswith("block_") and not is_noise(content)
            }
            if kept:
                cleaned[page_key] = kept

        return cleaned