| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246 |
- import math
- from collections import defaultdict
- from magic_pdf.para.commons import *
# Best-effort: switch stdout to UTF-8 so non-ASCII text can be printed.
# NOTE(review): `sys` is not imported explicitly in the visible part of this
# file; presumably it is re-exported by the star import above -- confirm.
# `reconfigure` only exists on `io.TextIOWrapper` streams (Python 3.7+), so it
# is looked up defensively: a replaced stdout (e.g. `io.StringIO`) or a missing
# stdout (`None`) must not make importing this module fail.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")
class HeaderFooterProcessor:
    """Flag blocks whose bounding box recurs near the top or bottom of pages."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _iter_blocks(result_dict):
        """Yield each block stored under a "page_*" key and a "block_*" sub-key."""
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_key, block in blocks.items():
                if block_key.startswith("block_"):
                    yield block

    def get_most_common_bboxes(
        self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2
    ):
        """
        Return the most frequently repeated bboxes inside the top or bottom band.

        Parameters
        ----------
        bboxes : list
            Bounding boxes, each indexable as (x0, y0, x1, y1).
        page_height : float
            Height of the page, used to size the band.
        position : str, optional
            "top" keeps bboxes whose y0 lies above ``page_height * threshold``;
            any other value keeps bboxes whose y1 lies below
            ``page_height * (1 - threshold)``. By default "top".
        threshold : float, optional
            Fraction of the page height that forms the band, by default 0.25.
        num_bboxes : int, optional
            Maximum number of bboxes to return, by default 3.
        min_frequency : int, optional
            Minimum number of occurrences for a bbox to qualify, by default 2.

        Returns
        -------
        common_bboxes : list
            Up to ``num_bboxes`` bboxes (as tuples), most frequent first.
        """
        # Filter bboxes by position
        if position == "top":
            limit = page_height * threshold
            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < limit]
        else:
            limit = page_height * (1 - threshold)
            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > limit]

        # Count exact repetitions; tuple() makes list bboxes hashable
        bbox_count = defaultdict(int)
        for bbox in filtered_bboxes:
            bbox_count[tuple(bbox)] += 1

        # sorted() is stable, so equally frequent bboxes keep first-seen order
        ranked = sorted(bbox_count.items(), key=lambda item: item[1], reverse=True)
        return [bbox for bbox, count in ranked if count >= min_frequency][:num_bboxes]

    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
        """
        Mark every block with ``is_header`` / ``is_footer`` flags (1 or 0).

        Detection is skipped, and ``result_dict`` returned unchanged, when
        there are no blocks or when more than half of the blocks look like
        single lines. Blocks are modified in place.

        Parameters
        ----------
        result_dict : dict
            Mapping of "page_*" keys to dicts of "block_*" entries. Each block
            must provide "bbox", "X0", "X1", "avg_char_height" and
            "avg_char_width"; other keys are ignored.
        similarity_threshold : float, optional
            Currently unused; kept so existing callers keep working.

        Returns
        -------
        result_dict : dict
            The same dictionary object that was passed in.
        """

        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
            # True when every coordinate is within `tolerance` of some common bbox
            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

        def is_single_line_block(block):
            # NOTE(review): width comes from the "X0"/"X1" keys while height
            # comes from "bbox"; presumably both describe the same box -- confirm.
            block_width = block["X1"] - block["X0"]
            block_height = block["bbox"][3] - block["bbox"][1]

            # At most 3 average character heights tall and more than 3 average
            # character widths wide counts as a single line
            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

        # Collect the blocks once; the dicts are shared with result_dict, so the
        # flags written below are visible to the caller
        blocks = list(self._iter_blocks(result_dict))

        # If there are no blocks, skip the header and footer detection
        if not blocks:
            print("No blocks found. Skipping header/footer detection.")
            return result_dict

        # If most of the blocks are single-line, skip the header and footer detection
        single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))
        if single_line_blocks / len(blocks) > 0.5:  # more than 50% of the blocks are single-line
            return result_dict

        all_bboxes = [block["bbox"] for block in blocks]

        # The page height is approximated by the lowest bottom edge of any block
        page_height = max(bbox[3] for bbox in all_bboxes)

        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

        # Detect and mark headers and footers
        for block in blocks:
            bbox = block["bbox"]
            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

        return result_dict
class NonHorizontalTextProcessor:
    """Flag repeated slanted text (watermarks) and repeated vertical text (margin notes)."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _block_signature(block_data):
        """Return a hashable (bbox, text) key; the bbox is coerced to a tuple."""
        return (tuple(block_data["bbox"]), block_data["text"])

    def detect_non_horizontal_texts(self, result_dict):
        """
        Mark every block with ``is_watermark`` / ``is_vertical_margin_note`` flags.

        A block is a candidate when it has a "dir" vector: a direction strictly
        between 5 and 85 degrees (absolute value) counts towards watermarks, and
        one strictly between 85 and 105 degrees counts towards vertical margin
        notes. Candidates are grouped by identical (bbox, text). A group is
        accepted when it occurs more often than half the number of pages
        (integer division). Every block whose (bbox, text) matches an accepted
        group is flagged 1; all other blocks are flagged 0. Blocks are modified
        in place.

        Parameters
        ----------
        result_dict : dict
            Mapping of "page_*" keys to dicts of "block_*" entries. Each block
            must provide "bbox" and "text"; "dir" is optional. Keys without
            these prefixes are ignored.

        Returns
        -------
        result_dict : dict
            The same dictionary object that was passed in.
        """
        potential_watermarks = defaultdict(int)
        potential_margin_notes = defaultdict(int)
        page_count = 0

        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            page_count += 1
            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_") or "dir" not in block_data:
                    continue

                signature = self._block_signature(block_data)
                direction = block_data["dir"]
                angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

                if 5 < angle < 85:  # slanted text: watermark candidate
                    potential_watermarks[signature] += 1
                elif 85 < angle < 105:  # roughly vertical text: margin-note candidate
                    potential_margin_notes[signature] += 1

        # Only "page_*" entries count towards the threshold, so unrelated
        # top-level keys cannot raise it
        threshold = page_count // 2
        watermarks = {key for key, count in potential_watermarks.items() if count > threshold}
        margin_notes = {key for key, count in potential_margin_notes.items() if count > threshold}

        # Write both flags on every block, including blocks without "dir"
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                # Skip non-block page entries, matching the counting pass above
                if not block_id.startswith("block_"):
                    continue
                signature = self._block_signature(block_data)
                block_data["is_watermark"] = int(signature in watermarks)
                block_data["is_vertical_margin_note"] = int(signature in margin_notes)

        return result_dict
class NoiseRemover:
    """Build a copy of the result dictionary without blocks flagged as noise."""

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict):
        """
        Return a new dictionary containing only the blocks not flagged as noise.

        A block is dropped when any of its overlap, header, footer, watermark,
        vertical margin note or block title flags is truthy; a missing flag
        counts as not set. Only "page_*" keys and their "block_*" entries are
        carried over, and a page left with no blocks is omitted. The kept block
        dicts are the same objects as in ``result_dict``.
        """
        noise_flags = (
            "is_overlap",
            "is_header",
            "is_footer",
            "is_watermark",
            "is_vertical_margin_note",
            "is_block_title",
        )

        cleaned = {}
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue

            kept = {
                block_id: block
                for block_id, block in blocks.items()
                if block_id.startswith("block_") and not any(block.get(flag, 0) for flag in noise_flags)
            }

            # Pages with nothing left are not added at all
            if kept:
                cleaned[page_id] = kept

        return cleaned
|