| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952 |
- # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- __all__ = [
- "get_sub_regions_ocr_res",
- "get_show_color",
- "sorted_layout_boxes",
- ]
- import re
- from copy import deepcopy
- from typing import Dict, List, Optional, Tuple, Union
- import numpy as np
- from PIL import Image
- from ..components import convert_points_to_boxes
- from ..ocr.result import OCRResult
- from .setting import REGION_SETTINGS
- def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
- """
- Get the indices of source boxes that overlap with reference boxes based on a specified threshold.
- Args:
- src_boxes (np.ndarray): A 2D numpy array of source bounding boxes.
- ref_boxes (np.ndarray): A 2D numpy array of reference bounding boxes.
- Returns:
- match_idx_list (list): A list of indices of source boxes that overlap with reference boxes.
- """
- match_idx_list = []
- src_boxes_num = len(src_boxes)
- if src_boxes_num > 0 and len(ref_boxes) > 0:
- for rno in range(len(ref_boxes)):
- ref_box = ref_boxes[rno]
- x1 = np.maximum(ref_box[0], src_boxes[:, 0])
- y1 = np.maximum(ref_box[1], src_boxes[:, 1])
- x2 = np.minimum(ref_box[2], src_boxes[:, 2])
- y2 = np.minimum(ref_box[3], src_boxes[:, 3])
- pub_w = x2 - x1
- pub_h = y2 - y1
- match_idx = np.where((pub_w > 3) & (pub_h > 3))[0]
- match_idx_list.extend(match_idx)
- return match_idx_list
- def get_sub_regions_ocr_res(
- overall_ocr_res: OCRResult,
- object_boxes: List,
- flag_within: bool = True,
- return_match_idx: bool = False,
- ) -> OCRResult:
- """
- Filters OCR results to only include text boxes within specified object boxes based on a flag.
- Args:
- overall_ocr_res (OCRResult): The original OCR result containing all text boxes.
- object_boxes (list): A list of bounding boxes for the objects of interest.
- flag_within (bool): If True, only include text boxes within the object boxes. If False, exclude text boxes within the object boxes.
- return_match_idx (bool): If True, return the list of matching indices.
- Returns:
- OCRResult: A filtered OCR result containing only the relevant text boxes.
- """
- sub_regions_ocr_res = {}
- sub_regions_ocr_res["rec_polys"] = []
- sub_regions_ocr_res["rec_texts"] = []
- sub_regions_ocr_res["rec_scores"] = []
- sub_regions_ocr_res["rec_boxes"] = []
- overall_text_boxes = overall_ocr_res["rec_boxes"]
- match_idx_list = get_overlap_boxes_idx(overall_text_boxes, object_boxes)
- match_idx_list = list(set(match_idx_list))
- for box_no in range(len(overall_text_boxes)):
- if flag_within:
- if box_no in match_idx_list:
- flag_match = True
- else:
- flag_match = False
- else:
- if box_no not in match_idx_list:
- flag_match = True
- else:
- flag_match = False
- if flag_match:
- sub_regions_ocr_res["rec_polys"].append(
- overall_ocr_res["rec_polys"][box_no]
- )
- sub_regions_ocr_res["rec_texts"].append(
- overall_ocr_res["rec_texts"][box_no]
- )
- sub_regions_ocr_res["rec_scores"].append(
- overall_ocr_res["rec_scores"][box_no]
- )
- sub_regions_ocr_res["rec_boxes"].append(
- overall_ocr_res["rec_boxes"][box_no]
- )
- for key in ["rec_polys", "rec_scores", "rec_boxes"]:
- sub_regions_ocr_res[key] = np.array(sub_regions_ocr_res[key])
- return (
- (sub_regions_ocr_res, match_idx_list)
- if return_match_idx
- else sub_regions_ocr_res
- )
- def sorted_layout_boxes(res, w):
- """
- Sort text boxes in order from top to bottom, left to right
- Args:
- res: List of dictionaries containing layout information.
- w: Width of image.
- Returns:
- List of dictionaries containing sorted layout information.
- """
- num_boxes = len(res)
- if num_boxes == 1:
- return res
- # Sort on the y axis first or sort it on the x axis
- sorted_boxes = sorted(res, key=lambda x: (x["block_bbox"][1], x["block_bbox"][0]))
- _boxes = list(sorted_boxes)
- new_res = []
- res_left = []
- res_right = []
- i = 0
- while True:
- if i >= num_boxes:
- break
- # Check that the bbox is on the left
- elif (
- _boxes[i]["block_bbox"][0] < w / 4
- and _boxes[i]["block_bbox"][2] < 3 * w / 5
- ):
- res_left.append(_boxes[i])
- i += 1
- elif _boxes[i]["block_bbox"][0] > 2 * w / 5:
- res_right.append(_boxes[i])
- i += 1
- else:
- new_res += res_left
- new_res += res_right
- new_res.append(_boxes[i])
- res_left = []
- res_right = []
- i += 1
- res_left = sorted(res_left, key=lambda x: (x["block_bbox"][1]))
- res_right = sorted(res_right, key=lambda x: (x["block_bbox"][1]))
- if res_left:
- new_res += res_left
- if res_right:
- new_res += res_right
- return new_res
- def calculate_projection_overlap_ratio(
- bbox1: List[float],
- bbox2: List[float],
- direction: str = "horizontal",
- mode="union",
- ) -> float:
- """
- Calculate the IoU of lines between two bounding boxes.
- Args:
- bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
- bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
- direction (str): direction of the projection, "horizontal" or "vertical".
- Returns:
- float: Line overlap ratio. Returns 0 if there is no overlap.
- """
- start_index, end_index = 1, 3
- if direction == "horizontal":
- start_index, end_index = 0, 2
- intersection_start = max(bbox1[start_index], bbox2[start_index])
- intersection_end = min(bbox1[end_index], bbox2[end_index])
- overlap = intersection_end - intersection_start
- if overlap <= 0:
- return 0
- if mode == "union":
- ref_width = max(bbox1[end_index], bbox2[end_index]) - min(
- bbox1[start_index], bbox2[start_index]
- )
- elif mode == "small":
- ref_width = min(
- bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
- )
- elif mode == "large":
- ref_width = max(
- bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
- )
- else:
- raise ValueError(
- f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
- )
- return overlap / ref_width if ref_width > 0 else 0.0
- def calculate_overlap_ratio(
- bbox1: Union[list, tuple], bbox2: Union[list, tuple], mode="union"
- ) -> float:
- """
- Calculate the overlap ratio between two bounding boxes.
- Args:
- bbox1 (list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
- bbox2 (list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
- mode (str): The mode of calculation, either 'union', 'small', or 'large'.
- Returns:
- float: The overlap ratio value between the two bounding boxes
- """
- x_min_inter = max(bbox1[0], bbox2[0])
- y_min_inter = max(bbox1[1], bbox2[1])
- x_max_inter = min(bbox1[2], bbox2[2])
- y_max_inter = min(bbox1[3], bbox2[3])
- inter_width = max(0, x_max_inter - x_min_inter)
- inter_height = max(0, y_max_inter - y_min_inter)
- inter_area = inter_width * inter_height
- bbox1_area = caculate_bbox_area(bbox1)
- bbox2_area = caculate_bbox_area(bbox2)
- if mode == "union":
- ref_area = bbox1_area + bbox2_area - inter_area
- elif mode == "small":
- ref_area = min(bbox1_area, bbox2_area)
- elif mode == "large":
- ref_area = max(bbox1_area, bbox2_area)
- else:
- raise ValueError(
- f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
- )
- if ref_area == 0:
- return 0.0
- return inter_area / ref_area
- def group_boxes_into_lines(ocr_rec_res, line_height_iou_threshold):
- rec_boxes = ocr_rec_res["boxes"]
- rec_texts = ocr_rec_res["rec_texts"]
- rec_labels = ocr_rec_res["rec_labels"]
- text_boxes = [
- rec_boxes[i] for i in range(len(rec_boxes)) if rec_labels[i] == "text"
- ]
- text_orientation = calculate_text_orientation(text_boxes)
- match_direction = "vertical" if text_orientation == "horizontal" else "horizontal"
- line_start_index = 1 if text_orientation == "horizontal" else 0
- line_end_index = 3 if text_orientation == "horizontal" else 2
- spans = list(zip(rec_boxes, rec_texts, rec_labels))
- sort_index = 1
- reverse = False
- if text_orientation == "vertical":
- sort_index = 0
- reverse = True
- spans.sort(key=lambda span: span[0][sort_index], reverse=reverse)
- spans = [list(span) for span in spans]
- lines = []
- line = [spans[0]]
- line_region_box = spans[0][0].copy()
- line_heights = []
- # merge line
- for span in spans[1:]:
- rec_bbox = span[0]
- if (
- calculate_projection_overlap_ratio(
- line_region_box, rec_bbox, match_direction, mode="small"
- )
- >= line_height_iou_threshold
- ):
- line.append(span)
- line_region_box[line_start_index] = min(
- line_region_box[line_start_index], rec_bbox[line_start_index]
- )
- line_region_box[line_end_index] = max(
- line_region_box[line_end_index], rec_bbox[line_end_index]
- )
- else:
- line_heights.append(
- line_region_box[line_end_index] - line_region_box[line_start_index]
- )
- lines.append(line)
- line = [span]
- line_region_box = rec_bbox.copy()
- lines.append(line)
- line_heights.append(
- line_region_box[line_end_index] - line_region_box[line_start_index]
- )
- min_height = min(line_heights) if line_heights else 0
- max_height = max(line_heights) if line_heights else 0
- if max_height > min_height * 2 and text_orientation == "vertical":
- line_heights = np.array(line_heights)
- min_height_num = np.sum(line_heights < min_height * 1.1)
- if min_height_num < len(lines) * 0.4:
- condition = line_heights > min_height * 1.1
- lines = [value for value, keep in zip(lines, condition) if keep]
- return lines, text_orientation, np.mean(line_heights)
- def calculate_minimum_enclosing_bbox(bboxes):
- """
- Calculate the minimum enclosing bounding box for a list of bounding boxes.
- Args:
- bboxes (list): A list of bounding boxes represented as lists of four integers [x1, y1, x2, y2].
- Returns:
- list: The minimum enclosing bounding box represented as a list of four integers [x1, y1, x2, y2].
- """
- if not bboxes:
- raise ValueError("The list of bounding boxes is empty.")
- # Convert the list of bounding boxes to a NumPy array
- bboxes_array = np.array(bboxes)
- # Compute the minimum and maximum values along the respective axes
- min_x = np.min(bboxes_array[:, 0])
- min_y = np.min(bboxes_array[:, 1])
- max_x = np.max(bboxes_array[:, 2])
- max_y = np.max(bboxes_array[:, 3])
- # Return the minimum enclosing bounding box
- return [min_x, min_y, max_x, max_y]
- def calculate_text_orientation(
- bboxes: List[List[int]], orientation_ratio: float = 1.5
- ) -> bool:
- """
- Calculate the orientation of the text based on the bounding boxes.
- Args:
- bboxes (list): A list of bounding boxes.
- orientation_ratio (float): Ratio for determining orientation. Default is 1.5.
- Returns:
- str: "horizontal" or "vertical".
- """
- horizontal_box_num = 0
- for bbox in bboxes:
- if len(bbox) != 4:
- raise ValueError(
- "Invalid bounding box format. Expected a list of length 4."
- )
- x1, y1, x2, y2 = bbox
- width = x2 - x1
- height = y2 - y1
- horizontal_box_num += 1 if width * orientation_ratio >= height else 0
- return "horizontal" if horizontal_box_num >= len(bboxes) * 0.5 else "vertical"
- def is_english_letter(char):
- return bool(re.match(r"^[A-Za-z]$", char))
- def is_numeric(char):
- return bool(re.match(r"^[\d.]+$", char))
- def is_non_breaking_punctuation(char):
- """
- 判断一个字符是否是不需要换行的标点符号,包括全角和半角的符号。
- :param char: str, 单个字符
- :return: bool, 如果字符是不需要换行的标点符号,返回True,否则返回False
- """
- non_breaking_punctuations = {
- ",", # 半角逗号
- ",", # 全角逗号
- "、", # 顿号
- ";", # 半角分号
- ";", # 全角分号
- ":", # 半角冒号
- ":", # 全角冒号
- "-", # 连字符
- }
- return char in non_breaking_punctuations
- def format_line(
- line: List[List[Union[List[int], str]]],
- text_direction: int,
- block_width: int,
- block_start_coordinate: int,
- block_stop_coordinate: int,
- line_gap_limit: int = 10,
- block_label: str = "text",
- ) -> None:
- """
- Format a line of text spans based on layout constraints.
- Args:
- line (list): A list of spans, where each span is a list containing a bounding box and text.
- block_left_coordinate (int): The text line directional minimum coordinate of the layout bounding box.
- block_stop_coordinate (int): The text line directional maximum x-coordinate of the layout bounding box.
- first_line_span_limit (int): The limit for the number of pixels before the first span that should be considered part of the first line. Default is 10.
- line_gap_limit (int): The limit for the number of pixels after the last span that should be considered part of the last line. Default is 10.
- block_label (str): The label associated with the entire block. Default is 'text'.
- Returns:
- None: The function modifies the line in place.
- """
- first_span_box = line[0][0]
- last_span_box = line[-1][0]
- for span in line:
- if span[2] == "formula" and block_label != "formula":
- if len(line) > 1:
- span[1] = f"${span[1]}$"
- else:
- span[1] = f"\n${span[1]}$"
- line_text = ""
- for span in line:
- _, text, label = span
- line_text += text
- if len(text) > 0 and is_english_letter(line_text[-1]) or label == "formula":
- line_text += " "
- if text_direction == "horizontal":
- text_start_index = 0
- text_stop_index = 2
- else:
- text_start_index = 1
- text_stop_index = 3
- need_new_line = False
- if (
- len(line_text) > 0
- and not is_english_letter(line_text[-1])
- and not is_non_breaking_punctuation(line_text[-1])
- ):
- if (
- text_direction == "horizontal"
- and block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
- ) or (
- text_direction == "vertical"
- and (
- block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
- or first_span_box[1] - block_start_coordinate > line_gap_limit
- )
- ):
- need_new_line = True
- if line_text.endswith("-"):
- line_text = line_text[:-1]
- elif (
- len(line_text) > 0 and is_english_letter(line_text[-1])
- ) or line_text.endswith("$"):
- line_text += " "
- elif (
- len(line_text) > 0
- and not is_english_letter(line_text[-1])
- and not is_non_breaking_punctuation(line_text[-1])
- and not is_numeric(line_text[-1])
- ) or text_direction == "vertical":
- if block_stop_coordinate - last_span_box[text_stop_index] > block_width * 0.4:
- line_text += "\n"
- if (
- first_span_box[text_start_index] - block_start_coordinate
- > block_width * 0.4
- ):
- line_text = "\n" + line_text
- return line_text, need_new_line
- def split_boxes_by_projection(spans: List[List[int]], direction, offset=1e-5):
- """
- Check if there is any complete containment in the x-direction
- between the bounding boxes and split the containing box accordingly.
- Args:
- spans (list of lists): Each element is a list containing an ndarray of length 4, a text string, and a label.
- direction: 'horizontal' or 'vertical', indicating whether the spans are arranged horizontally or vertically.
- offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
- Returns:
- A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
- """
- def is_projection_contained(box_a, box_b, start_idx, end_idx):
- """Check if box_a completely contains box_b in the x-direction."""
- return box_a[start_idx] <= box_b[start_idx] and box_a[end_idx] >= box_b[end_idx]
- new_boxes = []
- if direction == "horizontal":
- projection_start_index, projection_end_index = 0, 2
- else:
- projection_start_index, projection_end_index = 1, 3
- for i in range(len(spans)):
- span = spans[i]
- is_split = False
- for j in range(i, len(spans)):
- box_b = spans[j][0]
- box_a, text, label = span
- if is_projection_contained(
- box_a, box_b, projection_start_index, projection_end_index
- ):
- is_split = True
- # Split box_a based on the x-coordinates of box_b
- if box_a[projection_start_index] < box_b[projection_start_index]:
- w = (
- box_b[projection_start_index]
- - offset
- - box_a[projection_start_index]
- )
- if w > 1:
- new_bbox = box_a.copy()
- new_bbox[projection_end_index] = (
- box_b[projection_start_index] - offset
- )
- new_boxes.append(
- [
- np.array(new_bbox),
- text,
- label,
- ]
- )
- if box_a[projection_end_index] > box_b[projection_end_index]:
- w = (
- box_a[projection_end_index]
- - box_b[projection_end_index]
- + offset
- )
- if w > 1:
- box_a[projection_start_index] = (
- box_b[projection_end_index] + offset
- )
- span = [
- np.array(box_a),
- text,
- label,
- ]
- if j == len(spans) - 1 and is_split:
- new_boxes.append(span)
- if not is_split:
- new_boxes.append(span)
- return new_boxes
- def remove_extra_space(input_text: str) -> str:
- """
- Process the input text to handle spaces.
- The function removes multiple consecutive spaces between Chinese characters and ensures that
- only a single space is retained between Chinese and non-Chinese characters.
- Args:
- input_text (str): The text to be processed.
- Returns:
- str: The processed text with properly formatted spaces.
- """
- # Remove spaces between Chinese characters
- text_without_spaces = re.sub(
- r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", input_text
- )
- # Ensure single space between Chinese and non-Chinese characters
- text_with_single_spaces = re.sub(
- r"(?<=[\u4e00-\u9fff])\s+(?=[^\u4e00-\u9fff])|(?<=[^\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])",
- " ",
- text_without_spaces,
- )
- # Reduce any remaining consecutive spaces to a single space
- final_text = re.sub(r"\s+", " ", text_with_single_spaces).strip()
- return final_text
- def gather_imgs(original_img, layout_det_objs):
- imgs_in_doc = []
- for det_obj in layout_det_objs:
- if det_obj["label"] in ("image", "chart", "seal", "formula", "table"):
- x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"]))
- img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
- img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1])
- imgs_in_doc.append(
- {
- "path": img_path,
- "img": img,
- "coordinate": (x_min, y_min, x_max, y_max),
- "score": det_obj["score"],
- }
- )
- return imgs_in_doc
- def _get_minbox_if_overlap_by_ratio(
- bbox1: Union[List[int], Tuple[int, int, int, int]],
- bbox2: Union[List[int], Tuple[int, int, int, int]],
- ratio: float,
- smaller: bool = True,
- ) -> Optional[Union[List[int], Tuple[int, int, int, int]]]:
- """
- Determine if the overlap area between two bounding boxes exceeds a given ratio
- and return the smaller (or larger) bounding box based on the `smaller` flag.
- Args:
- bbox1 (Union[List[int], Tuple[int, int, int, int]]): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
- bbox2 (Union[List[int], Tuple[int, int, int, int]]): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
- ratio (float): The overlap ratio threshold.
- smaller (bool): If True, return the smaller bounding box; otherwise, return the larger one.
- Returns:
- Optional[Union[List[int], Tuple[int, int, int, int]]]:
- The selected bounding box or None if the overlap ratio is not exceeded.
- """
- # Calculate the areas of both bounding boxes
- area1 = caculate_bbox_area(bbox1)
- area2 = caculate_bbox_area(bbox2)
- # Calculate the overlap ratio using a helper function
- overlap_ratio = calculate_overlap_ratio(bbox1, bbox2, mode="small")
- # Check if the overlap ratio exceeds the threshold
- if overlap_ratio > ratio:
- if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
- return 1
- else:
- return 2
- return None
- def remove_overlap_blocks(
- blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
- ) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
- """
- Remove overlapping blocks based on a specified overlap ratio threshold.
- Args:
- blocks (List[Dict[str, List[int]]]): List of block dictionaries, each containing a 'block_bbox' key.
- threshold (float): Ratio threshold to determine significant overlap.
- smaller (bool): If True, the smaller block in overlap is removed.
- Returns:
- Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
- A tuple containing the updated list of blocks and a list of dropped blocks.
- """
- dropped_indexes = set()
- blocks = deepcopy(blocks)
- overlap_image_blocks = []
- # Iterate over each pair of blocks to find overlaps
- for i, block1 in enumerate(blocks["boxes"]):
- for j in range(i + 1, len(blocks["boxes"])):
- block2 = blocks["boxes"][j]
- # Skip blocks that are already marked for removal
- if i in dropped_indexes or j in dropped_indexes:
- continue
- # Check for overlap and determine which block to remove
- overlap_box_index = _get_minbox_if_overlap_by_ratio(
- block1["coordinate"],
- block2["coordinate"],
- threshold,
- smaller=smaller,
- )
- if overlap_box_index is not None:
- is_block1_image = block1["label"] == "image"
- is_block2_image = block2["label"] == "image"
- if is_block1_image != is_block2_image:
- # 如果只有一个块在视觉标签中,删除在视觉标签中的那个块
- drop_index = i if is_block1_image else j
- overlap_image_blocks.append(blocks["boxes"][drop_index])
- else:
- # 如果两个块都在或都不在视觉标签中,根据 overlap_box_index 决定删除哪个块
- drop_index = i if overlap_box_index == 1 else j
- dropped_indexes.add(drop_index)
- # Remove marked blocks from the original list
- for index in sorted(dropped_indexes, reverse=True):
- del blocks["boxes"][index]
- return blocks
- def get_bbox_intersection(bbox1, bbox2, return_format="bbox"):
- """
- Compute the intersection of two bounding boxes, supporting both 4-coordinate and 8-coordinate formats.
- Args:
- bbox1 (tuple): The first bounding box, either in 4-coordinate format (x_min, y_min, x_max, y_max)
- or 8-coordinate format (x1, y1, x2, y2, x3, y3, x4, y4).
- bbox2 (tuple): The second bounding box in the same format as bbox1.
- return_format (str): The format of the output intersection, either 'bbox' or 'poly'.
- Returns:
- tuple or None: The intersection bounding box in the specified format, or None if there is no intersection.
- """
- bbox1 = np.array(bbox1)
- bbox2 = np.array(bbox2)
- # Convert both bounding boxes to rectangles
- rect1 = bbox1 if len(bbox1.shape) == 1 else convert_points_to_boxes([bbox1])[0]
- rect2 = bbox2 if len(bbox2.shape) == 1 else convert_points_to_boxes([bbox2])[0]
- # Calculate the intersection rectangle
- x_min_inter = max(rect1[0], rect2[0])
- y_min_inter = max(rect1[1], rect2[1])
- x_max_inter = min(rect1[2], rect2[2])
- y_max_inter = min(rect1[3], rect2[3])
- # Check if there is an intersection
- if x_min_inter >= x_max_inter or y_min_inter >= y_max_inter:
- return None
- if return_format == "bbox":
- return np.array([x_min_inter, y_min_inter, x_max_inter, y_max_inter])
- elif return_format == "poly":
- return np.array(
- [
- [x_min_inter, y_min_inter],
- [x_max_inter, y_min_inter],
- [x_max_inter, y_max_inter],
- [x_min_inter, y_max_inter],
- ],
- dtype=np.int16,
- )
- else:
- raise ValueError("return_format must be either 'bbox' or 'poly'.")
- def shrink_supplement_region_bbox(
- supplement_region_bbox,
- ref_region_bbox,
- image_width,
- image_height,
- block_idxes_set,
- block_bboxes,
- ) -> List:
- """
- Shrink the supplement region bbox according to the reference region bbox and match the block bboxes.
- Args:
- supplement_region_bbox (list): The supplement region bbox.
- ref_region_bbox (list): The reference region bbox.
- image_width (int): The width of the image.
- image_height (int): The height of the image.
- block_idxes_set (set): The indexes of the blocks that intersect with the region bbox.
- block_bboxes (dict): The dictionary of block bboxes.
- Returns:
- list: The new region bbox and the matched block idxes.
- """
- x1, y1, x2, y2 = supplement_region_bbox
- x1_prime, y1_prime, x2_prime, y2_prime = ref_region_bbox
- index_conversion_map = {0: 2, 1: 3, 2: 0, 3: 1}
- edge_distance_list = [
- (x1_prime - x1) / image_width,
- (y1_prime - y1) / image_height,
- (x2 - x2_prime) / image_width,
- (y2 - y2_prime) / image_height,
- ]
- edge_distance_list_tmp = edge_distance_list[:]
- min_distance = min(edge_distance_list)
- src_index = index_conversion_map[edge_distance_list.index(min_distance)]
- if len(block_idxes_set) == 0:
- return supplement_region_bbox, []
- for _ in range(3):
- dst_index = index_conversion_map[src_index]
- tmp_region_bbox = supplement_region_bbox[:]
- tmp_region_bbox[dst_index] = ref_region_bbox[src_index]
- iner_block_idxes, split_block_idxes = [], []
- for block_idx in block_idxes_set:
- overlap_ratio = calculate_overlap_ratio(
- tmp_region_bbox, block_bboxes[block_idx], mode="small"
- )
- if overlap_ratio > REGION_SETTINGS.get(
- "match_block_overlap_ratio_threshold", 0.8
- ):
- iner_block_idxes.append(block_idx)
- elif overlap_ratio > REGION_SETTINGS.get(
- "split_block_overlap_ratio_threshold", 0.4
- ):
- split_block_idxes.append(block_idx)
- if len(iner_block_idxes) > 0:
- if len(split_block_idxes) > 0:
- for split_block_idx in split_block_idxes:
- split_block_bbox = block_bboxes[split_block_idx]
- x1, y1, x2, y2 = tmp_region_bbox
- x1_prime, y1_prime, x2_prime, y2_prime = split_block_bbox
- edge_distance_list = [
- (x1_prime - x1) / image_width,
- (y1_prime - y1) / image_height,
- (x2 - x2_prime) / image_width,
- (y2 - y2_prime) / image_height,
- ]
- max_distance = max(edge_distance_list)
- src_index = edge_distance_list.index(max_distance)
- dst_index = index_conversion_map[src_index]
- tmp_region_bbox[dst_index] = split_block_bbox[src_index]
- tmp_region_bbox, iner_idxes = shrink_supplement_region_bbox(
- tmp_region_bbox,
- ref_region_bbox,
- image_width,
- image_height,
- iner_block_idxes,
- block_bboxes,
- )
- if len(iner_idxes) == 0:
- continue
- matched_bboxes = [block_bboxes[idx] for idx in iner_block_idxes]
- supplement_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
- break
- else:
- edge_distance_list_tmp = [
- x for x in edge_distance_list_tmp if x != min_distance
- ]
- min_distance = min(edge_distance_list_tmp)
- src_index = index_conversion_map[edge_distance_list.index(min_distance)]
- return supplement_region_bbox, iner_block_idxes
- def update_region_box(bbox, region_box):
- if region_box is None:
- return bbox
- x1, y1, x2, y2 = bbox
- x1_region, y1_region, x2_region, y2_region = region_box
- x1_region = int(min(x1, x1_region))
- y1_region = int(min(y1, y1_region))
- x2_region = int(max(x2, x2_region))
- y2_region = int(max(y2, y2_region))
- region_box = [x1_region, y1_region, x2_region, y2_region]
- return region_box
- def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
- for formula_res in formula_res_list:
- x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
- poly_points = [
- (x_min, y_min),
- (x_max, y_min),
- (x_max, y_max),
- (x_min, y_max),
- ]
- ocr_res["dt_polys"].append(poly_points)
- formula_res_text: str = formula_res["rec_formula"]
- if formula_res_text.startswith("$$") and formula_res_text.endswith("$$"):
- formula_res_text = formula_res_text[2:-2]
- elif formula_res_text.startswith("$") and formula_res_text.endswith("$"):
- formula_res_text = formula_res_text[1:-1]
- ocr_res["rec_texts"].append(formula_res_text)
- if ocr_res["rec_boxes"].size == 0:
- ocr_res["rec_boxes"] = np.array(formula_res["dt_polys"])
- else:
- ocr_res["rec_boxes"] = np.vstack(
- (ocr_res["rec_boxes"], [formula_res["dt_polys"]])
- )
- ocr_res["rec_labels"].append("formula")
- ocr_res["rec_polys"].append(poly_points)
- ocr_res["rec_scores"].append(1)
- def caculate_bbox_area(bbox):
- x1, y1, x2, y2 = map(float, bbox)
- area = abs((x2 - x1) * (y2 - y1))
- return area
- def get_show_color(label: str, order_label=False) -> Tuple:
- if order_label:
- label_colors = {
- "doc_title": (255, 248, 220, 100), # Cornsilk
- "doc_title_text": (255, 239, 213, 100),
- "paragraph_title": (102, 102, 255, 100),
- "sub_paragraph_title": (102, 178, 255, 100),
- "vision": (153, 255, 51, 100),
- "vision_title": (144, 238, 144, 100), # Light Green
- "vision_footnote": (144, 238, 144, 100), # Light Green
- "normal_text": (153, 0, 76, 100),
- "cross_layout": (53, 218, 207, 100), # Thistle
- "cross_reference": (221, 160, 221, 100), # Floral White
- }
- else:
- label_colors = {
- # Medium Blue (from 'titles_list')
- "paragraph_title": (102, 102, 255, 100),
- "doc_title": (255, 248, 220, 100), # Cornsilk
- # Light Yellow (from 'tables_caption_list')
- "table_title": (255, 255, 102, 100),
- # Sky Blue (from 'imgs_caption_list')
- "figure_title": (102, 178, 255, 100),
- "chart_title": (221, 160, 221, 100), # Plum
- "vision_footnote": (144, 238, 144, 100), # Light Green
- # Deep Purple (from 'texts_list')
- "text": (153, 0, 76, 100),
- # Bright Green (from 'interequations_list')
- "formula": (0, 255, 0, 100),
- "abstract": (255, 239, 213, 100), # Papaya Whip
- # Medium Green (from 'lists_list' and 'indexs_list')
- "content": (40, 169, 92, 100),
- # Neutral Gray (from 'dropped_bbox_list')
- "seal": (158, 158, 158, 100),
- # Olive Yellow (from 'tables_body_list')
- "table": (204, 204, 0, 100),
- # Bright Green (from 'imgs_body_list')
- "image": (153, 255, 51, 100),
- # Bright Green (from 'imgs_body_list')
- "figure": (153, 255, 51, 100),
- "chart": (216, 191, 216, 100), # Thistle
- # Pale Yellow-Green (from 'tables_footnote_list')
- "reference": (229, 255, 204, 100),
- # "reference_content": (229, 255, 204, 100),
- "algorithm": (255, 250, 240, 100), # Floral White
- }
- default_color = (158, 158, 158, 100)
- return label_colors.get(label, default_color)
|