| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- """
- 布局处理工具模块
- 提供布局相关处理功能:
- - 重叠框检测与去重
- - 阅读顺序排序
- - IoU/重叠比例计算
- """
- from typing import Dict, List, Any
- from loguru import logger
- # 导入 MinerU 组件
- try:
- from mineru.utils.boxbase import calculate_iou, calculate_overlap_area_2_minbox_area_ratio
- MINERU_AVAILABLE = True
- except ImportError:
- MINERU_AVAILABLE = False
- calculate_iou = None
- calculate_overlap_area_2_minbox_area_ratio = None
class LayoutUtils:
    """Static helpers for post-processing layout-detection results.

    Provides:
    - IoU and overlap-ratio computation for ``[x1, y1, x2, y2]`` boxes
    - de-duplication of overlapping layout boxes
    - reading-order sorting of layout elements
    """

    @staticmethod
    def _box_area(bbox: List[float]) -> float:
        """Return the area of an ``[x1, y1, x2, y2]`` box."""
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    @staticmethod
    def _intersection_area(bbox1: List[float], bbox2: List[float]) -> float:
        """Return the intersection area of two boxes (``0.0`` if disjoint)."""
        x_left = max(bbox1[0], bbox2[0])
        y_top = max(bbox1[1], bbox2[1])
        x_right = min(bbox1[2], bbox2[2])
        y_bottom = min(bbox1[3], bbox2[3])

        if x_right < x_left or y_bottom < y_top:
            return 0.0
        return (x_right - x_left) * (y_bottom - y_top)

    @staticmethod
    def _bbox_of(item: Dict[str, Any]) -> List[float]:
        """Return the item's bbox; a missing or ``None`` bbox becomes a zero box."""
        bbox = item.get('bbox')
        # Explicit ``is None`` test (not ``or``) so array-like boxes whose
        # truth value is ambiguous are passed through untouched.
        return [0, 0, 0, 0] if bbox is None else bbox

    @staticmethod
    def _score_of(item: Dict[str, Any]) -> float:
        """Return ``confidence``, else ``score``, else 0, skipping ``None`` values."""
        for key in ('confidence', 'score'):
            value = item.get(key)
            if value is not None:
                return value
        return 0

    @staticmethod
    def calculate_iou(bbox1: List[float], bbox2: List[float]) -> float:
        """
        Compute the IoU (intersection over union) of two boxes.

        Delegates to the module-level ``calculate_iou`` when MinerU was
        imported successfully; otherwise uses a built-in fallback.

        Args:
            bbox1: First box as [x1, y1, x2, y2].
            bbox2: Second box as [x1, y1, x2, y2].

        Returns:
            The IoU value. The fallback returns 0.0 when the boxes do not
            overlap or either box has zero area.
        """
        # Inside a method body the bare name resolves to the module-level
        # ``calculate_iou`` (class scope is not searched), not to this method.
        if MINERU_AVAILABLE and calculate_iou is not None:
            return calculate_iou(bbox1, bbox2)

        # Fallback implementation
        intersection = LayoutUtils._intersection_area(bbox1, bbox2)
        if intersection <= 0:
            return 0.0

        area1 = LayoutUtils._box_area(bbox1)
        area2 = LayoutUtils._box_area(bbox2)
        # Defensive guard against a zero denominator.
        if area1 == 0 or area2 == 0:
            return 0.0

        return intersection / float(area1 + area2 - intersection)

    @staticmethod
    def calculate_overlap_ratio(bbox1: List[float], bbox2: List[float]) -> float:
        """
        Compute the intersection area divided by the smaller box's area.

        Delegates to the module-level
        ``calculate_overlap_area_2_minbox_area_ratio`` when MinerU was
        imported successfully; otherwise uses a built-in fallback.

        Args:
            bbox1: First box as [x1, y1, x2, y2].
            bbox2: Second box as [x1, y1, x2, y2].

        Returns:
            The overlap ratio. The fallback returns 0.0 when the boxes do
            not overlap or the smaller box has zero area.
        """
        if MINERU_AVAILABLE and calculate_overlap_area_2_minbox_area_ratio is not None:
            return calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)

        # Fallback implementation
        intersection = LayoutUtils._intersection_area(bbox1, bbox2)
        if intersection <= 0:
            return 0.0

        min_area = min(LayoutUtils._box_area(bbox1), LayoutUtils._box_area(bbox2))
        if min_area == 0:
            return 0.0

        return intersection / min_area

    @staticmethod
    def remove_overlapping_boxes(
        layout_results: List[Dict[str, Any]],
        iou_threshold: float = 0.8,
        overlap_ratio_threshold: float = 0.8
    ) -> List[Dict[str, Any]]:
        """
        Remove duplicate / nested layout boxes.

        Every pair of boxes is examined in list order:
        1. IoU above ``iou_threshold``: the box with the lower score is
           dropped (``confidence`` is preferred, then ``score``, then 0;
           on a tie the earlier box is kept).
        2. Otherwise, overlap ratio above ``overlap_ratio_threshold``: the
           smaller box is dropped and the larger box's bbox is expanded to
           the union of both.

        Element type is not consulted; boxes are merged regardless of
        their category.

        Args:
            layout_results: Layout detection results; each item is a dict
                with a ``bbox`` ([x1, y1, x2, y2]) and optionally
                ``confidence`` / ``score``.
            iou_threshold: IoU above which two boxes count as duplicates.
            overlap_ratio_threshold: Overlap ratio (relative to the smaller
                box) above which the smaller box is absorbed.

        Returns:
            The surviving items as shallow copies, in their original order.
            The input dicts are not modified. An empty, ``None`` or
            single-item input is returned as is.
        """
        if not layout_results or len(layout_results) <= 1:
            return layout_results

        # Shallow-copy each item so bbox replacement never touches the input.
        results = [item.copy() for item in layout_results]
        removed = set()
        count = len(results)

        for i in range(count):
            if i in removed:
                continue

            for j in range(i + 1, count):
                if j in removed:
                    continue

                # Re-read box i on every pass: it may have been expanded by
                # an earlier containment merge in this inner loop.
                bbox1 = LayoutUtils._bbox_of(results[i])
                bbox2 = LayoutUtils._bbox_of(results[j])
                if len(bbox1) < 4 or len(bbox2) < 4:
                    continue

                if LayoutUtils.calculate_iou(bbox1, bbox2) > iou_threshold:
                    # Near-duplicates: keep the higher-scoring box.
                    if LayoutUtils._score_of(results[i]) >= LayoutUtils._score_of(results[j]):
                        removed.add(j)
                        continue
                    removed.add(i)
                    break  # i is gone; stop comparing it

                overlap_ratio = LayoutUtils.calculate_overlap_ratio(bbox1, bbox2)
                if overlap_ratio <= overlap_ratio_threshold:
                    continue

                # Containment: the smaller box is absorbed by the larger one.
                if LayoutUtils._box_area(bbox1) <= LayoutUtils._box_area(bbox2):
                    small_idx, large_idx = i, j
                else:
                    small_idx, large_idx = j, i

                # The union is symmetric, so it can be built from the two
                # local boxes directly. A new list is assigned, so the
                # caller's bbox list is never mutated in place.
                results[large_idx]['bbox'] = [
                    min(bbox1[0], bbox2[0]),
                    min(bbox1[1], bbox2[1]),
                    max(bbox1[2], bbox2[2]),
                    max(bbox1[3], bbox2[3]),
                ]
                removed.add(small_idx)

                if small_idx == i:
                    break  # i is gone; stop comparing it

        return [item for idx, item in enumerate(results) if idx not in removed]

    @staticmethod
    def sort_elements_by_reading_order(
        elements: List[Dict[str, Any]],
        y_tolerance: float = 15.0
    ) -> List[Dict[str, Any]]:
        """
        Sort elements into reading order and tag each with ``reading_order``.

        Rules:
        1. Elements are grouped into rows by the top Y coordinate; an
           element joins the current row when its top is within
           ``y_tolerance`` of the row's first element.
        2. Within a row, elements are ordered left to right by X.
        3. Rows are ordered top to bottom.

        Elements whose ``bbox`` is missing, ``None`` or shorter than four
        values are treated as positioned at (0, 0).

        Args:
            elements: Elements to sort; each is a dict with a ``bbox``
                ([x1, y1, x2, y2]).
            y_tolerance: Maximum difference in top Y for two elements to
                be considered part of the same row.

        Returns:
            A new list with the same element dicts in reading order. Each
            dict is modified in place to carry a 0-based ``reading_order``.
            An empty input is returned as is.
        """
        if not elements:
            return elements

        # Pair every element with its (top, left) sort coordinates.
        decorated = []
        for elem in elements:
            bbox = elem.get('bbox')
            if bbox is not None and len(bbox) >= 4:
                y_top, x_left = bbox[1], bbox[0]
            else:
                y_top, x_left = 0, 0
            decorated.append((elem, y_top, x_left))

        # Top-to-bottom first, so rows can be built in a single pass.
        decorated.sort(key=lambda entry: (entry[1], entry[2]))

        rows = []
        row_anchor_y = None
        for elem, y_top, x_left in decorated:
            # A row is anchored at the top Y of its first element.
            if row_anchor_y is None or abs(y_top - row_anchor_y) > y_tolerance:
                rows.append([])
                row_anchor_y = y_top
            rows[-1].append((elem, x_left))

        sorted_elements = []
        for row in rows:
            # Stable sort: equal X keeps the top-to-bottom order from above.
            row.sort(key=lambda pair: pair[1])
            sorted_elements.extend(elem for elem, _ in row)

        for order, elem in enumerate(sorted_elements):
            elem['reading_order'] = order

        logger.debug(f"📖 Elements sorted by reading order: {len(sorted_elements)} elements")
        return sorted_elements
|