6 месяцев назад · 66103ab214
--- a/ocr_tools/universal_doc_parser/models/adapters/dit_layout_adapter.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/dit_layout_adapter.py
@@ -0,0 +1,733 @@
 
															+"""DiT Layout Detector 适配器
														
 
															+
														
 
															+基于 DiT (Document Image Transformer) 的布局检测适配器，参考 docling_layout_adapter 的实现方式。
														
 
															+支持 PubLayNet 数据集的 5 个类别：text, title, list, table, figure。
														
 
															+
														
 
															+支持的配置：
														
 
															+- config_file: DiT 配置文件路径
														
 
															+- model_weights: 模型权重路径或 URL
														
 
															+- device: 运行设备 ('cpu', 'cuda', 'mps')
														
 
															+- conf: 置信度阈值 (默认 0.3)
														
 
															+- remove_overlap: 是否启用重叠框处理 (默认 True)
														
 
															+- iou_threshold: IoU 阈值 (默认 0.8)
														
 
															+- overlap_ratio_threshold: 重叠比例阈值 (默认 0.8)
														
 
															+"""
														
 
															+
														
 
															+import cv2
														
 
															+import numpy as np
														
 
															+import threading
														
 
															+from pathlib import Path
														
 
															+from typing import Dict, List, Union, Any, Optional
														
 
															+from PIL import Image
														
 
															+
														
 
															+try:
														
 
															+    from .base import BaseLayoutDetector
														
 
															+except ImportError:
														
 
															+    from base import BaseLayoutDetector
														
 
															+
														
 
															+# 全局锁，防止模型初始化时的线程问题
														
 
															+_model_init_lock = threading.Lock()
														
 
															+
														
 
															+
														
 
															+class LayoutUtils:
														
 
															+    """布局处理工具类（简化版，不依赖 external 模块）"""
														
 
															+    
														
 
															+    @staticmethod
														
 
															+    def calculate_iou(bbox1: List[float], bbox2: List[float]) -> float:
														
 
															+        """计算两个 bbox 的 IoU（交并比）"""
														
 
															+        x1_1, y1_1, x2_1, y2_1 = bbox1
														
 
															+        x1_2, y1_2, x2_2, y2_2 = bbox2
														
 
															+        
														
 
															+        # 计算交集
														
 
															+        x1_i = max(x1_1, x1_2)
														
 
															+        y1_i = max(y1_1, y1_2)
														
 
															+        x2_i = min(x2_1, x2_2)
														
 
															+        y2_i = min(y2_1, y2_2)
														
 
															+        
														
 
															+        if x2_i <= x1_i or y2_i <= y1_i:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        intersection = (x2_i - x1_i) * (y2_i - y1_i)
														
 
															+        
														
 
															+        # 计算并集
														
 
															+        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
														
 
															+        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
														
 
															+        union = area1 + area2 - intersection
														
 
															+        
														
 
															+        if union == 0:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        return intersection / union
														
 
															+    
														
 
															+    @staticmethod
														
 
															+    def calculate_overlap_ratio(bbox1: List[float], bbox2: List[float]) -> float:
														
 
															+        """计算重叠面积占小框面积的比例"""
														
 
															+        x1_1, y1_1, x2_1, y2_1 = bbox1
														
 
															+        x1_2, y1_2, x2_2, y2_2 = bbox2
														
 
															+        
														
 
															+        # 计算交集
														
 
															+        x1_i = max(x1_1, x1_2)
														
 
															+        y1_i = max(y1_1, y1_2)
														
 
															+        x2_i = min(x2_1, x2_2)
														
 
															+        y2_i = min(y2_1, y2_2)
														
 
															+        
														
 
															+        if x2_i <= x1_i or y2_i <= y1_i:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        intersection = (x2_i - x1_i) * (y2_i - y1_i)
														
 
															+        
														
 
															+        # 计算两个框的面积
														
 
															+        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
														
 
															+        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
														
 
															+        
														
 
															+        # 返回交集占小框面积的比例
														
 
															+        min_area = min(area1, area2)
														
 
															+        if min_area == 0:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        return intersection / min_area
														
 
															+    
														
 
															+    @staticmethod
														
 
															+    def remove_overlapping_boxes(
														
 
															+        layout_results: List[Dict[str, Any]],
														
 
															+        iou_threshold: float = 0.8,
														
 
															+        overlap_ratio_threshold: float = 0.8
														
 
															+    ) -> List[Dict[str, Any]]:
														
 
															+        """
														
 
															+        处理重叠的布局框（参考 MinerU 的去重策略）
														
 
															+        
														
 
															+        策略：
														
 
															+        1. 高 IoU 重叠：保留置信度高的框
														
 
															+        2. 包含关系：小框被大框高度包含时，保留大框并扩展边界
														
 
															+        
														
 
															+        Args:
														
 
															+            layout_results: Layout 检测结果列表
														
 
															+            iou_threshold: IoU 阈值，超过此值认为高度重叠
														
 
															+            overlap_ratio_threshold: 重叠面积占小框面积的比例阈值
														
 
															+            
														
 
															+        Returns:
														
 
															+            去重后的布局结果列表
														
 
															+        """
														
 
															+        if not layout_results or len(layout_results) <= 1:
														
 
															+            return layout_results
														
 
															+        
														
 
															+        # 复制列表避免修改原数据
														
 
															+        results = [item.copy() for item in layout_results]
														
 
															+        need_remove = set()
														
 
															+        
														
 
															+        for i in range(len(results)):
														
 
															+            if i in need_remove:
														
 
															+                continue
														
 
															+                
														
 
															+            for j in range(i + 1, len(results)):
														
 
															+                if j in need_remove:
														
 
															+                    continue
														
 
															+                
														
 
															+                bbox1 = results[i].get('bbox', [0, 0, 0, 0])
														
 
															+                bbox2 = results[j].get('bbox', [0, 0, 0, 0])
														
 
															+                
														
 
															+                if len(bbox1) < 4 or len(bbox2) < 4:
														
 
															+                    continue
														
 
															+                
														
 
															+                # 计算 IoU
														
 
															+                iou = LayoutUtils.calculate_iou(bbox1, bbox2)
														
 
															+                
														
 
															+                if iou > iou_threshold:
														
 
															+                    # 高度重叠，保留置信度高的
														
 
															+                    score1 = results[i].get('confidence', results[i].get('score', 0))
														
 
															+                    score2 = results[j].get('confidence', results[j].get('score', 0))
														
 
															+                    
														
 
															+                    if score1 >= score2:
														
 
															+                        need_remove.add(j)
														
 
															+                    else:
														
 
															+                        need_remove.add(i)
														
 
															+                        break  # i 被移除，跳出内层循环
														
 
															+                else:
														
 
															+                    # 检查包含关系
														
 
															+                    overlap_ratio = LayoutUtils.calculate_overlap_ratio(bbox1, bbox2)
														
 
															+                    
														
 
															+                    if overlap_ratio > overlap_ratio_threshold:
														
 
															+                        # 小框被大框高度包含
														
 
															+                        area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
														
 
															+                        area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
														
 
															+                        
														
 
															+                        if area1 <= area2:
														
 
															+                            small_idx, large_idx = i, j
														
 
															+                        else:
														
 
															+                            small_idx, large_idx = j, i
														
 
															+                        
														
 
															+                        # 扩展大框的边界
														
 
															+                        small_bbox = results[small_idx]['bbox']
														
 
															+                        large_bbox = results[large_idx]['bbox']
														
 
															+                        results[large_idx]['bbox'] = [
														
 
															+                            min(small_bbox[0], large_bbox[0]),
														
 
															+                            min(small_bbox[1], large_bbox[1]),
														
 
															+                            max(small_bbox[2], large_bbox[2]),
														
 
															+                            max(small_bbox[3], large_bbox[3])
														
 
															+                        ]
														
 
															+                        need_remove.add(small_idx)
														
 
															+                        
														
 
															+                        if small_idx == i:
														
 
															+                            break  # i 被移除，跳出内层循环
														
 
															+        
														
 
															+        # 返回去重后的结果
														
 
															+        return [results[i] for i in range(len(results)) if i not in need_remove]
														
 
															+
														
 
															+
														
 
															+class DitLayoutDetector(BaseLayoutDetector):
														
 
															+    """DiT Layout Detector 适配器
														
 
															+    
														
 
															+    基于 DiT (Document Image Transformer) 的布局检测器，使用 detectron2 + DiT backbone。
														
 
															+    支持 PubLayNet 数据集的布局检测。
														
 
															+    """
														
 
															+    
														
 
															+    # DiT/PubLayNet 原始类别定义
														
 
															+    DIT_LABELS = {
														
 
															+        0: 'text',
														
 
															+        1: 'title',
														
 
															+        2: 'list',
														
 
															+        3: 'table',
														
 
															+        4: 'figure',
														
 
															+    }
														
 
															+    
														
 
															+    # 类别映射：PubLayNet → MinerU/EnhancedDocPipeline 类别体系
														
 
															+    # 参考：
														
 
															+    # - Pipeline: universal_doc_parser/core/pipeline_manager_v2.py (EnhancedDocPipeline 类别定义)
														
 
															+    CATEGORY_MAP = {
														
 
															+        'text': 'text',                    # Text -> text (TEXT_CATEGORIES)
														
 
															+        'title': 'title',                  # Title -> title (TEXT_CATEGORIES)
														
 
															+        'list': 'text',                    # List-item -> text (TEXT_CATEGORIES)
														
 
															+        'table': 'table_body',             # Table -> table_body (TABLE_BODY_CATEGORIES)
														
 
															+        'figure': 'image_body',            # Figure -> image_body (IMAGE_BODY_CATEGORIES)
														
 
															+    }
														
 
															+    
														
 
															+    def __init__(self, config: Dict[str, Any]):
														
 
															+        """
														
 
															+        初始化 DiT Layout 检测器
														
 
															+        
														
 
															+        Args:
														
 
															+            config: 配置字典，支持以下参数：
														
 
															+                - config_file: DiT 配置文件路径（默认使用 cascade_dit_large.yaml）
														
 
															+                - model_weights: 模型权重路径或 URL
														
 
															+                - device: 运行设备 ('cpu', 'cuda', 'mps')
														
 
															+                - conf: 置信度阈值 (默认 0.3)
														
 
															+                - remove_overlap: 是否启用重叠框处理 (默认 True)
														
 
															+                - iou_threshold: IoU 阈值 (默认 0.8)
														
 
															+                - overlap_ratio_threshold: 重叠比例阈值 (默认 0.8)
														
 
															+        """
														
 
															+        super().__init__(config)
														
 
															+        self.predictor = None
														
 
															+        self.cfg = None
														
 
															+        self._device = None
														
 
															+        self._threshold = 0.3
														
 
															+        self._remove_overlap = True
														
 
															+        self._iou_threshold = 0.8
														
 
															+        self._overlap_ratio_threshold = 0.8
														
 
															+    
														
 
															+    def initialize(self):
														
 
															+        """初始化模型"""
														
 
															+        import os
														
 
															+        import sys
														
 
															+        
														
 
															+        try:
														
 
															+            import torch
														
 
															+            from detectron2.config import get_cfg
														
 
															+            from detectron2.engine import DefaultPredictor
														
 
															+            from detectron2.data import MetadataCatalog
														
 
															+            
														
 
															+            # PyTorch 2.6+ 兼容性修复
														
 
															+            if hasattr(torch, '__version__'):
														
 
															+                torch_version = tuple(map(int, torch.__version__.split('.')[:2]))
														
 
															+                if torch_version >= (2, 6):
														
 
															+                    _original_torch_load = torch.load
														
 
															+                    def _patched_torch_load(f, map_location=None, pickle_module=None, 
														
 
															+                                            weights_only=None, **kwargs):
														
 
															+                        if weights_only is None:
														
 
															+                            weights_only = False
														
 
															+                        return _original_torch_load(f, map_location=map_location, 
														
 
															+                                                  pickle_module=pickle_module,
														
 
															+                                                  weights_only=weights_only, **kwargs)
														
 
															+                    torch.load = _patched_torch_load
														
 
															+            
														
 
															+            # 添加 dit_support 路径（适配到 universal_doc_parser）
														
 
															+            current_dir = os.path.dirname(os.path.abspath(__file__))
														
 
															+            dit_support_path = os.path.join(current_dir, '..', 'dit_support')
														
 
															+            if dit_support_path not in sys.path:
														
 
															+                sys.path.insert(0, dit_support_path)
														
 
															+            
														
 
															+            from ditod import add_vit_config
														
 
															+            
														
 
															+            # 获取配置参数
														
 
															+            config_file = self.config.get(
														
 
															+                'config_file',
														
 
															+                os.path.join(current_dir, '..', 'dit_support', 'configs',
														
 
															+                           'cascade', 'cascade_dit_large.yaml')
														
 
															+            )
														
 
															+            model_weights = self.config.get(
														
 
															+                'model_weights',
														
 
															+                'https://huggingface.co/HYPJUDY/dit/resolve/main/dit-fts/publaynet_dit-l_cascade.pth'
														
 
															+            )
														
 
															+            device = self.config.get('device', 'cpu')
														
 
															+            self._threshold = self.config.get('conf', 0.3)
														
 
															+            self._remove_overlap = self.config.get('remove_overlap', True)
														
 
															+            self._iou_threshold = self.config.get('iou_threshold', 0.8)
														
 
															+            self._overlap_ratio_threshold = self.config.get('overlap_ratio_threshold', 0.8)
														
 
															+            
														
 
															+            # 设置设备
														
 
															+            self._device = torch.device(device)
														
 
															+            
														
 
															+            # 验证配置文件存在
														
 
															+            if not os.path.exists(config_file):
														
 
															+                raise FileNotFoundError(f"Config file not found: {config_file}")
														
 
															+            
														
 
															+            # 加载配置
														
 
															+            self.cfg = get_cfg()
														
 
															+            add_vit_config(self.cfg)
														
 
															+            self.cfg.merge_from_file(config_file)
														
 
															+            self.cfg.merge_from_list(["MODEL.WEIGHTS", model_weights])
														
 
															+            self.cfg.MODEL.DEVICE = str(self._device)
														
 
															+            
														
 
															+            # 设置元数据
														
 
															+            dataset_name = self.cfg.DATASETS.TEST[0]
														
 
															+            md = MetadataCatalog.get(dataset_name)
														
 
															+            if dataset_name == 'icdar2019_test':
														
 
															+                md.set(thing_classes=["table"])
														
 
															+            else:
														
 
															+                md.set(thing_classes=["text", "title", "list", "table", "figure"])
														
 
															+            
														
 
															+            # 创建预测器（使用锁防止线程问题）
														
 
															+            with _model_init_lock:
														
 
															+                self.predictor = DefaultPredictor(self.cfg)
														
 
															+            
														
 
															+            print(f"✅ DiT Layout Detector initialized")
														
 
															+            print(f"   - Config: {config_file}")
														
 
															+            print(f"   - Device: {self._device}")
														
 
															+            print(f"   - Threshold: {self._threshold}")
														
 
															+            print(f"   - Remove overlap: {self._remove_overlap}")
														
 
															+            
														
 
															+        except ImportError as e:
														
 
															+            print(f"❌ Failed to import required libraries: {e}")
														
 
															+            print("   Please ensure detectron2 and ditod are installed")
														
 
															+            raise
														
 
															+        except Exception as e:
														
 
															+            print(f"❌ Failed to initialize DiT Layout Detector: {e}")
														
 
															+            raise
														
 
															+    
														
 
															+    def cleanup(self):
														
 
															+        """清理资源"""
														
 
															+        self.predictor = None
														
 
															+        self.cfg = None
														
 
															+        self._device = None
														
 
															+    
														
 
															+    def detect(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
														
 
															+        """
														
 
															+        检测布局
														
 
															+        
														
 
															+        Args:
														
 
															+            image: 输入图像 (numpy数组或PIL图像)
														
 
															+            
														
 
															+        Returns:
														
 
															+            检测结果列表，每个元素包含:
														
 
															+            - category: MinerU类别名称
														
 
															+            - bbox: [x1, y1, x2, y2]
														
 
															+            - confidence: 置信度
														
 
															+            - raw: 原始检测结果
														
 
															+        """
														
 
															+        if self.predictor is None:
														
 
															+            raise RuntimeError("Model not initialized. Call initialize() first.")
														
 
															+        
														
 
															+        # 转换为 numpy 数组 (BGR 格式)
														
 
															+        if isinstance(image, Image.Image):
														
 
															+            image = np.array(image)
														
 
															+            if len(image.shape) == 3 and image.shape[2] == 3:
														
 
															+                # PIL RGB -> OpenCV BGR
														
 
															+                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
														
 
															+        
														
 
															+        # 确保是 BGR 格式
														
 
															+        if isinstance(image, np.ndarray):
														
 
															+            if len(image.shape) == 2:
														
 
															+                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
														
 
															+            elif len(image.shape) == 3 and image.shape[2] == 3:
														
 
															+                # 假设是 RGB，转换为 BGR
														
 
															+                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) if image.dtype == np.uint8 else image
														
 
															+        
														
 
															+        orig_h, orig_w = image.shape[:2]
														
 
															+        
														
 
															+        # 运行推理
														
 
															+        outputs = self.predictor(image)
														
 
															+        instances = outputs["instances"]
														
 
															+        
														
 
															+        # 解析结果
														
 
															+        formatted_results = []
														
 
															+        for i in range(len(instances)):
														
 
															+            score = float(instances.scores[i].cpu().item())
														
 
															+            
														
 
															+            # 过滤低置信度
														
 
															+            if score < self._threshold:
														
 
															+                continue
														
 
															+            
														
 
															+            # 获取类别
														
 
															+            class_id = int(instances.pred_classes[i].cpu().item())
														
 
															+            original_label = self.DIT_LABELS.get(class_id, f'unknown_{class_id}')
														
 
															+            
														
 
															+            # 映射到 MinerU 类别
														
 
															+            mineru_category = self.CATEGORY_MAP.get(original_label, 'text')
														
 
															+            
														
 
															+            # 提取边界框
														
 
															+            bbox_tensor = instances.pred_boxes[i].tensor[0].cpu().numpy()
														
 
															+            x1 = max(0, min(orig_w, float(bbox_tensor[0])))
														
 
															+            y1 = max(0, min(orig_h, float(bbox_tensor[1])))
														
 
															+            x2 = max(0, min(orig_w, float(bbox_tensor[2])))
														
 
															+            y2 = max(0, min(orig_h, float(bbox_tensor[3])))
														
 
															+            
														
 
															+            bbox = [int(x1), int(y1), int(x2), int(y2)]
														
 
															+            
														
 
															+            # 计算宽高
														
 
															+            width = bbox[2] - bbox[0]
														
 
															+            height = bbox[3] - bbox[1]
														
 
															+            
														
 
															+            # 过滤太小的框
														
 
															+            if width < 10 or height < 10:
														
 
															+                continue
														
 
															+            
														
 
															+            # 过滤面积异常大的框
														
 
															+            area = width * height
														
 
															+            img_area = orig_w * orig_h
														
 
															+            if area > img_area * 0.95:
														
 
															+                continue
														
 
															+            
														
 
															+            # 生成多边形坐标
														
 
															+            poly = [
														
 
															+                bbox[0], bbox[1],  # 左上
														
 
															+                bbox[2], bbox[1],  # 右上
														
 
															+                bbox[2], bbox[3],  # 右下
														
 
															+                bbox[0], bbox[3],  # 左下
														
 
															+            ]
														
 
															+            
														
 
															+            formatted_results.append({
														
 
															+                'category': mineru_category,
														
 
															+                'bbox': bbox,
														
 
															+                'confidence': score,
														
 
															+                'raw': {
														
 
															+                    'original_label': original_label,
														
 
															+                    'original_label_id': class_id,
														
 
															+                    'poly': poly,
														
 
															+                    'width': width,
														
 
															+                    'height': height
														
 
															+                }
														
 
															+            })
														
 
															+        
														
 
															+        # 应用重叠框处理
														
 
															+        if self._remove_overlap and len(formatted_results) > 1:
														
 
															+            formatted_results = LayoutUtils.remove_overlapping_boxes(
														
 
															+                formatted_results,
														
 
															+                iou_threshold=self._iou_threshold,
														
 
															+                overlap_ratio_threshold=self._overlap_ratio_threshold
														
 
															+            )
														
 
															+        
														
 
															+        return formatted_results
														
 
															+    
														
 
															+    def detect_batch(
														
 
															+        self, 
														
 
															+        images: List[Union[np.ndarray, Image.Image]]
														
 
															+    ) -> List[List[Dict[str, Any]]]:
														
 
															+        """
														
 
															+        批量检测布局
														
 
															+        
														
 
															+        Args:
														
 
															+            images: 输入图像列表
														
 
															+            
														
 
															+        Returns:
														
 
															+            每个图像的检测结果列表
														
 
															+        """
														
 
															+        if self.predictor is None:
														
 
															+            raise RuntimeError("Model not initialized. Call initialize() first.")
														
 
															+        
														
 
															+        if not images:
														
 
															+            return []
														
 
															+        
														
 
															+        all_results = []
														
 
															+        for image in images:
														
 
															+            results = self.detect(image)
														
 
															+            all_results.append(results)
														
 
															+        
														
 
															+        return all_results
														
 
															+    
														
 
															+    def visualize(
														
 
															+        self, 
														
 
															+        img: np.ndarray, 
														
 
															+        results: List[Dict],
														
 
															+        output_path: str = None,
														
 
															+        show_confidence: bool = True,
														
 
															+        min_confidence: float = 0.0
														
 
															+    ) -> np.ndarray:
														
 
															+        """
														
 
															+        可视化检测结果
														
 
															+        
														
 
															+        Args:
														
 
															+            img: 输入图像 (BGR 格式)
														
 
															+            results: 检测结果 (MinerU 格式)
														
 
															+            output_path: 输出路径（可选）
														
 
															+            show_confidence: 是否显示置信度
														
 
															+            min_confidence: 最小置信度阈值
														
 
															+            
														
 
															+        Returns:
														
 
															+            标注后的图像
														
 
															+        """
														
 
															+        import random
														
 
															+        
														
 
															+        vis_img = img.copy()
														
 
															+        
														
 
															+        # 预定义类别颜色（与 EnhancedDocPipeline 保持一致）
														
 
															+        predefined_colors = {
														
 
															+            # 文本类
														
 
															+            'text': (153, 0, 76),
														
 
															+            'title': (102, 102, 255),
														
 
															+            'header': (128, 128, 128),
														
 
															+            'footer': (128, 128, 128),
														
 
															+            'page_footnote': (200, 200, 200),
														
 
															+            # 表格类
														
 
															+            'table_body': (204, 204, 0),
														
 
															+            'table_caption': (255, 255, 102),
														
 
															+            # 图片类
														
 
															+            'image_body': (153, 255, 51),
														
 
															+            'image_caption': (102, 178, 255),
														
 
															+            # 公式类
														
 
															+            'interline_equation': (0, 255, 0),
														
 
															+            # 代码类
														
 
															+            'code': (102, 0, 204),
														
 
															+            # 丢弃类
														
 
															+            'abandon': (100, 100, 100),
														
 
															+        }
														
 
															+        
														
 
															+        # 过滤低置信度结果
														
 
															+        filtered_results = [
														
 
															+            res for res in results 
														
 
															+            if res['confidence'] >= min_confidence
														
 
															+        ]
														
 
															+        
														
 
															+        if not filtered_results:
														
 
															+            print(f"⚠️ No results to visualize (min_confidence={min_confidence})")
														
 
															+            return vis_img
														
 
															+        
														
 
															+        # 为每个出现的类别分配颜色
														
 
															+        category_colors = {}
														
 
															+        for res in filtered_results:
														
 
															+            cat = res['category']
														
 
															+            if cat not in category_colors:
														
 
															+                if cat in predefined_colors:
														
 
															+                    category_colors[cat] = predefined_colors[cat]
														
 
															+                else:
														
 
															+                    category_colors[cat] = (
														
 
															+                        random.randint(50, 255),
														
 
															+                        random.randint(50, 255),
														
 
															+                        random.randint(50, 255)
														
 
															+                    )
														
 
															+        
														
 
															+        # 绘制检测框
														
 
															+        for res in filtered_results:
														
 
															+            bbox = res['bbox']
														
 
															+            x1, y1, x2, y2 = bbox
														
 
															+            cat = res['category']
														
 
															+            confidence = res['confidence']
														
 
															+            color = category_colors[cat]
														
 
															+            
														
 
															+            # 获取原始标签
														
 
															+            original_label = res.get('raw', {}).get('original_label', cat)
														
 
															+            
														
 
															+            # 绘制矩形边框
														
 
															+            cv2.rectangle(vis_img, (x1, y1), (x2, y2), color, 2)
														
 
															+            
														
 
															+            # 构造标签文本
														
 
															+            if show_confidence:
														
 
															+                label = f"{original_label}->{cat} {confidence:.2f}"
														
 
															+            else:
														
 
															+                label = f"{original_label}->{cat}"
														
 
															+            
														
 
															+            # 计算标签尺寸
														
 
															+            label_size, baseline = cv2.getTextSize(
														
 
															+                label, 
														
 
															+                cv2.FONT_HERSHEY_SIMPLEX, 
														
 
															+                0.4, 
														
 
															+                1
														
 
															+            )
														
 
															+            label_w, label_h = label_size
														
 
															+            
														
 
															+            # 绘制标签背景
														
 
															+            cv2.rectangle(
														
 
															+                vis_img,
														
 
															+                (x1, y1 - label_h - 4),
														
 
															+                (x1 + label_w, y1),
														
 
															+                color,
														
 
															+                -1
														
 
															+            )
														
 
															+            
														
 
															+            # 绘制标签文字
														
 
															+            cv2.putText(
														
 
															+                vis_img,
														
 
															+                label,
														
 
															+                (x1, y1 - 2),
														
 
															+                cv2.FONT_HERSHEY_SIMPLEX,
														
 
															+                0.4,
														
 
															+                (255, 255, 255),
														
 
															+                1,
														
 
															+                cv2.LINE_AA
														
 
															+            )
														
 
															+        
														
 
															+        # 添加图例
														
 
															+        if category_colors:
														
 
															+            self._draw_legend(vis_img, category_colors, len(filtered_results))
														
 
															+        
														
 
															+        # 保存可视化结果
														
 
															+        if output_path:
														
 
															+            output_path_obj = Path(output_path)
														
 
															+            output_path_obj.parent.mkdir(parents=True, exist_ok=True)
														
 
															+            cv2.imwrite(str(output_path_obj), vis_img)
														
 
															+            print(f"💾 Visualization saved to: {output_path_obj}")
														
 
															+        
														
 
															+        return vis_img
														
 
															+    
														
 
															+    def _draw_legend(
														
 
															+        self, 
														
 
															+        img: np.ndarray, 
														
 
															+        category_colors: Dict[str, tuple],
														
 
															+        total_count: int
														
 
															+    ):
														
 
															+        """在图像上绘制图例"""
														
 
															+        legend_x = img.shape[1] - 200
														
 
															+        legend_y = 20
														
 
															+        line_height = 25
														
 
															+        
														
 
															+        # 绘制半透明背景
														
 
															+        overlay = img.copy()
														
 
															+        cv2.rectangle(
														
 
															+            overlay,
														
 
															+            (legend_x - 10, legend_y - 10),
														
 
															+            (img.shape[1] - 10, legend_y + len(category_colors) * line_height + 30),
														
 
															+            (255, 255, 255),
														
 
															+            -1
														
 
															+        )
														
 
															+        cv2.addWeighted(overlay, 0.7, img, 0.3, 0, img)
														
 
															+        
														
 
															+        # 绘制标题
														
 
															+        cv2.putText(
														
 
															+            img,
														
 
															+            f"Legend ({total_count} total)",
														
 
															+            (legend_x, legend_y),
														
 
															+            cv2.FONT_HERSHEY_SIMPLEX,
														
 
															+            0.5,
														
 
															+            (0, 0, 0),
														
 
															+            1,
														
 
															+            cv2.LINE_AA
														
 
															+        )
														
 
															+        
														
 
															+        # 绘制每个类别
														
 
															+        y_offset = legend_y + line_height
														
 
															+        for cat, color in sorted(category_colors.items()):
														
 
															+            cv2.rectangle(
														
 
															+                img,
														
 
															+                (legend_x, y_offset - 10),
														
 
															+                (legend_x + 15, y_offset),
														
 
															+                color,
														
 
															+                -1
														
 
															+            )
														
 
															+            cv2.rectangle(
														
 
															+                img,
														
 
															+                (legend_x, y_offset - 10),
														
 
															+                (legend_x + 15, y_offset),
														
 
															+                (0, 0, 0),
														
 
															+                1
														
 
															+            )
														
 
															+            
														
 
															+            cv2.putText(
														
 
															+                img,
														
 
															+                cat,
														
 
															+                (legend_x + 20, y_offset - 2),
														
 
															+                cv2.FONT_HERSHEY_SIMPLEX,
														
 
															+                0.4,
														
 
															+                (0, 0, 0),
														
 
															+                1,
														
 
															+                cv2.LINE_AA
														
 
															+            )
														
 
															+            
														
 
															+            y_offset += line_height
														
 
															+
														
 
															+
														
 
															+# 测试代码
														
 
															+if __name__ == "__main__":
														
 
															+    import sys
														
 
															+    import os
														
 
															+    
														
 
															+    # 测试配置
														
 
															+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
														
 
															+    config = {
														
 
															+        'config_file': os.path.join(project_root, 'dit', 'object_detection',
														
 
															+                                   'publaynet_configs', 'cascade', 'cascade_dit_large.yaml'),
														
 
															+        'model_weights': 'https://huggingface.co/HYPJUDY/dit/resolve/main/dit-fts/publaynet_dit-l_cascade.pth',
														
 
															+        'device': 'cpu',
														
 
															+        'conf': 0.3,
														
 
															+        'remove_overlap': True,
														
 
															+        'iou_threshold': 0.8,
														
 
															+        'overlap_ratio_threshold': 0.8
														
 
															+    }
														
 
															+    
														
 
															+    # 初始化检测器
														
 
															+    print("🔧 Initializing DiT Layout Detector...")
														
 
															+    detector = DitLayoutDetector(config)
														
 
															+    detector.initialize()
														
 
															+    
														
 
															+    # 读取测试图像
														
 
															+    img_path = "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results/2023年度报告母公司/2023年度报告母公司_page_021.png"
														
 
															+    
														
 
															+    print(f"\n📖 Loading image: {img_path}")
														
 
															+    img = cv2.imread(img_path)
														
 
															+    
														
 
															+    if img is None:
														
 
															+        print(f"❌ Failed to load image: {img_path}")
														
 
															+        sys.exit(1)
														
 
															+    
														
 
															+    print(f"   Image shape: {img.shape}")
														
 
															+    
														
 
															+    # 执行检测
														
 
															+    print("\n🔍 Detecting layout...")
														
 
															+    results = detector.detect(img)
														
 
															+    
														
 
															+    print(f"\n✅ 检测到 {len(results)} 个区域:")
														
 
															+    for i, res in enumerate(results, 1):
														
 
															+        print(f"  [{i}] {res['category']}: "
														
 
															+              f"score={res['confidence']:.3f}, "
														
 
															+              f"bbox={res['bbox']}, "
														
 
															+              f"original={res['raw']['original_label']}")
														
 
															+    
														
 
															+    # 统计各类别
														
 
															+    category_counts = {}
														
 
															+    for res in results:
														
 
															+        cat = res['category']
														
 
															+        category_counts[cat] = category_counts.get(cat, 0) + 1
														
 
															+    
														
 
															+    print(f"\n📊 类别统计 (MinerU格式):")
														
 
															+    for cat, count in sorted(category_counts.items()):
														
 
															+        print(f"  - {cat}: {count}")
														
 
															+    
														
 
															+    # 可视化
														
 
															+    if len(results) > 0:
														
 
															+        print("\n🎨 Generating visualization...")
														
 
															+        
														
 
															+        output_dir = Path(__file__).parent / "output"
														
 
															+        output_dir.mkdir(parents=True, exist_ok=True)
														
 
															+        output_path = output_dir / f"{Path(img_path).stem}_dit_layout_vis.jpg"
														
 
															+        
														
 
															+        vis_img = detector.visualize(
														
 
															+            img, 
														
 
															+            results, 
														
 
															+            output_path=str(output_path),
														
 
															+            show_confidence=True,
														
 
															+            min_confidence=0.0
														
 
															+        )
														
 
															+        
														
 
															+        print(f"💾 Visualization saved to: {output_path}")
														
 
															+    
														
 
															+    # 清理
														
 
															+    detector.cleanup()
														
 
															+    print("\n✅ 测试完成!")
														
 
															+