# Source: zhengchun/ocr_platform
"""
可视化工具模块

提供文档处理结果的可视化功能：
- Layout 布局可视化
- OCR 结果可视化
- 图片元素保存
"""
from pathlib import Path
from typing import Dict, Any, List, Tuple
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2
from loguru import logger


class VisualizationUtils:
    """Visualization utility class."""
    
    # Color map (kept consistent with the MinerU BlockType / EnhancedDocPipeline categories).
    # Keys are element-type names; values are (R, G, B) tuples.
    COLOR_MAP = {
        # Text elements (TEXT_CATEGORIES)
        'title': (102, 102, 255),           # blue
        'text': (153, 0, 76),               # dark red
        'ocr_text': (153, 0, 76),           # dark red (same as text)
        'low_score_text': (200, 100, 100),  # light red
        'header': (128, 128, 128),          # gray
        'footer': (128, 128, 128),          # gray
        'page_number': (160, 160, 160),     # light gray
        'ref_text': (180, 180, 180),        # light gray
        'aside_text': (180, 180, 180),      # light gray
        'page_footnote': (200, 200, 200),   # light gray
        
        # Table-related elements
        'table': (204, 204, 0),             # yellow
        'table_body': (204, 204, 0),        # yellow
        'table_caption': (255, 255, 102),   # light yellow
        'table_footnote': (229, 255, 204),  # light yellow-green
        
        # Image-related elements
        'image': (153, 255, 51),            # green
        'image_body': (153, 255, 51),       # green
        'figure': (153, 255, 51),           # green
        'image_caption': (102, 178, 255),   # light blue
        'image_footnote': (255, 178, 102),  # orange
        
        # Formula elements
        'interline_equation': (0, 255, 0),  # bright green
        'inline_equation': (0, 200, 0),     # green
        'equation': (0, 220, 0),            # green
        'interline_equation_yolo': (0, 180, 0),
        'interline_equation_number': (0, 160, 0),
        
        # Code elements
        'code': (102, 0, 204),              # purple
        'code_body': (102, 0, 204),         # purple
        'code_caption': (153, 51, 255),     # light purple
        'algorithm': (128, 0, 255),         # purple
        
        # List elements
        'list': (40, 169, 92),              # teal green
        'index': (60, 180, 100),            # teal green

        # Chart / seal
        'chart': (0, 200, 200),
        'seal': (255, 140, 0),              # bright orange (RGB); same in debug and final layout images
        
        # Discarded elements
        'abandon': (100, 100, 100),         # dark gray
        'discarded': (100, 100, 100),       # dark gray
        
        # Error
        'error': (255, 0, 0),               # red
        
        # --- General-purpose tool colors (not element categories; referenced by module_debug_viz / ocr_validator) ---
        
        # OCR text box: bright blue (easier to tell apart than yellow/red on white or light-gray backgrounds)
        'ocr_box': (0, 0, 255),
        # Seal OCR box: bright orange (separate pipeline; same color as the layout seal, to tell it apart during audits)
        'seal_ocr_box': (255, 140, 0),
        # Table cell box: same color as ocr_box
        'cell_box': (0, 0, 255),
        # Box for discarded / abandoned elements
        'discard': (128, 128, 128),
    }
    
    @staticmethod
    def rgb_to_bgr(rgb: tuple) -> tuple:
        """RGB → BGR（供 OpenCV 模块使用）。"""
        return tuple(rgb[i] for i in (2, 1, 0)) if len(rgb) >= 3 else rgb
    
    # --- Backward-compatible aliases (prefer COLOR_MAP['ocr_box'] etc.) ---
    OCR_BOX_COLOR = (0, 0, 255)
    CELL_BOX_COLOR = (0, 0, 255)
    DISCARD_COLOR = (128, 128, 128)  # gray
    
    @staticmethod
    def save_image_elements(
        results: Dict[str, Any],
        images_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        保存图片元素
        
        命名规则:
        - PDF输入: 文件名_page_001_image_1.png
        - 图片输入（单页）: 文件名_image_1.png
        
        Args:
            results: 处理结果
            images_dir: 图片输出目录
            doc_name: 文档名称
            is_pdf: 是否为 PDF 输入
            
        Returns:
            保存的图片路径列表
        """
        saved_paths = []
        image_count = 0
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            
            for element in page.get('elements', []):
                if element.get('type') in ['image', 'image_body', 'figure']:
                    content = element.get('content', {})
                    image_data = content.get('image_data')
                    
                    if image_data is not None:
                        image_count += 1
                        
                        # 根据输入类型决定命名
                        if is_pdf or total_pages > 1:
                            image_filename = f"{doc_name}_page_{page_idx + 1}_image_{image_count}.png"
                        else:
                            image_filename = f"{doc_name}_image_{image_count}.png"
                        
                        image_path = images_dir / image_filename
                        
                        try:
                            if isinstance(image_data, np.ndarray):
                                cv2.imwrite(str(image_path), image_data)
                            else:
                                Image.fromarray(image_data).save(image_path)
                            
                            # 更新路径（只保存文件名）
                            content['image_path'] = image_filename
                            content.pop('image_data', None)
                            
                            saved_paths.append(str(image_path))
                            logger.debug(f"🖼️ Image saved: {image_path}")
                        except Exception as e:
                            logger.warning(f"Failed to save image: {e}")
        
        if image_count > 0:
            logger.info(f"🖼️ {image_count} images saved to: {images_dir}")
        
        return saved_paths
    
    @staticmethod
    def save_layout_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True,
        is_pdf: bool = True
    ) -> List[str]:
        """
        保存 Layout 可视化图片
        
        命名规则:
        - PDF输入: 文件名_page_001_layout.png
        - 图片输入（单页）: 文件名_layout.png
        
        Args:
            results: 处理结果
            output_dir: 输出目录
            doc_name: 文档名称
            draw_type_label: 是否绘制类型标签
            draw_bbox_number: 是否绘制序号
            is_pdf: 是否为 PDF 输入
            
        Returns:
            保存的图片路径列表
        """
        layout_paths = []
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            processed_image = page.get('original_image')
            if processed_image is None:
                processed_image = page.get('processed_image')
            
            if processed_image is None:
                logger.warning(f"Page {page_idx}: No image data found for layout visualization")
                continue
            
            if isinstance(processed_image, np.ndarray):
                image = Image.fromarray(processed_image).convert('RGB')
            elif isinstance(processed_image, Image.Image):
                image = processed_image.convert('RGB')
            else:
                continue
            
            draw = ImageDraw.Draw(image, 'RGBA')
            font = VisualizationUtils._get_font(14)
            
            # 绘制普通元素
            for idx, element in enumerate(page.get('elements', []), 1):
                elem_type = element.get('type', '')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                color = VisualizationUtils.COLOR_MAP.get(elem_type, (255, 0, 0))
                
                # 半透明填充
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*color, 50))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # 边框
                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
                
                # 类型标签
                if draw_type_label:
                    label = elem_type.replace('_', ' ').title()
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=color)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
                
                # 序号
                if draw_bbox_number:
                    number_text = str(idx)
                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
                    draw.rectangle(bbox_number, fill=(255, 0, 0))
                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
            
            # 绘制丢弃元素（灰色样式）
            for idx, element in enumerate(page.get('discarded_blocks', []), 1):
                original_category = element.get('original_category', 'unknown')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                
                # 半透明填充
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*VisualizationUtils.COLOR_MAP['discard'], 30))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # 灰色边框
                draw.rectangle([x0, y0, x1, y1], outline=VisualizationUtils.COLOR_MAP['discard'], width=1)
                
                # 类型标签
                if draw_type_label:
                    label = f"D:{original_category}"
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=VisualizationUtils.COLOR_MAP['discard'])
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
            
            # 根据输入类型决定命名
            if is_pdf or total_pages > 1:
                layout_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_layout.png"
            else:
                layout_path = output_dir / f"{doc_name}_layout.png"
            
            image.save(layout_path)
            layout_paths.append(str(layout_path))
            logger.info(f"🖼️ Layout image saved: {layout_path}")
        
        return layout_paths
    
    @staticmethod
    def save_ocr_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save one OCR-visualization PNG per page (same source and structure
        as ``*_page_001.json``).

        The data drawn is the flat format produced by
        ``JSONFormatters._element_to_cell_bbox_format`` (the same as the JSON
        written by ``save_page_jsons``). The drawing style matches
        debug/ocr_recognition: solid bright-blue line = box with text,
        dashed line = box without text.

        Naming scheme:
        - PDF input (or more than one page): ``<doc_name>_page_001_ocr.png``
        - single-page image input: ``<doc_name>_ocr.png``

        Args:
            results: Processing result; pages are read from ``results['pages']``.
            output_dir: Directory the PNG files are written to.
            doc_name: Document name used as the file-name prefix.
            is_pdf: Whether the input was a PDF.

        Returns:
            Paths (as strings) of the saved OCR images.
        """
        from ocr_utils.json_formatters import JSONFormatters
        from ocr_utils.module_debug_viz import draw_ocr_spans_cv2

        written = []
        page_list = results.get('pages', [])
        page_count = len(page_list)

        for page in page_list:
            page_idx = page.get('page_idx', 0)

            base_image = page.get('original_image')
            if base_image is None:
                base_image = page.get('processed_image')
            if base_image is None:
                logger.warning(f"Page {page_idx}: No image data found for OCR visualization")
                continue

            rotation = float(page.get('angle', 0))

            # Flatten regular elements first, then discarded blocks.
            flat_elements = []
            for group_key in ('elements', 'discarded_blocks'):
                for raw_element in (page.get(group_key) or []):
                    flat = JSONFormatters._element_to_cell_bbox_format(
                        raw_element, page_idx, rotation
                    )
                    if flat:
                        flat_elements.append(flat)

            # One span per table cell, or one per element otherwise.
            spans = []
            for elem in flat_elements:
                box = elem.get('bbox', [])
                if not box or len(box) < 4:
                    continue

                if 'table_cells' in elem:
                    for cell in elem['table_cells']:
                        cell_box = cell.get('bbox', [])
                        if not cell_box or len(cell_box) < 4:
                            continue
                        spans.append({
                            'bbox': cell_box[:4],
                            'text': cell.get('text', '').strip(),
                        })
                    continue

                text_value = elem.get('text')
                if text_value is None:
                    # Box only, no recognized text.
                    spans.append({
                        'bbox': box[:4],
                        'text': '',
                    })
                else:
                    is_seal = elem.get('type', '') == 'seal'
                    spans.append({
                        'bbox': box[:4],
                        'text': str(text_value).strip(),
                        'category': 'seal' if is_seal else None,
                    })

            # The helper's result is converted from BGR to RGB before saving with PIL.
            rendered_bgr = draw_ocr_spans_cv2(base_image, spans)
            rendered = Image.fromarray(cv2.cvtColor(rendered_bgr, cv2.COLOR_BGR2RGB))

            if is_pdf or page_count > 1:
                target = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_ocr.png"
            else:
                target = output_dir / f"{doc_name}_ocr.png"

            rendered.save(target)
            written.append(str(target))
            logger.info(f"🖼️ OCR image saved: {target}")

        return written
    
    @staticmethod
    def _draw_polygon(
        draw: ImageDraw.Draw,
        bbox: List,
        color: Tuple[int, int, int],
        width: int = 1
    ):
        """
        绘制多边形或矩形
        
        Args:
            draw: ImageDraw 对象
            bbox: 坐标（4点多边形或矩形）
            color: 颜色
            width: 线宽
        """
        if isinstance(bbox[0], (list, tuple)):
            points = [(int(p[0]), int(p[1])) for p in bbox]
            points.append(points[0])
            draw.line(points, fill=color, width=width)
        elif len(bbox) >= 4:
            x0, y0, x1, y1 = map(int, bbox[:4])
            draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    
    @staticmethod
    def _get_font(size: int) -> ImageFont.FreeTypeFont:
        """
        Return a font for drawing labels.

        A fixed list of font file paths is tried in order and the first one
        that loads at ``size`` is returned. If none loads, the result of
        ``ImageFont.load_default()`` is returned; that call is made without
        ``size``.

        Args:
            size: Requested font size.

        Returns:
            Font object.
        """
        candidate_paths = (
            "/System/Library/Fonts/Helvetica.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
        )

        for font_path in candidate_paths:
            try:
                return ImageFont.truetype(font_path, size)
            except (OSError, ValueError, ImportError):
                # Missing/unreadable font file, invalid size, or no FreeType
                # support: try the next candidate. Unlike a bare ``except``,
                # KeyboardInterrupt and SystemExit are not swallowed.
                continue

        return ImageFont.load_default()
    
    @staticmethod
    def draw_bbox_on_image(image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
        """
        Return an RGB copy of ``image`` with ``bbox`` highlighted.

        The box gets a translucent fill and a solid border. The input image
        is not modified.

        Args:
            image: PIL Image object.
            bbox: Box coordinates ``[x1, y1, x2, y2]``; values after the
                first four are ignored.
            color: Border color name (e.g. "red", "blue", "green"). The fill
                matches for these three names; any other color gets a
                yellow fill.
            width: Border width.

        Returns:
            New RGB image with the box drawn on it.

        Raises:
            ValueError: If ``bbox`` has fewer than four values.
        """
        if len(bbox) < 4:
            raise ValueError(f"bbox needs at least 4 coordinates, got {len(bbox)}")
        x1, y1, x2, y2 = bbox[:4]

        fill_colors = {
            "red": (255, 0, 0, 30),
            "blue": (0, 0, 255, 30),
            "green": (0, 255, 0, 30)
        }
        fill_color = fill_colors.get(color, (255, 255, 0, 30))

        # Translucent fill, composited on an RGBA copy of the input.
        overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
        ImageDraw.Draw(overlay).rectangle([x1, y1, x2, y2], fill=fill_color)
        result = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')

        # Draw the border last, on the RGB result: drawing it on the input's
        # own mode turns a colored border gray for grayscale/palette images.
        ImageDraw.Draw(result).rectangle([x1, y1, x2, y2], outline=color, width=width)

        return result