소스 검색

feat: introduce new utility modules for document processing, including HTML and JSON formatters, along with a Markdown generator and visualization tools, enhancing output capabilities and modularity

zhch158_admin 17 시간 전
부모
커밋
7d96321bc1

+ 0 - 2
zhch/universal_doc_parser/utils/__init__.py

@@ -1,10 +1,8 @@
 """工具模块"""
 
-from .output_formatter import OutputFormatter
 from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
 
 __all__ = [
-    'OutputFormatter', 
     'OutputFormatterV2',
     'save_mineru_format'
 ]

+ 187 - 0
zhch/universal_doc_parser/utils/html_generator.py

@@ -0,0 +1,187 @@
+"""
+HTML 生成器模块
+
+提供 HTML 输出功能:
+- 表格 HTML 生成(带样式)
+- 单元格坐标展示
+"""
+import json
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+
+
+class HTMLGenerator:
+    """Generate standalone, styled HTML pages for extracted tables."""
+
+    @staticmethod
+    def save_table_htmls(
+        results: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str
+    ) -> Path:
+        """
+        Save every table that carries HTML content as its own HTML file.
+
+        Files are written to ``<output_dir>/tables`` and named
+        ``{doc_name}_table_{n}_page_{page}.html``. ``n`` counts every table
+        element encountered (tables without HTML are counted but not
+        written) and ``page`` is ``page_idx + 1``.
+
+        Args:
+            results: Processing results; ``results['pages']`` is iterated and
+                each page's ``page_idx`` and ``elements`` are read.
+            output_dir: Base output directory.
+            doc_name: Document name used in file names and page metadata.
+
+        Returns:
+            Path of the ``tables`` directory.
+        """
+        tables_dir = output_dir / 'tables'
+        # parents=True: do not fail when output_dir itself does not exist yet.
+        tables_dir.mkdir(parents=True, exist_ok=True)
+
+        table_count = 0  # every table element seen; drives file numbering
+        saved_count = 0  # files actually written; reported in the log line
+
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+
+            for element in page.get('elements', []):
+                if element.get('type') not in ('table', 'table_body'):
+                    continue
+
+                # Incremented before the HTML check so that file numbering
+                # reflects the table's position among all table elements.
+                table_count += 1
+
+                content = element.get('content')
+                if not isinstance(content, dict):
+                    # Non-dict content cannot carry table HTML.
+                    continue
+
+                table_html = content.get('html', '')
+                if not table_html:
+                    continue
+
+                full_html = HTMLGenerator._generate_table_html_with_styles(
+                    table_html, content.get('cells') or [], doc_name, page_idx, table_count
+                )
+
+                html_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx + 1}.html"
+                html_path.write_text(full_html, encoding='utf-8')
+                saved_count += 1
+
+        if saved_count > 0:
+            logger.info(f"📊 {saved_count} tables saved to: {tables_dir}")
+
+        return tables_dir
+
+    @staticmethod
+    def _generate_table_html_with_styles(
+        table_html: str,
+        cells: List[Dict],
+        doc_name: str,
+        page_idx: int,
+        table_idx: int
+    ) -> str:
+        """
+        Build a complete HTML document around one table.
+
+        The page shows document/page/table metadata, the table itself and a
+        collapsible JSON dump of the cell list.
+
+        Args:
+            table_html: Table markup; inserted verbatim (it is expected to be
+                HTML already and is therefore not escaped).
+            cells: Cell dictionaries dumped as JSON; ``None`` is treated as
+                an empty list.
+            doc_name: Document name shown in the title and metadata.
+            page_idx: Zero-based page index (displayed as ``page_idx + 1``).
+            table_idx: Table number shown in the title and metadata.
+
+        Returns:
+            The full HTML document as a string.
+        """
+        # Function-scope import keeps this block self-contained.
+        from html import escape
+
+        cells = cells or []
+        cells_json = json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]"
+
+        # doc_name and the JSON dump are plain text, not markup: escape them
+        # so characters such as '<' or '&' cannot break the generated page.
+        safe_doc_name = escape(str(doc_name))
+        safe_cells_json = escape(cells_json, quote=False)
+
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{safe_doc_name} - Table {table_idx}</title>
+    <style>
+        body {{
+            font-family: Arial, "Microsoft YaHei", sans-serif;
+            margin: 20px;
+            background-color: #f5f5f5;
+        }}
+        .container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            background-color: white;
+            padding: 20px;
+            box-shadow: 0 0 10px rgba(0,0,0,0.1);
+            border-radius: 8px;
+        }}
+        .meta {{
+            color: #666;
+            font-size: 0.9em;
+            margin-bottom: 20px;
+            padding-bottom: 10px;
+            border-bottom: 1px solid #ddd;
+        }}
+        table {{
+            border-collapse: collapse;
+            width: 100%;
+            margin: 20px 0;
+        }}
+        th, td {{
+            border: 1px solid #ddd;
+            padding: 8px 12px;
+            text-align: left;
+        }}
+        th {{
+            background-color: #f2f2f2;
+            font-weight: bold;
+        }}
+        tr:hover {{
+            background-color: #f9f9f9;
+        }}
+        td[data-bbox], th[data-bbox] {{
+            position: relative;
+        }}
+        td[data-bbox]:hover::after, th[data-bbox]:hover::after {{
+            content: attr(data-bbox);
+            position: absolute;
+            bottom: 100%;
+            left: 0;
+            background: #333;
+            color: white;
+            padding: 2px 6px;
+            font-size: 10px;
+            border-radius: 3px;
+            white-space: nowrap;
+            z-index: 100;
+        }}
+        .cells-info {{
+            margin-top: 30px;
+            padding: 15px;
+            background-color: #f8f9fa;
+            border-radius: 5px;
+        }}
+        .cells-info summary {{
+            cursor: pointer;
+            font-weight: bold;
+            color: #333;
+        }}
+        .cells-info pre {{
+            background-color: #2d2d2d;
+            color: #f8f8f2;
+            padding: 15px;
+            border-radius: 5px;
+            overflow-x: auto;
+            font-size: 12px;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="meta">
+            <p><strong>Document:</strong> {safe_doc_name}</p>
+            <p><strong>Page:</strong> {page_idx + 1}</p>
+            <p><strong>Table:</strong> {table_idx}</p>
+            <p><strong>Cells with coordinates:</strong> {len(cells)}</p>
+        </div>
+        
+        {table_html}
+        
+        <div class="cells-info">
+            <details>
+                <summary>📍 单元格坐标数据 (JSON)</summary>
+                <pre>{safe_cells_json}</pre>
+            </details>
+        </div>
+    </div>
+</body>
+</html>"""
+

+ 364 - 0
zhch/universal_doc_parser/utils/json_formatters.py

@@ -0,0 +1,364 @@
+"""
+JSON 格式化工具模块
+
+提供 JSON 输出格式化功能:
+- MinerU middle.json 格式转换
+- mineru_vllm_results_cell_bbox 格式转换
+- 表格单元格格式化
+"""
+import json
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+
+
+class JSONFormatters:
+    """Convert processing results into MinerU-style JSON structures."""
+
+    @staticmethod
+    def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Convert processing results to the MinerU ``middle.json`` layout.
+
+        Each page becomes one entry of ``pdf_info`` whose elements are split
+        into ``para_blocks`` and ``discarded_blocks`` according to their type.
+        Elements listed under the page's own ``discarded_blocks`` key are
+        appended to ``discarded_blocks`` as well.
+
+        Args:
+            results: Processing results (``scene`` and ``pages`` are read).
+
+        Returns:
+            Dictionary in ``middle.json`` form.
+        """
+        middle_json = {
+            "pdf_info": [],
+            "_backend": "vlm",
+            "_scene": results.get('scene', 'unknown'),
+            "_version_name": "2.5.0"
+        }
+        # Element types that are routed to a page's discarded_blocks.
+        discarded_types = ('header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded')
+
+        for page in results.get('pages', []):
+            page_info = {
+                # .get with a default, consistent with the other methods of
+                # this class, instead of raising KeyError on a missing index.
+                'page_idx': page.get('page_idx', 0),
+                # First two entries of image_shape, reversed.
+                # NOTE(review): presumably (height, width) -> [width, height]; confirm.
+                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
+                'angle': page.get('angle', 0),
+                'para_blocks': [],
+                'discarded_blocks': []
+            }
+
+            for element in page.get('elements', []):
+                block = JSONFormatters._element_to_middle_block(element)
+                if not block:
+                    continue
+                if element.get('type', '') in discarded_types:
+                    page_info['discarded_blocks'].append(block)
+                else:
+                    page_info['para_blocks'].append(block)
+
+            # Elements the caller already separated out as discarded.
+            for element in page.get('discarded_blocks', []):
+                block = JSONFormatters._element_to_middle_block(element)
+                if block:
+                    page_info['discarded_blocks'].append(block)
+
+            middle_json['pdf_info'].append(page_info)
+
+        return middle_json
+
+    @staticmethod
+    def _content_text(content: Any) -> Any:
+        """Return ``content['text']`` for dicts, ``''`` for ``None``, else ``str(content)``."""
+        if isinstance(content, dict):
+            return content.get('text', '')
+        # None would otherwise be rendered as the literal string "None".
+        return '' if content is None else str(content)
+
+    @staticmethod
+    def _text_lines(bbox: Any, text: Any) -> List[Dict[str, Any]]:
+        """Wrap ``text`` in the one-line / one-span structure used by middle.json blocks."""
+        return [{
+            'bbox': bbox,
+            'spans': [{
+                'bbox': bbox,
+                'type': 'text',
+                'content': text
+            }]
+        }]
+
+    @staticmethod
+    def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Convert one element to a ``middle.json`` block.
+
+        Tables and images use a nested layout:
+        - image: ``{type: "image", blocks: [{type: "image_body", lines: [...]}]}``
+        - table: ``{type: "table", blocks: [{type: "table_body", lines: [...]}]}``
+
+        All other handled types fill ``lines`` directly. Unknown types yield
+        a block with empty ``lines``; the method never returns ``None``.
+
+        Args:
+            element: Element dictionary (``type``, ``bbox``, ``content``,
+                ``angle`` and ``reading_order`` are read).
+
+        Returns:
+            The block dictionary.
+        """
+        elem_type = element.get('type', '')
+        bbox = element.get('bbox', [0, 0, 0, 0])
+        content = element.get('content', {})
+        # Dict view of the content so structured branches never call .get on
+        # a string or None.
+        fields = content if isinstance(content, dict) else {}
+
+        block = {
+            'type': elem_type,
+            'bbox': bbox,
+            'angle': element.get('angle', 0),
+            'reading_order': element.get('reading_order', 0),
+            'lines': []
+        }
+
+        # Plain text and table/image captions and footnotes share one layout.
+        if elem_type in ('text', 'title', 'ref_text', 'header', 'footer', 'ocr_text',
+                         'table_caption', 'table_footnote', 'image_caption', 'image_footnote'):
+            text = JSONFormatters._content_text(content)
+            if text:
+                block['lines'] = JSONFormatters._text_lines(bbox, text)
+
+        # Table: nested table_body block.
+        elif elem_type in ('table', 'table_body'):
+            block['type'] = 'table'
+            block['blocks'] = [{
+                'type': 'table_body',
+                'bbox': bbox,
+                'angle': 0,
+                'lines': [{
+                    'bbox': bbox,
+                    'spans': [{
+                        'bbox': bbox,
+                        'type': 'table',
+                        'html': fields.get('html', ''),
+                        'cells': fields.get('cells', [])
+                    }]
+                }]
+            }]
+
+        # Image: nested image_body block.
+        elif elem_type in ('image', 'image_body', 'figure'):
+            block['type'] = 'image'
+            block['blocks'] = [{
+                'type': 'image_body',
+                'bbox': bbox,
+                'angle': element.get('angle', 0),
+                'lines': [{
+                    'bbox': bbox,
+                    'spans': [{
+                        'bbox': bbox,
+                        'type': 'image',
+                        'image_path': fields.get('image_path', ''),
+                        'description': fields.get('description', '')
+                    }]
+                }]
+            }]
+
+        # Equation: only types containing "interline" become interline
+        # spans; a bare 'equation' is emitted as an inline equation.
+        elif elem_type in ('interline_equation', 'inline_equation', 'equation'):
+            block['lines'] = [{
+                'bbox': bbox,
+                'spans': [{
+                    'bbox': bbox,
+                    'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
+                    'content': fields.get('latex', '')
+                }]
+            }]
+
+        # Discarded content is normalised to the 'abandon' type.
+        elif elem_type in ('abandon', 'discarded'):
+            block['type'] = 'abandon'
+            text = JSONFormatters._content_text(content)
+            if text:
+                block['lines'] = JSONFormatters._text_lines(bbox, text)
+
+        return block
+
+    @staticmethod
+    def save_page_jsons(
+        results: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str
+    ) -> List[str]:
+        """
+        Save one JSON file per page in mineru_vllm_results_cell_bbox format.
+
+        Files are named ``{doc_name}_page_{NNN}.json`` where ``NNN`` is
+        ``page_idx + 1`` zero-padded to three digits. Each file holds the
+        page's converted elements followed by its converted
+        ``discarded_blocks``; elements of unsupported types are dropped.
+
+        Args:
+            results: Processing results (``pages`` is iterated).
+            output_dir: Output directory; created if it does not exist.
+            doc_name: Document name used as the file-name prefix.
+
+        Returns:
+            List of the saved file paths as strings.
+        """
+        saved_paths = []
+        # Create the target directory up front instead of failing on open().
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+            page_name = f"{doc_name}_page_{page_idx + 1:03d}"
+
+            # Regular elements first, then the page's discarded blocks.
+            page_elements = []
+            for key in ('elements', 'discarded_blocks'):
+                for element in page.get(key, []):
+                    converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
+                    if converted:
+                        page_elements.append(converted)
+
+            json_path = output_dir / f"{page_name}.json"
+            with open(json_path, 'w', encoding='utf-8') as f:
+                json.dump(page_elements, f, ensure_ascii=False, indent=2)
+
+            saved_paths.append(str(json_path))
+            logger.debug(f"📄 Page JSON saved: {json_path}")
+
+        if saved_paths:
+            logger.info(f"📄 {len(saved_paths)} page JSONs saved")
+
+        return saved_paths
+
+    @staticmethod
+    def _int_bbox(bbox: Any) -> List[int]:
+        """Return the first four bbox values as ints, or zeros for a missing/empty bbox."""
+        # len() rather than truthiness so array-like boxes do not raise.
+        if bbox is None or len(bbox) == 0:
+            return [0, 0, 0, 0]
+        return [int(x) for x in bbox[:4]]
+
+    @staticmethod
+    def _element_to_cell_bbox_format(
+        element: Dict[str, Any],
+        page_idx: int
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Convert one element to mineru_vllm_results_cell_bbox format.
+
+        Args:
+            element: Element dictionary (``type``, ``bbox``, ``content``,
+                ``reading_order`` and type-specific keys are read).
+            page_idx: Page index stored in the result.
+
+        Returns:
+            The converted dictionary, or ``None`` when the element type is
+            not one of the handled types.
+        """
+        elem_type = element.get('type', '')
+        content = element.get('content', {})
+        # Dict view of the content so structured branches never call .get on
+        # a string or None.
+        fields = content if isinstance(content, dict) else {}
+
+        result = {
+            'bbox': JSONFormatters._int_bbox(element.get('bbox', [0, 0, 0, 0])),
+            'page_idx': page_idx,
+            'reading_order': element.get('reading_order', 0)
+        }
+
+        # Text: every text-like type is reported as 'text' except titles.
+        if elem_type in ('text', 'title', 'ref_text', 'ocr_text'):
+            result['type'] = 'title' if elem_type == 'title' else 'text'
+            result['text'] = JSONFormatters._content_text(content)
+            if elem_type == 'title':
+                result['text_level'] = element.get('level', 1)
+
+        # Table
+        elif elem_type in ('table', 'table_body'):
+            result['type'] = 'table'
+            result['img_path'] = fields.get('table_image_path', '')
+            result['table_caption'] = JSONFormatters._ensure_list(fields.get('table_caption', []))
+            result['table_footnote'] = JSONFormatters._ensure_list(fields.get('table_footnote', []))
+            result['table_body'] = fields.get('html', '')
+
+            # table_cells is only emitted when cells are present.
+            cells = fields.get('cells', [])
+            if cells:
+                result['table_cells'] = JSONFormatters.format_table_cells(cells)
+
+            # Rotation and skew are optional.
+            if 'table_angle' in fields:
+                result['image_rotation_angle'] = float(fields['table_angle'])
+            if 'skew_angle' in fields:
+                result['skew_angle'] = float(fields['skew_angle'])
+
+        # Image
+        elif elem_type in ('image', 'image_body', 'figure'):
+            result['type'] = 'image'
+            image_filename = fields.get('image_path', '')
+            result['img_path'] = f"images/{image_filename}" if image_filename else ''
+            result['image_caption'] = JSONFormatters._ensure_list(fields.get('caption', []))
+            result['image_footnote'] = JSONFormatters._ensure_list(fields.get('footnote', []))
+
+        # Equation
+        elif elem_type in ('interline_equation', 'inline_equation', 'equation'):
+            result['type'] = 'equation'
+            result['text'] = fields.get('latex', '')
+            result['text_format'] = 'latex'
+
+        # List
+        elif elem_type == 'list':
+            result['type'] = 'list'
+            result['sub_type'] = 'text'
+            result['list_items'] = fields.get('list_items', [])
+
+        # Header/footer and table/image captions/footnotes keep their type.
+        elif elem_type in ('header', 'footer',
+                           'table_caption', 'table_footnote', 'image_caption', 'image_footnote'):
+            result['type'] = elem_type
+            result['text'] = JSONFormatters._content_text(content)
+
+        # Discarded: text is only taken from dict content.
+        elif elem_type in ('discarded', 'abandon'):
+            result['type'] = 'discarded'
+            result['original_category'] = element.get('original_category', 'unknown')
+            result['text'] = fields.get('text', '')
+
+        else:
+            return None
+
+        return result
+
+    @staticmethod
+    def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
+        """
+        Format table cells for mineru_vllm_results_cell_bbox output.
+
+        Each output cell has the form:
+        {
+            "type": "table_cell",
+            "text": "cell content",
+            "matched_text": "OCR-matched text",
+            "bbox": [x1, y1, x2, y2],
+            "row": 1,
+            "col": 1,
+            "score": 100.0,
+            "paddle_bbox_indices": [0, 1]
+        }
+
+        ``matched_text`` falls back to ``text``; ``paddle_bbox_indices``
+        falls back to ``paddle_indices``; a missing or ``None`` bbox becomes
+        four zeros and a missing or ``None`` score becomes ``100.0``.
+
+        Args:
+            cells: Cell dictionaries; ``None`` is treated as an empty list.
+
+        Returns:
+            List of formatted cell dictionaries.
+        """
+        formatted_cells = []
+
+        for cell in cells or []:
+            text = cell.get('text', '')
+
+            raw_bbox = cell.get('bbox')
+            if raw_bbox is None:
+                raw_bbox = [0, 0, 0, 0]
+
+            raw_score = cell.get('score')
+            score = 100.0 if raw_score is None else float(raw_score)
+
+            formatted_cells.append({
+                'type': 'table_cell',
+                'text': text,
+                'matched_text': cell.get('matched_text', text),
+                'bbox': [float(x) for x in raw_bbox[:4]],
+                'row': cell.get('row', 0),
+                'col': cell.get('col', 0),
+                'score': score,
+                'paddle_bbox_indices': cell.get('paddle_bbox_indices',
+                                                cell.get('paddle_indices', []))
+            })
+
+        return formatted_cells
+
+    @staticmethod
+    def _ensure_list(value) -> List:
+        """
+        Coerce ``value`` to a list.
+
+        ``None`` and ``''`` give ``[]``; a non-empty string gives a
+        one-element list; a list is returned unchanged; a tuple is converted
+        element-wise; anything else is stringified into a one-element list.
+        """
+        if value is None:
+            return []
+        if isinstance(value, str):
+            return [value] if value else []
+        if isinstance(value, list):
+            return value
+        if isinstance(value, tuple):
+            # Keep the items instead of collapsing the tuple into its repr.
+            return list(value)
+        return [str(value)]
+

+ 415 - 0
zhch/universal_doc_parser/utils/markdown_generator.py

@@ -0,0 +1,415 @@
+"""
+Markdown 生成器模块
+
+提供 Markdown 输出功能:
+- 完整文档 Markdown 生成
+- 按页 Markdown 生成
+- MinerU union_make 集成
+"""
+import sys
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+
+# Import MinerU components.
+# The directory three levels above this file's directory is put on sys.path
+# so that the `mineru` package can be imported.
+# NOTE(review): presumably that directory is the MinerU repository root — confirm.
+mineru_path = Path(__file__).parents[3]
+if str(mineru_path) not in sys.path:
+    sys.path.insert(0, str(mineru_path))
+
+# MinerU is optional: when the import fails, MINERU_AVAILABLE is False,
+# vlm_union_make is None, and a local MakeMode stand-in is defined below.
+try:
+    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
+    from mineru.utils.enum_class import MakeMode
+    MINERU_AVAILABLE = True
+except ImportError:
+    MINERU_AVAILABLE = False
+    vlm_union_make = None
+    
+    # Stand-in so that attribute references such as MakeMode.MM_MD still
+    # resolve when MinerU cannot be imported.
+    class MakeMode:
+        MM_MD = 'mm_md'
+        NLP_MD = 'nlp_md'
+
+
+class MarkdownGenerator:
+    """Render processing results as Markdown, for a whole document or per page."""
+
+    @staticmethod
+    def save_markdown(
+        results: Dict[str, Any],
+        middle_json: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str,
+        use_mineru_union: bool = False
+    ) -> Path:
+        """
+        Save the whole document as ``<output_dir>/<doc_name>.md``.
+
+        The custom renderer is used by default so that every element type
+        (including standalone ``table_caption`` etc.) is written. MinerU's
+        ``union_make`` can be requested instead; if it is unavailable,
+        raises, or returns nothing, the custom renderer is used.
+
+        Args:
+            results: Processing results.
+            middle_json: Data in middle.json form (``pdf_info`` is passed to
+                MinerU when it is used).
+            output_dir: Output directory; created if it does not exist.
+            doc_name: Document name used as the file stem.
+            use_mineru_union: Whether to try MinerU ``union_make`` first
+                (default False).
+
+        Returns:
+            Path of the written Markdown file.
+        """
+        output_dir.mkdir(parents=True, exist_ok=True)
+        md_path = output_dir / f"{doc_name}.md"
+
+        if use_mineru_union and MINERU_AVAILABLE and vlm_union_make is not None:
+            # Deliberately broad: any MinerU failure falls back to the
+            # custom renderer below instead of aborting the export.
+            try:
+                img_bucket_path = "images"
+                markdown_content = vlm_union_make(
+                    middle_json['pdf_info'],
+                    MakeMode.MM_MD,
+                    img_bucket_path
+                )
+
+                if markdown_content:
+                    if isinstance(markdown_content, list):
+                        markdown_content = '\n\n'.join(markdown_content)
+
+                    header = MarkdownGenerator._generate_header(results)
+                    md_path.write_text(header + str(markdown_content), encoding='utf-8')
+
+                    logger.info(f"📝 Markdown saved (MinerU format): {md_path}")
+                    return md_path
+
+            except Exception as e:
+                logger.warning(f"MinerU union_make failed: {e}, falling back to custom implementation")
+        elif use_mineru_union:
+            # The caller asked for MinerU but it could not be imported.
+            logger.warning("MinerU union_make requested but MinerU is not available, using custom implementation")
+
+        md_path.write_text(MarkdownGenerator._generate_full_markdown(results), encoding='utf-8')
+
+        logger.info(f"📝 Markdown saved (custom format): {md_path}")
+        return md_path
+
+    @staticmethod
+    def save_page_markdowns(
+        results: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str
+    ) -> List[str]:
+        """
+        Save one Markdown file per page.
+
+        Files are named ``{doc_name}_page_{NNN}.md`` where ``NNN`` is
+        ``page_idx + 1`` zero-padded to three digits.
+
+        Args:
+            results: Processing results (``pages`` is iterated).
+            output_dir: Output directory; created if it does not exist.
+            doc_name: Document name used as the file-name prefix.
+
+        Returns:
+            List of the saved Markdown file paths as strings.
+        """
+        saved_paths = []
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+            md_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}.md"
+
+            md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
+            md_path.write_text(md_content, encoding='utf-8')
+
+            saved_paths.append(str(md_path))
+            logger.debug(f"📝 Page Markdown saved: {md_path}")
+
+        if saved_paths:
+            logger.info(f"📝 {len(saved_paths)} page Markdowns saved")
+
+        return saved_paths
+
+    @staticmethod
+    def _front_matter_lines(results: Dict[str, Any]) -> List[str]:
+        """Return the document front-matter block as a list of lines (with one trailing empty line)."""
+        return [
+            "---",
+            f"scene: {results.get('scene', 'unknown')}",
+            f"document: {results.get('document_path', '')}",
+            f"pages: {len(results.get('pages', []))}",
+            "---",
+            "",
+        ]
+
+    @staticmethod
+    def _generate_header(results: Dict[str, Any]) -> str:
+        """Return the document front matter as a string ending in a blank line."""
+        return '\n'.join(MarkdownGenerator._front_matter_lines(results)) + '\n'
+
+    @staticmethod
+    def _content_text(content: Any) -> Any:
+        """Return ``content['text']`` for dicts, ``''`` for ``None``, else ``str(content)``."""
+        if isinstance(content, dict):
+            return content.get('text', '')
+        # None would otherwise be rendered as the literal string "None".
+        return '' if content is None else str(content)
+
+    @staticmethod
+    def _heading_prefix(level: Any) -> str:
+        """Return 1 to 6 ``#`` characters for ``level``; unusable levels count as 1."""
+        try:
+            depth = int(level)
+        except (TypeError, ValueError):
+            depth = 1
+        return '#' * max(1, min(depth, 6))
+
+    @staticmethod
+    def _discarded_comment(category: Any, text: Any) -> str:
+        """Return the HTML comment that records a discarded element."""
+        # "-->" inside the payload would close the comment early and leak
+        # the rest of the text into the visible document.
+        safe_category = str(category).replace('-->', '--&gt;')
+        safe_text = str(text).replace('-->', '--&gt;')
+        return f"<!-- [discarded: {safe_category}] {safe_text} -->"
+
+    @staticmethod
+    def _render_element(element: Dict[str, Any], with_table_captions: bool) -> List[str]:
+        """
+        Render one non-discarded element as Markdown lines.
+
+        Args:
+            element: Element dictionary (``type``, ``content`` and, for
+                titles, ``level`` are read).
+            with_table_captions: When True, a table also emits the captions
+                stored in its own content and always ends with an empty
+                line; when False, a table without HTML emits nothing.
+
+        Returns:
+            Lines to append to the document; empty for elements with no
+            content and for unhandled types.
+        """
+        elem_type = element.get('type', '')
+        content = element.get('content', {})
+        # Dict view of the content so structured branches never call .get on
+        # a string or None.
+        fields = content if isinstance(content, dict) else {}
+
+        if elem_type == 'title':
+            text = MarkdownGenerator._content_text(content)
+            if not text:
+                return []
+            prefix = MarkdownGenerator._heading_prefix(element.get('level', 1))
+            return [f"{prefix} {text}", ""]
+
+        if elem_type in ('text', 'ocr_text', 'ref_text'):
+            text = MarkdownGenerator._content_text(content)
+            return [text, ""] if text else []
+
+        if elem_type in ('table', 'table_body'):
+            lines = []
+            if with_table_captions:
+                captions = fields.get('table_caption') or []
+                if isinstance(captions, str):
+                    captions = [captions]
+                lines.extend(f"**{caption}**" for caption in captions)
+            html = fields.get('html', '')
+            if html:
+                lines.append(f"\n{html}\n")
+            if html or with_table_captions:
+                lines.append("")
+            return lines
+
+        if elem_type in ('image', 'image_body', 'figure'):
+            img_filename = fields.get('image_path', '')
+            return [f"![](images/{img_filename})", ""] if img_filename else []
+
+        if elem_type in ('interline_equation', 'inline_equation', 'equation'):
+            latex = fields.get('latex', '')
+            return [f"$$\n{latex}\n$$", ""] if latex else []
+
+        # Captions are bold, footnotes italic, for tables and images alike.
+        if elem_type in ('table_caption', 'image_caption'):
+            text = MarkdownGenerator._content_text(content)
+            return [f"**{text}**", ""] if text else []
+
+        if elem_type in ('table_footnote', 'image_footnote'):
+            text = MarkdownGenerator._content_text(content)
+            return [f"*{text}*", ""] if text else []
+
+        return []
+
+    @staticmethod
+    def _generate_full_markdown(results: Dict[str, Any]) -> str:
+        """
+        Generate Markdown for the whole document (custom implementation).
+
+        Elements are written in the order they appear on each page. Tables
+        emit only their HTML; captions and footnotes are written from their
+        own standalone elements.
+
+        Args:
+            results: Processing results.
+
+        Returns:
+            Markdown content.
+        """
+        md_lines = MarkdownGenerator._front_matter_lines(results)
+
+        for page in results.get('pages', []):
+            for element in page.get('elements', []):
+                md_lines.extend(
+                    MarkdownGenerator._render_element(element, with_table_captions=False)
+                )
+
+        return '\n'.join(md_lines)
+
+    @staticmethod
+    def _generate_fallback(results: Dict[str, Any]) -> str:
+        """
+        Fallback Markdown generation for the whole document.
+
+        Like ``_generate_full_markdown``, but each element with a non-empty
+        bbox is preceded by a ``<!-- bbox: ... -->`` comment and tables also
+        emit the captions stored in their own content.
+
+        Args:
+            results: Processing results.
+
+        Returns:
+            Markdown content.
+        """
+        md_lines = MarkdownGenerator._front_matter_lines(results)
+
+        for page in results.get('pages', []):
+            for element in page.get('elements', []):
+                bbox = element.get('bbox', [])
+                if bbox:
+                    md_lines.append(f"<!-- bbox: {bbox} -->")
+                md_lines.extend(
+                    MarkdownGenerator._render_element(element, with_table_captions=True)
+                )
+
+        return '\n'.join(md_lines)
+
+    @staticmethod
+    def _generate_page_markdown(
+        page: Dict[str, Any],
+        doc_name: str,
+        page_idx: int
+    ) -> str:
+        """
+        Generate the Markdown content of a single page.
+
+        Every element is preceded by a comment holding its reading order and
+        bbox. Discarded elements, both inline ones of type ``discarded`` /
+        ``abandon`` and those under the page's ``discarded_blocks``, are
+        written as HTML comments so they stay out of the rendered text.
+
+        Args:
+            page: Page data.
+            doc_name: Document name written to the front matter.
+            page_idx: Zero-based page index (written as ``page_idx + 1``).
+
+        Returns:
+            Markdown content.
+        """
+        md_lines = [
+            "---",
+            f"document: {doc_name}",
+            f"page: {page_idx + 1}",
+            f"angle: {page.get('angle', 0)}",
+            "---",
+            "",
+        ]
+
+        for element in page.get('elements', []):
+            bbox = element.get('bbox', [])
+            reading_order = element.get('reading_order', 0)
+            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
+
+            if element.get('type', '') in ('discarded', 'abandon'):
+                content = element.get('content', {})
+                text = content.get('text', '') if isinstance(content, dict) else ''
+                if text:
+                    md_lines.append(MarkdownGenerator._discarded_comment(
+                        element.get('original_category', 'unknown'), text
+                    ))
+                    md_lines.append("")
+            else:
+                md_lines.extend(
+                    MarkdownGenerator._render_element(element, with_table_captions=True)
+                )
+
+        # Elements the caller already separated out as discarded.
+        for element in page.get('discarded_blocks', []):
+            content = element.get('content', {})
+            bbox = element.get('bbox', [])
+            reading_order = element.get('reading_order', 0)
+            original_category = element.get('original_category', 'unknown')
+
+            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
+            text = content.get('text', '') if isinstance(content, dict) else ''
+            md_lines.append(
+                MarkdownGenerator._discarded_comment(original_category, text or "(no text)")
+            )
+            md_lines.append("")
+
+        return '\n'.join(md_lines)
+

+ 0 - 770
zhch/universal_doc_parser/utils/output_formatter.py

@@ -1,770 +0,0 @@
-"""
-输出格式化器 - 将处理结果转换为多种格式输出
-严格复用MinerU的输出格式,确保完全兼容
-"""
-import json
-import os
-import sys
-from pathlib import Path
-from typing import Dict, Any, List, Union
-from loguru import logger
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-
-# Make MinerU importable: its middle-format conversion module is imported just below
-mineru_path = Path(__file__).parents[3]  # three levels above this file's own directory
-if str(mineru_path) not in sys.path:
-    sys.path.insert(0, str(mineru_path))
-
-from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.utils.enum_class import MakeMode, BlockType, ContentType
-
-
-class OutputFormatter:
-    """输出格式化器 - 严格按照MinerU格式"""
-    
-    def __init__(self, output_dir: str):
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)  # create the output root eagerly, parents included
-        
-        # Block-type -> RGB colour map (kept consistent with MinerU); looked up when drawing layout images
-        self.color_map = {
-            BlockType.TITLE: (102, 102, 255),         # blue
-            BlockType.TEXT: (153, 0, 76),             # dark red
-            BlockType.IMAGE: (153, 255, 51),          # green
-            BlockType.IMAGE_BODY: (153, 255, 51),
-            BlockType.IMAGE_CAPTION: (102, 178, 255),
-            BlockType.IMAGE_FOOTNOTE: (255, 178, 102),
-            BlockType.TABLE: (204, 204, 0),           # yellow
-            BlockType.TABLE_BODY: (204, 204, 0),
-            BlockType.TABLE_CAPTION: (255, 255, 102),
-            BlockType.TABLE_FOOTNOTE: (229, 255, 204),
-            BlockType.INTERLINE_EQUATION: (0, 255, 0), # bright green
-            BlockType.LIST: (40, 169, 92),
-            BlockType.CODE: (102, 0, 204),            # purple
-            BlockType.CODE_BODY: (102, 0, 204),
-            BlockType.CODE_CAPTION: (204, 153, 255),
-        }
-        
-    def save_results(
-        self, 
-        results: Dict[str, Any], 
-        output_config: Dict[str, Any]
-    ) -> Dict[str, str]:
-        """
-        Save the processing results in several output formats.
-        
-        Args:
-            results: Result dict; 'document_path' and 'pages' are read here (pages may carry 'processed_image')
-            output_config: Output options; each save_* flag toggles one output format
-            
-        Returns:
-            Dict mapping each produced format name to its output path(s)
-        """
-        output_paths = {}
-        
-        # Create a per-document output directory named after the input file's stem
-        doc_name = Path(results['document_path']).stem
-        doc_output_dir = self.output_dir / doc_name
-        doc_output_dir.mkdir(parents=True, exist_ok=True)
-        
-        # 1. Convert to MinerU's standard middle.json structure
-        middle_json = self._convert_to_middle_json(results)
-        
-        # 2. Save middle.json
-        if output_config.get('save_json', True):
-            middle_json_path = doc_output_dir / f"{doc_name}_middle.json"
-            with open(middle_json_path, 'w', encoding='utf-8') as f:
-                json.dump(middle_json, f, ensure_ascii=False, indent=2)
-            output_paths['middle_json'] = str(middle_json_path)
-            logger.info(f"📄 Middle JSON saved: {middle_json_path}")
-        
-        # 3. Generate content_list.json via vlm_union_make
-        if output_config.get('save_content_list', True):
-            content_list_path = self._save_content_list(
-                middle_json, doc_output_dir, doc_name
-            )
-            output_paths['content_list'] = str(content_list_path)
-        
-        # 4. Generate Markdown
-        if output_config.get('save_markdown', True):
-            md_path = self._save_markdown(middle_json, doc_output_dir, doc_name)
-            output_paths['markdown'] = str(md_path)
-        
-        # 5. Save table HTML (one file per table)
-        if output_config.get('save_table_html', True):
-            table_html_dir = self._save_table_htmls(
-                middle_json, doc_output_dir, doc_name
-            )
-            output_paths['table_htmls'] = str(table_html_dir)
-        
-        # 6. Draw layout images (off by default, unlike the other outputs)
-        if output_config.get('save_layout_image', False):
-            layout_image_paths = self._save_layout_image(
-                middle_json=middle_json,
-                results=results,
-                output_dir=doc_output_dir,
-                doc_name=doc_name,
-                draw_type_label=output_config.get('draw_type_label', True),
-                draw_bbox_number=output_config.get('draw_bbox_number', True)
-            )
-            output_paths['layout_images'] = layout_image_paths  # NOTE(review): a list of Paths, not a str as the return annotation implies
-        
-        logger.info(f"✅ Results saved to: {doc_output_dir}")
-        return output_paths
-    
-    def _convert_to_middle_json(self, results: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Convert the results into MinerU's standard middle.json structure.
-        Follows the VLM-backend format described in docs/zh/reference/output_files.md.
-        """
-        middle_json = {
-            "pdf_info": [],
-            "_backend": "vlm",  # mark as VLM backend
-            "_scene": results.get('scene', 'unknown'),
-            "_version_name": "2.5.0"
-        }
-        
-        for page in results['pages']:
-            page_info = {
-                'page_idx': page['page_idx'],
-                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),  # [width, height]
-                'angle': page.get('angle', 0),
-                'para_blocks': [],
-                'discarded_blocks': []
-            }
-            
-            # Convert each element into a MinerU-style block
-            for element in page['elements']:
-                block = self._element_to_mineru_block(element, page_info['page_size'])
-                if block:
-                    # Route by element type into para_blocks or discarded_blocks
-                    if element.get('type') in ['header', 'footer', 'page_number', 
-                                               'aside_text', 'page_footnote']:
-                        page_info['discarded_blocks'].append(block)
-                    else:
-                        page_info['para_blocks'].append(block)
-            
-            middle_json['pdf_info'].append(page_info)
-        
-        return middle_json
-    
-    def _element_to_mineru_block(
-        self, 
-        element: Dict[str, Any],
-        page_size: List[int]
-    ) -> Dict[str, Any]:
-        """
-        Convert one result element into a MinerU-standard block dict.
-        
-        Reference: mineru/backend/vlm/vlm_middle_json_mkcontent.py
-        """
-        element_type = element.get('type', '')
-        bbox = element.get('bbox', [0, 0, 0, 0])
-        
-        # Normalising the bbox to the 0-1 range is disabled; page_size is therefore unused here
-        # normalized_bbox = self._normalize_bbox(bbox, page_size)
-        
-        block = {
-            'type': element_type,
-            'bbox': bbox,
-            'angle': element.get('angle', 0),  # specific to the VLM backend
-            'lines': []
-        }
-        
-        # Text-like types (text, title, ref_text, etc.)
-        if element_type in [BlockType.TEXT, BlockType.TITLE, BlockType.REF_TEXT,
-                           BlockType.PHONETIC, BlockType.HEADER, BlockType.FOOTER,
-                           BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
-            content = element.get('content', {})
-            text = content.get('text', '') if isinstance(content, dict) else str(content)
-            
-            if text:
-                block['lines'] = [{
-                    'bbox': bbox,
-                    'spans': [{
-                        'bbox': bbox,
-                        'type': ContentType.TEXT,
-                        'content': text
-                    }]
-                }]
-            
-            # Attach the heading level when the element provides one
-            if element_type == BlockType.TITLE and 'level' in element:
-                block['level'] = element['level']
-        
-        # List type: one child text block per list item, all sharing the parent bbox
-        elif element_type == BlockType.LIST:
-            block['sub_type'] = element.get('sub_type', 'text')
-            block['blocks'] = []
-            
-            list_items = element.get('content', {}).get('list_items', [])  # NOTE(review): no dict check on content here, unlike the text branch
-            for item_text in list_items:
-                item_block = {
-                    'type': BlockType.TEXT,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': item_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(item_block)
-        
-        # Code block type
-        elif element_type == BlockType.CODE:
-            block['sub_type'] = element.get('sub_type', 'code')
-            block['blocks'] = []
-            
-            code_content = element.get('content', {})
-            
-            # code_body
-            code_body = code_content.get('code_body', '')
-            if code_body:
-                code_body_block = {
-                    'type': BlockType.CODE_BODY,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': code_body
-                        }]
-                    }]
-                }
-                block['blocks'].append(code_body_block)
-                
-                # Attach the guessed language tag (only when a code body exists)
-                if 'guess_lang' in element:
-                    block['guess_lang'] = element['guess_lang']
-            
-            # code_caption
-            code_caption = code_content.get('code_caption', [])
-            for caption_text in code_caption:
-                caption_block = {
-                    'type': BlockType.CODE_CAPTION,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': caption_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(caption_block)
-        
-        # Interline (display) equation: a single span carrying the LaTeX source
-        elif element_type == BlockType.INTERLINE_EQUATION:
-            formula_content = element.get('content', {})
-            latex = formula_content.get('latex', '')
-            
-            block['lines'] = [{
-                'bbox': bbox,
-                'spans': [{
-                    'bbox': bbox,
-                    'type': ContentType.INTERLINE_EQUATION,
-                    'content': latex
-                }]
-            }]
-        
-        # Image: optional body plus caption and footnote child blocks
-        elif element_type == BlockType.IMAGE:
-            block['blocks'] = []
-            
-            image_content = element.get('content', {})
-            
-            # image_body
-            img_path = image_content.get('img_path', '')
-            if img_path:
-                image_body_block = {
-                    'type': BlockType.IMAGE_BODY,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.IMAGE,
-                            'image_path': img_path
-                        }]
-                    }]
-                }
-                block['blocks'].append(image_body_block)
-            
-            # image_caption
-            for caption_text in image_content.get('image_caption', []):
-                caption_block = {
-                    'type': BlockType.IMAGE_CAPTION,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': caption_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(caption_block)
-            
-            # image_footnote
-            for footnote_text in image_content.get('image_footnote', []):
-                footnote_block = {
-                    'type': BlockType.IMAGE_FOOTNOTE,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': footnote_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(footnote_block)
-        
-        # Table: optional body plus caption and footnote child blocks
-        elif element_type == BlockType.TABLE:
-            block['blocks'] = []
-            
-            table_content = element.get('content', {})
-            
-            # table_body
-            table_html = table_content.get('html', '')
-            img_path = table_content.get('img_path', '')
-            
-            if table_html or img_path:
-                table_body_block = {
-                    'type': BlockType.TABLE_BODY,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TABLE,
-                            'html': table_html,
-                            'image_path': img_path
-                        }]
-                    }]
-                }
-                block['blocks'].append(table_body_block)
-            
-            # table_caption
-            for caption_text in table_content.get('table_caption', []):
-                caption_block = {
-                    'type': BlockType.TABLE_CAPTION,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': caption_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(caption_block)
-            
-            # table_footnote
-            for footnote_text in table_content.get('table_footnote', []):
-                footnote_block = {
-                    'type': BlockType.TABLE_FOOTNOTE,
-                    'bbox': bbox,
-                    'angle': 0,
-                    'lines': [{
-                        'bbox': bbox,
-                        'spans': [{
-                            'bbox': bbox,
-                            'type': ContentType.TEXT,
-                            'content': footnote_text
-                        }]
-                    }]
-                }
-                block['blocks'].append(footnote_block)
-        
-        return block
-    
-    def _normalize_bbox(self, bbox: List[float], page_size: List[int]) -> List[float]:
-        """
-        Normalise a bbox to the 0-1 range by dividing by the page dimensions.
-        
-        Args:
-            bbox: [x0, y0, x1, y1] in absolute coordinates
-            page_size: [width, height] of the page
-            
-        Returns:
-            The normalised bbox; all zeros when bbox is empty or not exactly 4 items long
-        """
-        if not bbox or len(bbox) != 4:
-            return [0.0, 0.0, 0.0, 0.0]
-        
-        page_width, page_height = page_size
-        x0, y0, x1, y1 = bbox
-        
-        return [
-            x0 / page_width if page_width > 0 else 0.0,
-            y0 / page_height if page_height > 0 else 0.0,
-            x1 / page_width if page_width > 0 else 0.0,
-            y1 / page_height if page_height > 0 else 0.0
-        ]
-    
-    def _save_content_list(
-        self, 
-        middle_json: Dict[str, Any], 
-        output_dir: Path,
-        doc_name: str
-    ) -> Path:
-        """
-        Generate content_list.json via vlm_union_make; an empty list is written if that fails.
-        """
-        content_list_path = output_dir / f"{doc_name}_content_list.json"
-        
-        try:
-            # Call MinerU's vlm_union_make function directly
-            content_list = vlm_union_make(
-                middle_json['pdf_info'],
-                make_mode=MakeMode.CONTENT_LIST,
-                img_buket_path='images'
-            )
-            
-            with open(content_list_path, 'w', encoding='utf-8') as f:
-                json.dump(content_list, f, ensure_ascii=False, indent=2)
-            
-            logger.info(f"📋 Content list saved: {content_list_path}")
-            
-        except Exception as e:
-            logger.error(f"❌ Failed to generate content_list: {e}")
-            # Fallback: save an empty list so the returned path still exists
-            with open(content_list_path, 'w', encoding='utf-8') as f:
-                json.dump([], f)
-        
-        return content_list_path
-    
-    def _save_markdown(
-        self, 
-        middle_json: Dict[str, Any], 
-        output_dir: Path,
-        doc_name: str
-    ) -> Path:
-        """
-        Generate the Markdown file via vlm_union_make, prefixed with a metadata header.
-        """
-        md_path = output_dir / f"{doc_name}.md"
-        
-        try:
-            # Create the images directory
-            images_dir = output_dir / 'images'
-            images_dir.mkdir(exist_ok=True)
-            
-            # Call MinerU's vlm_union_make to produce the markdown
-            markdown_content = vlm_union_make(
-                middle_json['pdf_info'],
-                make_mode=MakeMode.MM_MD,
-                img_buket_path='images'
-            )
-            
-            # Prepend a metadata header (scene / backend / version)
-            metadata = f"""---
-scene: {middle_json.get('_scene', 'unknown')}
-backend: {middle_json.get('_backend', 'vlm')}
-version: {middle_json.get('_version_name', '2.5.0')}
----
-
-"""
-            
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write(metadata)
-                f.write(markdown_content)
-            
-            logger.info(f"📝 Markdown saved: {md_path}")
-            
-        except Exception as e:
-            logger.error(f"❌ Failed to generate markdown: {e}")
-            # Fallback: write a placeholder document so the returned path still exists
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write(f"# {doc_name}\n\n*Markdown generation failed*\n")
-        
-        return md_path
-    
-    def _save_table_htmls(
-        self,
-        middle_json: Dict[str, Any],
-        output_dir: Path,
-        doc_name: str
-    ) -> Path:
-        """
-        Save every table as its own standalone HTML file and return the tables directory.
-        """
-        tables_dir = output_dir / 'tables'
-        tables_dir.mkdir(exist_ok=True)
-        
-        table_count = 0
-        
-        for page_idx, page_info in enumerate(middle_json['pdf_info']):
-            for block in page_info.get('para_blocks', []):
-                if block.get('type') == BlockType.TABLE:
-                    # Extract the table HTML from the table_body spans
-                    for sub_block in block.get('blocks', []):
-                        if sub_block.get('type') == BlockType.TABLE_BODY:
-                            for line in sub_block.get('lines', []):
-                                for span in line.get('spans', []):
-                                    html_content = span.get('html', '')
-                                    if html_content:
-                                        # Save the table HTML; the counter runs across the whole document
-                                        table_count += 1
-                                        table_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx}.html"
-                                        
-                                        # Build a complete HTML document
-                                        full_html = self._wrap_table_html(
-                                            html_content,
-                                            f"{doc_name} - Table {table_count}",
-                                            page_idx
-                                        )
-                                        
-                                        with open(table_path, 'w', encoding='utf-8') as f:
-                                            f.write(full_html)
-                                        
-                                        logger.info(f"📊 Table {table_count} saved: {table_path}")
-        
-        if table_count > 0:
-            logger.info(f"📊 Total {table_count} tables saved to: {tables_dir}")
-        
-        return tables_dir
-    
-    def _wrap_table_html(self, table_html: str, title: str, page_idx: int) -> str:
-        """Wrap the table HTML in a complete HTML document; title and table_html are inserted unescaped."""
-        return f"""<!DOCTYPE html>
-<html lang="zh-CN">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>{title}</title>
-    <style>
-        body {{
-            font-family: Arial, "Microsoft YaHei", sans-serif;
-            margin: 20px;
-            background-color: #f5f5f5;
-        }}
-        .container {{
-            max-width: 1200px;
-            margin: 0 auto;
-            background-color: white;
-            padding: 20px;
-            box-shadow: 0 0 10px rgba(0,0,0,0.1);
-        }}
-        .meta {{
-            color: #666;
-            font-size: 0.9em;
-            margin-bottom: 20px;
-            padding-bottom: 10px;
-            border-bottom: 1px solid #ddd;
-        }}
-        table {{
-            border-collapse: collapse;
-            width: 100%;
-            margin: 20px 0;
-        }}
-        th, td {{
-            border: 1px solid #ddd;
-            padding: 8px 12px;
-            text-align: left;
-        }}
-        th {{
-            background-color: #f2f2f2;
-            font-weight: bold;
-        }}
-        tr:hover {{
-            background-color: #f9f9f9;
-        }}
-    </style>
-</head>
-<body>
-    <div class="container">
-        <div class="meta">
-            <p><strong>Title:</strong> {title}</p>
-            <p><strong>Page:</strong> {page_idx + 1}</p>
-        </div>
-        {table_html}
-    </div>
-</body>
-</html>"""
-    
-    def _save_layout_image(
-        self,
-        middle_json: Dict[str, Any],
-        results: Dict[str, Any],
-        output_dir: Path,
-        doc_name: str,
-        draw_type_label: bool = True,
-        draw_bbox_number: bool = True
-    ) -> List[Path]:
-        """
-        Draw the layout detection results on top of each page image and save one PNG per page.
-        
-        Args:
-            middle_json: MinerU middle JSON
-            results: Processing results; each page's 'processed_image' holds the preprocessed image
-            output_dir: Output directory
-            doc_name: Document name
-            draw_type_label: Whether to draw the block type label
-            draw_bbox_number: Whether to draw the block sequence number
-        """
-        layout_image_paths = []
-        
-        # Collect all pages
-        pages = results.get('pages', [])
-        pdf_info = middle_json.get('pdf_info', [])
-        
-        if len(pages) == 0:
-            logger.warning("⚠️  No pages found in results")
-            return [output_dir]  # NOTE(review): returns the output directory, not an image path
-        
-        logger.info(f"🖼️  Generating layout images for {len(pages)} page(s)...")
-        
-        # Process each page
-        for page_idx, (page, page_info) in enumerate(zip(pages, pdf_info)):
-            original_image = page.get('processed_image')
-            if original_image is None:
-                logger.warning(f"⚠️  No processed_image found for page {page_idx}, skipping layout image.")
-                continue
-            layout_image_path = output_dir / f"{doc_name}_{page_idx + 1}_layout.png"
-            
-            # Load the image (accepts a file path, a numpy array, or a PIL image)
-            if isinstance(original_image, str):
-                image = Image.open(original_image).convert('RGB')
-            elif isinstance(original_image, np.ndarray):
-                image = Image.fromarray(original_image).convert('RGB')
-            elif isinstance(original_image, Image.Image):
-                image = original_image.convert('RGB')
-            else:
-                logger.error("Invalid image type")
-                return layout_image_path  # NOTE(review): returns a single Path (not a list) and abandons the remaining pages
-            
-            # Create the drawing object
-            draw = ImageDraw.Draw(image, 'RGBA')
-            
-            # Load a font: try a macOS path, then a Linux path, then PIL's default
-            try:
-                font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14)
-            except:  # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit
-                try:
-                    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
-                except:
-                    font = ImageFont.load_default()
-            
-            # NOTE(review): the original comment said "assume only the first page", yet this runs per page; these three locals are unused below
-            page_size = page_info.get('page_size', [image.width, image.height])
-            image_width, image_height = image.size
-            
-            # Draw all blocks (para_blocks first, then discarded_blocks)
-            block_idx = 1
-            for block in page_info.get('para_blocks', []) + page_info.get('discarded_blocks', []):
-                block_type = block.get('type', '')
-                bbox_original = block.get('bbox', [0, 0, 0, 0])
-                
-                x0 = int(bbox_original[0])
-                y0 = int(bbox_original[1])
-                x1 = int(bbox_original[2])
-                y1 = int(bbox_original[3])
-
-                # Pick the colour; unknown block types fall back to red
-                color = self.color_map.get(block_type, (255, 0, 0))
-                
-                # Draw the semi-transparent fill on a full-size overlay, then composite it
-                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
-                overlay_draw = ImageDraw.Draw(overlay)
-                overlay_draw.rectangle(
-                    [x0, y0, x1, y1],
-                    fill=(*color, 76),  # alpha 76/255, roughly 30%
-                    outline=color,
-                    width=2
-                )
-                image.paste(Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB'))
-                draw = ImageDraw.Draw(image)
-                
-                # Draw the border
-                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
-                
-                # Label the block type
-                if draw_type_label:
-                    label = block_type.replace('_', ' ').title()
-                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
-                    draw.rectangle(bbox_label, fill=color)
-                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
-                
-                # Label the sequence number (the counter only advances when numbers are drawn)
-                if draw_bbox_number:
-                    number_text = str(block_idx)
-                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
-                    draw.rectangle(bbox_number, fill=(255, 0, 0))
-                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
-                    block_idx += 1
-            
-            # Save the image
-            image.save(layout_image_path)
-            logger.info(f"🖼️ Layout image saved: {layout_image_path}")
-
-            layout_image_paths.append(layout_image_path)
-
-        return layout_image_paths
-
-
-if __name__ == "__main__":
-    # Manual smoke test: format a one-page sample (a title and a table) into ./test_output
-    sample_results = {
-        "document_path": "/path/to/sample.pdf",
-        "scene": "financial_report",
-        "pages": [
-            {
-                "page_idx": 0,
-                "image_shape": [1654, 2338, 3],
-                "elements": [
-                    {
-                        "type": "title",
-                        "bbox": [100, 50, 800, 100],
-                        "content": {"text": "财务报告"},
-                        "confidence": 0.98,
-                        "level": 1
-                    },
-                    {
-                        "type": "table",
-                        "bbox": [100, 200, 800, 600],
-                        "content": {
-                            "html": "<table><tr><td>项目</td><td>金额</td></tr></table>",
-                            "markdown": "| 项目 | 金额 |\n|------|------|",
-                            "table_caption": ["表1: 财务数据"],
-                            "table_footnote": []
-                        },
-                        "confidence": 0.95
-                    }
-                ]
-            }
-        ]
-    }
-    
-    formatter = OutputFormatter("./test_output")
-    output_files = formatter.save_results(
-        sample_results,
-        {
-            "save_json": True,
-            "save_content_list": True,
-            "save_markdown": True,
-            "save_table_html": True,
-            "save_layout_image": False
-        }
-    )
-    
-    print("Generated files:", output_files)

파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 36 - 875
zhch/universal_doc_parser/utils/output_formatter_v2.py


+ 370 - 0
zhch/universal_doc_parser/utils/visualization_utils.py

@@ -0,0 +1,370 @@
+"""
+可视化工具模块
+
+提供文档处理结果的可视化功能:
+- Layout 布局可视化
+- OCR 结果可视化
+- 图片元素保存
+"""
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import cv2
+from loguru import logger
+
+
+class VisualizationUtils:
+    """Visualization utility class.
+
+    The class attributes below are the RGB colour constants used when
+    drawing layout boxes, OCR boxes and table cells.
+    """
+    
+    # Element type -> colour mapping (kept consistent with MinerU)
+    COLOR_MAP = {
+        'title': (102, 102, 255),           # blue
+        'text': (153, 0, 76),               # dark red
+        'image': (153, 255, 51),            # green
+        'image_body': (153, 255, 51),
+        'image_caption': (102, 178, 255),
+        'image_footnote': (255, 178, 102),
+        'table': (204, 204, 0),             # yellow
+        'table_body': (204, 204, 0),
+        'table_caption': (255, 255, 102),
+        'table_footnote': (229, 255, 204),
+        'interline_equation': (0, 255, 0),  # bright green
+        'inline_equation': (0, 200, 0),
+        'list': (40, 169, 92),
+        'code': (102, 0, 204),              # purple
+        'header': (128, 128, 128),          # gray
+        'footer': (128, 128, 128),
+        'ref_text': (180, 180, 180),
+        'ocr_text': (153, 0, 76),
+        'error': (255, 0, 0),               # red
+    }
+    
+    # Box colours used by the OCR visualization
+    OCR_BOX_COLOR = (0, 255, 0)  # green
+    CELL_BOX_COLOR = (255, 165, 0)  # orange
+    DISCARD_COLOR = (128, 128, 128)  # gray
+    
+    @staticmethod
+    def save_image_elements(
+        results: Dict[str, Any],
+        images_dir: Path,
+        doc_name: str
+    ) -> List[str]:
+        """
+        保存图片元素
+        
+        Args:
+            results: 处理结果
+            images_dir: 图片输出目录
+            doc_name: 文档名称
+            
+        Returns:
+            保存的图片路径列表
+        """
+        saved_paths = []
+        image_count = 0
+        
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+            
+            for element in page.get('elements', []):
+                if element.get('type') in ['image', 'image_body', 'figure']:
+                    content = element.get('content', {})
+                    image_data = content.get('image_data')
+                    
+                    if image_data is not None:
+                        image_count += 1
+                        image_filename = f"{doc_name}_page_{page_idx + 1}_image_{image_count}.png"
+                        image_path = images_dir / image_filename
+                        
+                        try:
+                            if isinstance(image_data, np.ndarray):
+                                cv2.imwrite(str(image_path), image_data)
+                            else:
+                                Image.fromarray(image_data).save(image_path)
+                            
+                            # 更新路径(只保存文件名)
+                            content['image_path'] = image_filename
+                            content.pop('image_data', None)
+                            
+                            saved_paths.append(str(image_path))
+                            logger.debug(f"🖼️ Image saved: {image_path}")
+                        except Exception as e:
+                            logger.warning(f"Failed to save image: {e}")
+        
+        if image_count > 0:
+            logger.info(f"🖼️ {image_count} images saved to: {images_dir}")
+        
+        return saved_paths
+    
+    @staticmethod
+    def save_layout_images(
+        results: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str,
+        draw_type_label: bool = True,
+        draw_bbox_number: bool = True
+    ) -> List[str]:
+        """
+        保存 Layout 可视化图片
+        
+        Args:
+            results: 处理结果
+            output_dir: 输出目录
+            doc_name: 文档名称
+            draw_type_label: 是否绘制类型标签
+            draw_bbox_number: 是否绘制序号
+            
+        Returns:
+            保存的图片路径列表
+        """
+        layout_paths = []
+        
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+            processed_image = page.get('original_image')
+            if processed_image is None:
+                processed_image = page.get('processed_image')
+            
+            if processed_image is None:
+                logger.warning(f"Page {page_idx}: No image data found for layout visualization")
+                continue
+            
+            if isinstance(processed_image, np.ndarray):
+                image = Image.fromarray(processed_image).convert('RGB')
+            elif isinstance(processed_image, Image.Image):
+                image = processed_image.convert('RGB')
+            else:
+                continue
+            
+            draw = ImageDraw.Draw(image, 'RGBA')
+            font = VisualizationUtils._get_font(14)
+            
+            # 绘制普通元素
+            for idx, element in enumerate(page.get('elements', []), 1):
+                elem_type = element.get('type', '')
+                bbox = element.get('bbox', [0, 0, 0, 0])
+                
+                if len(bbox) < 4:
+                    continue
+                
+                x0, y0, x1, y1 = map(int, bbox[:4])
+                color = VisualizationUtils.COLOR_MAP.get(elem_type, (255, 0, 0))
+                
+                # 半透明填充
+                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
+                overlay_draw = ImageDraw.Draw(overlay)
+                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*color, 50))
+                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
+                draw = ImageDraw.Draw(image)
+                
+                # 边框
+                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
+                
+                # 类型标签
+                if draw_type_label:
+                    label = elem_type.replace('_', ' ').title()
+                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
+                    draw.rectangle(bbox_label, fill=color)
+                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
+                
+                # 序号
+                if draw_bbox_number:
+                    number_text = str(idx)
+                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
+                    draw.rectangle(bbox_number, fill=(255, 0, 0))
+                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
+            
+            # 绘制丢弃元素(灰色样式)
+            for idx, element in enumerate(page.get('discarded_blocks', []), 1):
+                original_category = element.get('original_category', 'unknown')
+                bbox = element.get('bbox', [0, 0, 0, 0])
+                
+                if len(bbox) < 4:
+                    continue
+                
+                x0, y0, x1, y1 = map(int, bbox[:4])
+                
+                # 半透明填充
+                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
+                overlay_draw = ImageDraw.Draw(overlay)
+                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*VisualizationUtils.DISCARD_COLOR, 30))
+                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
+                draw = ImageDraw.Draw(image)
+                
+                # 灰色边框
+                draw.rectangle([x0, y0, x1, y1], outline=VisualizationUtils.DISCARD_COLOR, width=1)
+                
+                # 类型标签
+                if draw_type_label:
+                    label = f"D:{original_category}"
+                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
+                    draw.rectangle(bbox_label, fill=VisualizationUtils.DISCARD_COLOR)
+                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
+            
+            layout_path = output_dir / f"{doc_name}_page_{page_idx + 1}_layout.png"
+            image.save(layout_path)
+            layout_paths.append(str(layout_path))
+            logger.info(f"🖼️ Layout image saved: {layout_path}")
+        
+        return layout_paths
+    
+    @staticmethod
+    def save_ocr_images(
+        results: Dict[str, Any],
+        output_dir: Path,
+        doc_name: str
+    ) -> List[str]:
+        """
+        保存 OCR 可视化图片
+        
+        Args:
+            results: 处理结果
+            output_dir: 输出目录
+            doc_name: 文档名称
+            
+        Returns:
+            保存的图片路径列表
+        """
+        ocr_paths = []
+        
+        for page in results.get('pages', []):
+            page_idx = page.get('page_idx', 0)
+            processed_image = page.get('original_image')
+            if processed_image is None:
+                processed_image = page.get('processed_image')
+            
+            if processed_image is None:
+                logger.warning(f"Page {page_idx}: No image data found for OCR visualization")
+                continue
+            
+            if isinstance(processed_image, np.ndarray):
+                image = Image.fromarray(processed_image).convert('RGB')
+            elif isinstance(processed_image, Image.Image):
+                image = processed_image.convert('RGB')
+            else:
+                continue
+            
+            draw = ImageDraw.Draw(image)
+            font = VisualizationUtils._get_font(10)
+            
+            for element in page.get('elements', []):
+                content = element.get('content', {})
+                
+                # OCR 文本框
+                ocr_details = content.get('ocr_details', [])
+                for ocr_item in ocr_details:
+                    ocr_bbox = ocr_item.get('bbox', [])
+                    if ocr_bbox:
+                        VisualizationUtils._draw_polygon(
+                            draw, ocr_bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
+                        )
+                
+                # 表格单元格
+                cells = content.get('cells', [])
+                for cell in cells:
+                    cell_bbox = cell.get('bbox', [])
+                    if cell_bbox and len(cell_bbox) >= 4:
+                        x0, y0, x1, y1 = map(int, cell_bbox[:4])
+                        draw.rectangle(
+                            [x0, y0, x1, y1], 
+                            outline=VisualizationUtils.CELL_BOX_COLOR, 
+                            width=2
+                        )
+                        
+                        cell_text = cell.get('text', '')[:10]
+                        if cell_text:
+                            draw.text(
+                                (x0 + 2, y0 + 2), 
+                                cell_text, 
+                                fill=VisualizationUtils.CELL_BOX_COLOR, 
+                                font=font
+                            )
+                
+                # OCR 框
+                ocr_boxes = content.get('ocr_boxes', [])
+                for ocr_box in ocr_boxes:
+                    bbox = ocr_box.get('bbox', [])
+                    if bbox:
+                        VisualizationUtils._draw_polygon(
+                            draw, bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
+                        )
+            
+            # 绘制丢弃元素的 OCR 框
+            for element in page.get('discarded_blocks', []):
+                bbox = element.get('bbox', [0, 0, 0, 0])
+                content = element.get('content', {})
+                
+                if len(bbox) >= 4:
+                    x0, y0, x1, y1 = map(int, bbox[:4])
+                    draw.rectangle(
+                        [x0, y0, x1, y1], 
+                        outline=VisualizationUtils.DISCARD_COLOR, 
+                        width=1
+                    )
+                    
+                    ocr_details = content.get('ocr_details', [])
+                    for ocr_item in ocr_details:
+                        ocr_bbox = ocr_item.get('bbox', [])
+                        if ocr_bbox:
+                            VisualizationUtils._draw_polygon(
+                                draw, ocr_bbox, VisualizationUtils.DISCARD_COLOR, width=1
+                            )
+            
+            ocr_path = output_dir / f"{doc_name}_page_{page_idx + 1}_ocr.png"
+            image.save(ocr_path)
+            ocr_paths.append(str(ocr_path))
+            logger.info(f"🖼️ OCR image saved: {ocr_path}")
+        
+        return ocr_paths
+    
+    @staticmethod
+    def _draw_polygon(
+        draw: ImageDraw.Draw,
+        bbox: List,
+        color: Tuple[int, int, int],
+        width: int = 1
+    ):
+        """
+        绘制多边形或矩形
+        
+        Args:
+            draw: ImageDraw 对象
+            bbox: 坐标(4点多边形或矩形)
+            color: 颜色
+            width: 线宽
+        """
+        if isinstance(bbox[0], (list, tuple)):
+            points = [(int(p[0]), int(p[1])) for p in bbox]
+            points.append(points[0])
+            draw.line(points, fill=color, width=width)
+        elif len(bbox) >= 4:
+            x0, y0, x1, y1 = map(int, bbox[:4])
+            draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
+    
+    @staticmethod
+    def _get_font(size: int) -> ImageFont.FreeTypeFont:
+        """
+        获取字体
+        
+        Args:
+            size: 字体大小
+            
+        Returns:
+            字体对象
+        """
+        font_paths = [
+            "/System/Library/Fonts/Helvetica.ttc",
+            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
+        ]
+        
+        for font_path in font_paths:
+            try:
+                return ImageFont.truetype(font_path, size)
+            except:
+                continue
+        
+        return ImageFont.load_default()
+

이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.