# Source repository: zhengchun/ocr_verify
"""
Markdown generation module.

Renders merged OCR data into a Markdown file.
"""
import shutil
from pathlib import Path
from typing import List, Dict, Optional


class MarkdownGenerator:
    """Render merged OCR layout blocks as bbox-annotated Markdown.

    Two input schemas are handled, told apart by the keys each block carries:

    * ``mineru``: blocks keyed by ``type`` (``text``, ``table_body``,
      ``img_path`` ...) with their box under ``bbox``.
    * ``paddleocr_vl``: blocks keyed by ``block_label`` with the payload
      under ``block_content`` and the box under ``block_bbox``.

    Every method is static; the class only serves as a namespace.
    """

    # Labels written inside the HTML comments emitted for page furniture.
    _METADATA_LABELS = {
        'page_number': '页码',
        'header': '页眉',
        'footer': '页脚',
    }

    @staticmethod
    def detect_data_format(merged_data: List[Dict]) -> str:
        """Guess the schema of ``merged_data`` from its first block.

        Args:
            merged_data: Merged layout blocks.

        Returns:
            ``'paddleocr_vl'`` when the first block is a dict carrying both
            ``block_label`` and ``block_content``; ``'mineru'`` otherwise
            (including for empty input).
        """
        if not merged_data:
            return 'mineru'

        first_item = merged_data[0]
        if (isinstance(first_item, dict)
                and 'block_label' in first_item
                and 'block_content' in first_item):
            return 'paddleocr_vl'

        # Anything else is treated as MinerU, the default schema.
        return 'mineru'

    @staticmethod
    def generate_enhanced_markdown(merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   source_file: Optional[str] = None,
                                   data_format: Optional[str] = None) -> str:
        """Generate Markdown annotated with ``<!-- bbox: ... -->`` comments.

        Args:
            merged_data: Merged layout blocks.
            output_path: When given, the Markdown is also written to this file.
            source_file: Path of the source file; images referenced by MinerU
                blocks are resolved relative to its directory and copied next
                to ``output_path``.
            data_format: ``'mineru'`` or ``'paddleocr_vl'``; ``None`` triggers
                auto-detection via :meth:`detect_data_format`.

        Returns:
            The generated Markdown text.
        """
        if data_format is None:
            data_format = MarkdownGenerator.detect_data_format(merged_data)

        print(f"ℹ️  检测到数据格式: {data_format}")

        if data_format == 'paddleocr_vl':
            return MarkdownGenerator._generate_paddleocr_vl_markdown(
                merged_data, output_path, source_file
            )
        # Any other value falls back to the MinerU renderer.
        return MarkdownGenerator._generate_mineru_markdown(
            merged_data, output_path, source_file
        )

    @staticmethod
    def _generate_mineru_markdown(merged_data: List[Dict],
                                  output_path: Optional[str] = None,
                                  source_file: Optional[str] = None) -> str:
        """Render MinerU-style blocks and optionally write them to disk.

        Blocks are dispatched on their ``type``; unrecognised types are
        rendered as plain text via :meth:`_format_unknown`.
        """
        formatters = {
            'title': MarkdownGenerator._format_mineru_title,
            'text': MarkdownGenerator._format_mineru_text,
            'list': MarkdownGenerator._format_mineru_list,
            'table': MarkdownGenerator._format_mineru_table,
            'equation': MarkdownGenerator._format_equation,
            'interline_equation': MarkdownGenerator._format_equation,
            'inline_equation': MarkdownGenerator._format_inline_equation,
            'header': MarkdownGenerator._format_mineru_header,
            'footer': MarkdownGenerator._format_mineru_footer,
            'page_number': MarkdownGenerator._format_mineru_page_number,
            'ref_text': MarkdownGenerator._format_reference,
        }

        md_lines: List[str] = []
        for item in merged_data:
            item_type = item.get('type', '')
            if item_type == 'image':
                # Images need the paths as well, so they bypass the table.
                md_lines.extend(MarkdownGenerator._format_mineru_image(
                    item, output_path, source_file
                ))
                continue
            if not isinstance(item_type, str):
                # Non-string (possibly unhashable) types cannot be looked up.
                item_type = ''
            formatter = formatters.get(item_type,
                                       MarkdownGenerator._format_unknown)
            md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)
        MarkdownGenerator._write_output(markdown_content, output_path)
        return markdown_content

    @staticmethod
    def _generate_paddleocr_vl_markdown(merged_data: List[Dict],
                                        output_path: Optional[str] = None,
                                        source_file: Optional[str] = None) -> str:
        """Render PaddleOCR_VL-style blocks and optionally write them to disk.

        Any ``block_label`` containing ``'title'`` is rendered as a heading;
        other labels are matched exactly, with a plain-text fallback.
        ``source_file`` is accepted for signature parity and is not used.
        """
        formatters = {
            'text': MarkdownGenerator._format_paddleocr_vl_text,
            'table': MarkdownGenerator._format_paddleocr_vl_table,
            'image': MarkdownGenerator._format_paddleocr_vl_figure,
            'equation': MarkdownGenerator._format_paddleocr_vl_equation,
            'reference': MarkdownGenerator._format_paddleocr_vl_reference,
        }

        md_lines: List[str] = []
        for item in merged_data:
            block_label = item.get('block_label') or ''
            if not isinstance(block_label, str):
                block_label = ''

            if 'title' in block_label:
                formatter = MarkdownGenerator._format_paddleocr_vl_title
            else:
                formatter = formatters.get(
                    block_label, MarkdownGenerator._format_paddleocr_vl_unknown
                )
            md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)
        MarkdownGenerator._write_output(markdown_content, output_path)
        return markdown_content

    # ================== MinerU formatters ==================

    @staticmethod
    def _format_mineru_title(item: Dict) -> List[str]:
        """Render a MinerU title as a heading (level clamped to 1..6)."""
        lines = MarkdownGenerator._bbox_prefix(item)
        level = MarkdownGenerator._coerce_level(item.get('text_level', 1), 1)
        lines.append(
            MarkdownGenerator._heading_line(level, item.get('text', ''))
        )
        return lines

    @staticmethod
    def _format_mineru_text(item: Dict) -> List[str]:
        """Render MinerU text; a positive ``text_level`` makes it a heading."""
        lines = MarkdownGenerator._bbox_prefix(item)
        text = item.get('text', '')
        level = MarkdownGenerator._coerce_level(item.get('text_level', 0), 0)

        if level > 0:
            lines.append(MarkdownGenerator._heading_line(level, text))
        else:
            lines.append(f"{text}\n")
        return lines

    @staticmethod
    def _format_mineru_list(item: Dict) -> List[str]:
        """Render each entry of ``list_items`` as its own line."""
        lines = MarkdownGenerator._bbox_prefix(item)
        lines.extend(
            f"{list_item}\n" for list_item in (item.get('list_items') or [])
        )
        lines.append("")
        return lines

    @staticmethod
    def _format_mineru_table(item: Dict) -> List[str]:
        """Render a MinerU table with its captions and footnotes."""
        lines = MarkdownGenerator._bbox_prefix(item)

        # Captions, in bold.
        lines.extend(
            f"**{caption}**\n"
            for caption in (item.get('table_caption') or []) if caption
        )

        # Prefer the bbox-annotated body; fall back to the plain body when
        # the annotated one is missing or empty.
        table_body = (item.get('table_body_with_bbox')
                      or item.get('table_body', ''))
        if table_body:
            lines.append(table_body)
            lines.append("")

        # Footnotes, in italics, followed by one separating blank entry.
        table_footnote = item.get('table_footnote') or []
        lines.extend(f"*{footnote}*" for footnote in table_footnote if footnote)
        if table_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _format_mineru_image(item: Dict, output_path: Optional[str],
                            source_file: Optional[str]) -> List[str]:
        """Render a MinerU image and copy the image file next to the output.

        The copy is attempted only when ``img_path``, ``source_file`` and
        ``output_path`` are all set.
        """
        lines = MarkdownGenerator._bbox_prefix(item)
        img_path = item.get('img_path', '')

        if img_path and source_file and output_path:
            MarkdownGenerator._copy_image(img_path, source_file, output_path)

        # Captions, in bold, above the image.
        lines.extend(
            f"**{caption}**\n"
            for caption in (item.get('image_caption') or []) if caption
        )

        lines.append(f"![Image]({img_path})\n")

        # Footnotes, in italics, followed by one separating blank entry.
        image_footnote = item.get('image_footnote') or []
        lines.extend(f"*{footnote}*" for footnote in image_footnote if footnote)
        if image_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _format_mineru_header(item: Dict) -> List[str]:
        """Render a MinerU page header as an HTML comment."""
        return MarkdownGenerator._format_mineru_page_meta(item, 'header')

    @staticmethod
    def _format_mineru_footer(item: Dict) -> List[str]:
        """Render a MinerU page footer as an HTML comment."""
        return MarkdownGenerator._format_mineru_page_meta(item, 'footer')

    @staticmethod
    def _format_mineru_page_number(item: Dict) -> List[str]:
        """Render a MinerU page number as an HTML comment."""
        return MarkdownGenerator._format_mineru_page_meta(item, 'page_number')

    @staticmethod
    def _format_mineru_page_meta(item: Dict, kind: str) -> List[str]:
        """Shared renderer for header / footer / page-number blocks.

        The comment is emitted even when ``text`` is empty.
        """
        lines = MarkdownGenerator._bbox_prefix(item)
        label = MarkdownGenerator._METADATA_LABELS[kind]
        text = item.get('text', '')
        lines.append(f"<!-- {label}: {text} -->\n")
        return lines

    # ================== PaddleOCR_VL formatters ==================

    @staticmethod
    def _format_paddleocr_vl_title(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL title block as a heading.

        ``figure_title`` maps to level 2; every other label maps to level 1.
        """
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')

        level_map = {
            'paragraph_title': 1,
            'figure_title': 2,
            'title': 1,
        }
        level = level_map.get(item.get('block_label', ''), 1)
        lines.append(
            MarkdownGenerator._heading_line(level, item.get('block_content', ''))
        )
        return lines

    @staticmethod
    def _format_paddleocr_vl_text(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL text block as a paragraph."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')
        text = item.get('block_content', '')
        lines.append(f"{text}\n")
        return lines

    @staticmethod
    def _format_paddleocr_vl_table(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL table, preferring the bbox-annotated body."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')

        # Fall back to the plain content when the annotated one is empty.
        table_content = (item.get('block_content_with_bbox')
                         or item.get('block_content', ''))
        if table_content:
            lines.append(table_content)
            lines.append("")
        return lines

    @staticmethod
    def _format_paddleocr_vl_figure(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL image; ``block_content`` is the link target."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')
        content = item.get('block_content', '')
        lines.append(f"![Figure]({content})\n")
        return lines

    @staticmethod
    def _format_paddleocr_vl_equation(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL equation as a ``$$`` display block."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')
        latex = item.get('block_content', '')
        if latex:
            lines.append(f"$$\n{latex}\n$$\n")
        return lines

    @staticmethod
    def _format_paddleocr_vl_reference(item: Dict) -> List[str]:
        """Render a PaddleOCR_VL reference as a block quote."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')
        text = item.get('block_content', '')
        lines.append(f"> {text}\n")
        return lines

    @staticmethod
    def _format_paddleocr_vl_unknown(item: Dict) -> List[str]:
        """Render an unrecognised PaddleOCR_VL block as plain text."""
        lines = MarkdownGenerator._bbox_prefix(item, 'block_bbox')
        text = item.get('block_content', '')
        if text:
            lines.append(f"{text}\n")
        return lines

    # ================== Shared helpers ==================

    @staticmethod
    def _add_bbox_comment(bbox: List) -> str:
        """Return the HTML comment that records ``bbox``."""
        return f"<!-- bbox: {bbox} -->"

    @staticmethod
    def _bbox_prefix(item: Dict, key: str = 'bbox') -> List[str]:
        """Return ``[bbox comment]`` for ``item[key]``, or ``[]`` if unset."""
        bbox = item.get(key, [])
        return [MarkdownGenerator._add_bbox_comment(bbox)] if bbox else []

    @staticmethod
    def _coerce_level(value, default: int) -> int:
        """Convert a heading level to ``int``; use ``default`` if impossible."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _heading_line(level: int, text) -> str:
        """Build a heading line with ``level`` clamped to 1..6."""
        hashes = '#' * max(1, min(level, 6))
        return f"{hashes} {text}\n"

    @staticmethod
    def _write_output(markdown_content: str,
                      output_path: Optional[str]) -> None:
        """Write the Markdown to ``output_path`` (UTF-8) when a path is given.

        Missing parent directories are created first.
        """
        if not output_path:
            return
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

    @staticmethod
    def _format_equation(item: Dict) -> List[str]:
        """Render a display equation from ``item['latex']``.

        Returns an empty list when there is no LaTeX.
        """
        # NOTE(review): only the 'latex' key is read; confirm the upstream
        # merge step populates it for MinerU equation blocks.
        latex = item.get('latex', '')
        if not latex:
            return []
        return [*MarkdownGenerator._bbox_prefix(item), f"$$\n{latex}\n$$\n"]

    @staticmethod
    def _format_inline_equation(item: Dict) -> List[str]:
        """Render an inline equation from ``item['latex']``.

        Returns an empty list when there is no LaTeX.
        """
        latex = item.get('latex', '')
        if not latex:
            return []
        return [*MarkdownGenerator._bbox_prefix(item), f"${latex}$\n"]

    @staticmethod
    def _format_metadata(item: Dict, item_type: str) -> List[str]:
        """Render page metadata as an HTML comment, or ``[]`` without text.

        ``item_type`` is mapped to a label when it is one of
        ``page_number`` / ``header`` / ``footer``; otherwise it is used as is.
        """
        text = item.get('text', '')
        if not text:
            return []
        label = MarkdownGenerator._METADATA_LABELS.get(item_type, item_type)
        return [f"<!-- {label}: {text} -->\n"]

    @staticmethod
    def _format_reference(item: Dict) -> List[str]:
        """Render a MinerU reference entry as a block quote."""
        lines = MarkdownGenerator._bbox_prefix(item)
        text = item.get('text', '')
        lines.append(f"> {text}\n")
        return lines

    @staticmethod
    def _format_unknown(item: Dict) -> List[str]:
        """Render an unrecognised MinerU block as plain text.

        Returns an empty list when the block has no text.
        """
        text = item.get('text', '')
        if not text:
            return []
        return [*MarkdownGenerator._bbox_prefix(item), f"{text}\n"]

    @staticmethod
    def _copy_image(img_path: str, source_file: str, output_path: str):
        """Copy an image from beside ``source_file`` to beside ``output_path``.

        ``img_path`` is resolved against the directory of ``source_file`` and
        recreated under the directory of ``output_path``. Nothing happens when
        the source is not a regular file, or when source and destination are
        the same file (``shutil.copy`` would raise ``SameFileError``).
        """
        source_image = Path(source_file).parent / img_path
        if not source_image.is_file():
            return

        target_image = Path(output_path).parent / img_path
        if source_image.resolve() == target_image.resolve():
            # Output lives next to the source: the image is already in place.
            return

        target_image.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(source_image, target_image)