""" Markdown 生成模块 负责将合并后的数据生成 Markdown 文件 """ import shutil from pathlib import Path from typing import List, Dict, Optional class MarkdownGenerator: """Markdown 生成器""" @staticmethod def detect_data_format(merged_data: List[Dict]) -> str: """ 检测数据格式 Returns: 'mineru' 或 'paddleocr_vl' """ if not merged_data: return 'mineru' first_item = merged_data[0] # 检查是否有 PaddleOCR_VL 特有字段 if 'block_label' in first_item and 'block_content' in first_item: return 'paddleocr_vl' # 检查是否有 MinerU 特有字段 if 'type' in first_item and ('table_body' in first_item or 'text' in first_item): return 'mineru' # 默认按 MinerU 格式处理 return 'mineru' @staticmethod def generate_enhanced_markdown(merged_data: List[Dict], output_path: Optional[str] = None, source_file: Optional[str] = None, data_format: Optional[str] = None) -> str: """ 生成增强的 Markdown(包含 bbox 信息的注释) Args: merged_data: 合并后的数据 output_path: 输出路径 source_file: 源文件路径(用于复制图片) data_format: 数据格式 ('mineru' 或 'paddleocr_vl'),None 则自动检测 Returns: Markdown 内容 """ # ✅ 自动检测数据格式 if data_format is None: data_format = MarkdownGenerator.detect_data_format(merged_data) print(f"ℹ️ 检测到数据格式: {data_format}") # ✅ 根据格式选择处理函数 if data_format == 'paddleocr_vl': return MarkdownGenerator._generate_paddleocr_vl_markdown( merged_data, output_path, source_file ) else: return MarkdownGenerator._generate_mineru_markdown( merged_data, output_path, source_file ) @staticmethod def _generate_mineru_markdown(merged_data: List[Dict], output_path: Optional[str] = None, source_file: Optional[str] = None) -> str: """生成 MinerU 格式的 Markdown""" md_lines = [] for item in merged_data: item_type = item.get('type', '') if item_type == 'title': md_lines.extend(MarkdownGenerator._format_mineru_title(item)) elif item_type == 'text': md_lines.extend(MarkdownGenerator._format_mineru_text(item)) elif item_type == 'list': md_lines.extend(MarkdownGenerator._format_mineru_list(item)) elif item_type == 'table': md_lines.extend(MarkdownGenerator._format_mineru_table(item)) elif item_type == 'image': md_lines.extend(MarkdownGenerator._format_mineru_image( item, output_path, source_file )) elif item_type in ['equation', 'interline_equation']: md_lines.extend(MarkdownGenerator._format_equation(item)) elif item_type == 'inline_equation': md_lines.extend(MarkdownGenerator._format_inline_equation(item)) elif item_type == 'header': md_lines.extend(MarkdownGenerator._format_mineru_header(item)) elif item_type == 'footer': md_lines.extend(MarkdownGenerator._format_mineru_footer(item)) elif item_type == 'page_number': md_lines.extend(MarkdownGenerator._format_mineru_page_number(item)) elif item_type == 'ref_text': md_lines.extend(MarkdownGenerator._format_reference(item)) else: md_lines.extend(MarkdownGenerator._format_unknown(item)) markdown_content = '\n'.join(md_lines) if output_path: with open(output_path, 'w', encoding='utf-8') as f: f.write(markdown_content) return markdown_content @staticmethod def _generate_paddleocr_vl_markdown(merged_data: List[Dict], output_path: Optional[str] = None, source_file: Optional[str] = None) -> str: """生成 PaddleOCR_VL 格式的 Markdown""" md_lines = [] for item in merged_data: block_label = item.get('block_label', '') if 'title' in block_label: md_lines.extend(MarkdownGenerator._format_paddleocr_vl_title(item)) elif block_label == 'text': md_lines.extend(MarkdownGenerator._format_paddleocr_vl_text(item)) elif block_label == 'table': md_lines.extend(MarkdownGenerator._format_paddleocr_vl_table(item)) elif block_label == 'image': md_lines.extend(MarkdownGenerator._format_paddleocr_vl_figure(item)) elif block_label == 'equation': md_lines.extend(MarkdownGenerator._format_paddleocr_vl_equation(item)) elif block_label == 'reference': md_lines.extend(MarkdownGenerator._format_paddleocr_vl_reference(item)) else: md_lines.extend(MarkdownGenerator._format_paddleocr_vl_unknown(item)) markdown_content = '\n'.join(md_lines) if output_path: with open(output_path, 'w', encoding='utf-8') as f: f.write(markdown_content) return markdown_content # ================== MinerU 格式化方法 ================== @staticmethod def _format_mineru_title(item: Dict) -> List[str]: """格式化 MinerU 标题""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('text', '') text_level = item.get('text_level', 1) heading = '#' * min(text_level, 6) lines.append(f"{heading} {text}\n") return lines @staticmethod def _format_mineru_text(item: Dict) -> List[str]: """格式化 MinerU 文本""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('text', '') text_level = item.get('text_level', 0) if text_level > 0: heading = '#' * min(text_level, 6) lines.append(f"{heading} {text}\n") else: lines.append(f"{text}\n") return lines @staticmethod def _format_mineru_list(item: Dict) -> List[str]: """格式化 MinerU 列表""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) list_items = item.get('list_items', []) for list_item in list_items: lines.append(f"{list_item}\n") lines.append("") return lines @staticmethod def _format_mineru_table(item: Dict) -> List[str]: """格式化 MinerU 表格""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) # 表格标题 table_caption = item.get('table_caption', []) for caption in table_caption: if caption: lines.append(f"**{caption}**\n") # 表格内容 table_body = item.get('table_body_with_bbox', item.get('table_body', '')) if table_body: lines.append(table_body) lines.append("") # 表格脚注 table_footnote = item.get('table_footnote', []) for footnote in table_footnote: if footnote: lines.append(f"*{footnote}*") if table_footnote: lines.append("") return lines @staticmethod def _format_mineru_image(item: Dict, output_path: Optional[str], source_file: Optional[str]) -> List[str]: """格式化 MinerU 图片""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) img_path = item.get('img_path', '') # 复制图片 if img_path and source_file and output_path: MarkdownGenerator._copy_image(img_path, source_file, output_path) # 图片标题 image_caption = item.get('image_caption', []) for caption in image_caption: if caption: lines.append(f"**{caption}**\n") lines.append(f"![Image]({img_path})\n") # 图片脚注 image_footnote = item.get('image_footnote', []) for footnote in image_footnote: if footnote: lines.append(f"*{footnote}*") if image_footnote: lines.append("") return lines @staticmethod def _format_mineru_header(item: Dict) -> List[str]: """格式化MinerU header""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('text', '') lines.append(f"\n") return lines @staticmethod def _format_mineru_footer(item: Dict) -> List[str]: """格式化MinerU footer""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('text', '') lines.append(f"\n") return lines @staticmethod def _format_mineru_page_number(item: Dict) -> List[str]: """格式化MinerU page_number""" lines = [] bbox = item.get('bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('text', '') lines.append(f"\n") return lines # ================== PaddleOCR_VL 格式化方法 ================== @staticmethod def _format_paddleocr_vl_title(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 标题""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('block_content', '') block_label = item.get('block_label', '') # 根据 block_label 确定标题级别 level_map = { 'paragraph_title': 1, 'figure_title': 2, 'title': 1 } text_level = level_map.get(block_label, 1) heading = '#' * min(text_level, 6) lines.append(f"{heading} {text}\n") return lines @staticmethod def _format_paddleocr_vl_text(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 文本""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('block_content', '') lines.append(f"{text}\n") return lines @staticmethod def _format_paddleocr_vl_table(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 表格""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) # 表格内容 table_content = item.get('block_content_with_bbox', item.get('block_content', '')) if table_content: lines.append(table_content) lines.append("") return lines @staticmethod def _format_paddleocr_vl_figure(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 图片""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) # PaddleOCR_VL 图片信息在 block_content 中 content = item.get('block_content', '') lines.append(f"![Figure]({content})\n") return lines @staticmethod def _format_paddleocr_vl_equation(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 公式""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) latex = item.get('block_content', '') if latex: lines.append(f"$$\n{latex}\n$$\n") return lines @staticmethod def _format_paddleocr_vl_reference(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 参考文献""" text = item.get('block_content', '') return [f"> {text}\n"] @staticmethod def _format_paddleocr_vl_unknown(item: Dict) -> List[str]: """格式化 PaddleOCR_VL 未知类型""" lines = [] bbox = item.get('block_bbox', []) if bbox: lines.append(MarkdownGenerator._add_bbox_comment(bbox)) text = item.get('block_content', '') if text: lines.append(f"{text}\n") return lines # ================== 通用方法 ================== @staticmethod def _add_bbox_comment(bbox: List) -> str: """添加 bbox 注释""" return f"" @staticmethod def _format_equation(item: Dict) -> List[str]: """格式化公式(通用)""" latex = item.get('latex', '') if latex: return [f"$$\n{latex}\n$$\n"] return [] @staticmethod def _format_inline_equation(item: Dict) -> List[str]: """格式化行内公式(通用)""" latex = item.get('latex', '') if latex: return [f"${latex}$\n"] return [] @staticmethod def _format_metadata(item: Dict, item_type: str) -> List[str]: """格式化元数据(通用)""" text = item.get('text', '') type_map = { 'page_number': '页码', 'header': '页眉', 'footer': '页脚' } if text: return [f"\n"] return [] @staticmethod def _format_reference(item: Dict) -> List[str]: """格式化参考文献(MinerU)""" text = item.get('text', '') return [f"> {text}\n"] @staticmethod def _format_unknown(item: Dict) -> List[str]: """格式化未知类型(MinerU)""" text = item.get('text', '') if text: return [f"{text}\n"] return [] @staticmethod def _copy_image(img_path: str, source_file: str, output_path: str): """复制图片到输出目录""" source_dir = Path(source_file).parent img_full_path = source_dir / img_path if img_full_path.exists(): output_img_path = Path(output_path).parent / img_path output_img_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(img_full_path, output_img_path)