"""Markdown generation module.

Turns the merged data items into a Markdown document.
"""

import shutil
from pathlib import Path
from typing import List, Dict, Optional


class MarkdownGenerator:
    """Markdown generator.

    All methods are static; ``generate_enhanced_markdown`` is the entry point
    and the ``_format_*`` helpers each render one item type into a list of
    Markdown lines.
    """

    @staticmethod
    def generate_enhanced_markdown(merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   mineru_file: Optional[str] = None) -> str:
        """Generate enhanced Markdown (annotated with bbox comments).

        Args:
            merged_data: Merged items; each is a dict whose ``'type'`` key
                selects the formatter. Unrecognized types are rendered as
                plain text from their ``'text'`` key.
            output_path: If given, the Markdown is also written to this file
                (UTF-8). The parent directory is created when missing.
            mineru_file: Path of the MinerU source file; images referenced by
                items are copied relative to its directory.

        Returns:
            The Markdown content.
        """
        # Formatters that only need the item itself.
        formatters = {
            'title': MarkdownGenerator._format_title,
            'text': MarkdownGenerator._format_text,
            'list': MarkdownGenerator._format_list,
            'table': MarkdownGenerator._format_table,
            'equation': MarkdownGenerator._format_equation,
            'inline_equation': MarkdownGenerator._format_inline_equation,
            'reference': MarkdownGenerator._format_reference,
        }
        metadata_types = ('page_number', 'header', 'footer')

        md_lines: List[str] = []
        for item in merged_data:
            item_type = item.get('type', '')
            if item_type == 'image':
                md_lines.extend(MarkdownGenerator._format_image(
                    item, output_path, mineru_file
                ))
            elif item_type in metadata_types:
                md_lines.extend(
                    MarkdownGenerator._format_metadata(item, item_type)
                )
            else:
                # Non-string types cannot be dict keys; treat them as unknown.
                formatter = (
                    formatters.get(item_type)
                    if isinstance(item_type, str) else None
                ) or MarkdownGenerator._format_unknown
                md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)

        if output_path:
            # Image copying already creates directories under the output
            # folder; make sure the folder itself exists before writing.
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

        return markdown_content

    @staticmethod
    def _add_bbox_comment(bbox: List) -> str:
        """Return an HTML comment carrying the bbox, invisible when rendered."""
        return f"<!-- bbox: {bbox} -->"

    @staticmethod
    def _bbox_lines(item: Dict) -> List[str]:
        """Return the leading bbox comment line for *item*, or an empty list."""
        bbox = item.get('bbox', [])
        if bbox:
            return [MarkdownGenerator._add_bbox_comment(bbox)]
        return []

    @staticmethod
    def _format_title(item: Dict) -> List[str]:
        """Format a title item as a Markdown heading.

        The heading level comes from ``'text_level'`` and is clamped to 1..6;
        a missing, ``None`` or zero level is treated as 1.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        text = item.get('text', '')
        text_level = item.get('text_level') or 1
        heading = '#' * max(1, min(text_level, 6))
        lines.append(f"{heading} {text}\n")

        return lines

    @staticmethod
    def _format_text(item: Dict) -> List[str]:
        """Format a text item.

        A positive ``'text_level'`` turns the text into a heading (capped at
        level 6); otherwise it is emitted as a plain paragraph.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        text = item.get('text', '')
        text_level = item.get('text_level') or 0
        if text_level > 0:
            heading = '#' * min(text_level, 6)
            lines.append(f"{heading} {text}\n")
        else:
            lines.append(f"{text}\n")

        return lines

    @staticmethod
    def _format_list(item: Dict) -> List[str]:
        """Format a list item: one entry of ``'list_items'`` per line."""
        lines = MarkdownGenerator._bbox_lines(item)

        lines.extend(
            f"{list_item}\n" for list_item in item.get('list_items', [])
        )
        lines.append("")

        return lines

    @staticmethod
    def _format_table(item: Dict) -> List[str]:
        """Format a table item: captions, body, then footnotes."""
        lines = MarkdownGenerator._bbox_lines(item)

        # Table captions (bold).
        lines.extend(
            f"**{caption}**\n"
            for caption in item.get('table_caption', [])
            if caption
        )

        # Table body: prefer the bbox-annotated variant, but fall back to the
        # plain body when that variant is present yet empty.
        table_body = (
            item.get('table_body_with_bbox') or item.get('table_body', '')
        )
        if table_body:
            lines.append(table_body)
            lines.append("")

        # Table footnotes (italic), followed by a separating blank line.
        table_footnote = item.get('table_footnote', [])
        lines.extend(f"*{footnote}*" for footnote in table_footnote if footnote)
        if table_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _format_image(item: Dict, output_path: Optional[str],
                      mineru_file: Optional[str]) -> List[str]:
        """Format an image item: captions, image link, then footnotes.

        When ``img_path``, ``mineru_file`` and ``output_path`` are all set,
        the image file is also copied next to the output document.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        img_path = item.get('img_path', '')

        # Copy the image so the relative link resolves from the output file.
        if img_path and mineru_file and output_path:
            MarkdownGenerator._copy_image(img_path, mineru_file, output_path)

        # Image captions (bold).
        lines.extend(
            f"**{caption}**\n"
            for caption in item.get('image_caption', [])
            if caption
        )

        lines.append(f"![Image]({img_path})\n")

        # Image footnotes (italic), followed by a separating blank line.
        image_footnote = item.get('image_footnote', [])
        lines.extend(f"*{footnote}*" for footnote in image_footnote if footnote)
        if image_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _copy_image(img_path: str, mineru_file: str, output_path: str):
        """Copy an image into the output directory.

        The source is ``img_path`` resolved against the directory of
        ``mineru_file``; the destination is the same relative path under the
        directory of ``output_path``. A missing source file is skipped.
        """
        mineru_dir = Path(mineru_file).parent
        img_full_path = mineru_dir / img_path

        if img_full_path.exists():
            output_img_path = Path(output_path).parent / img_path
            output_img_path.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy(img_full_path, output_img_path)
            except shutil.SameFileError:
                # Output lives alongside the MinerU file (or img_path is
                # absolute): the image is already where it needs to be.
                pass

    @staticmethod
    def _format_equation(item: Dict) -> List[str]:
        """Format a display equation as a ``$$`` block; empty latex is skipped."""
        latex = item.get('latex', '')
        if latex:
            return [f"$$\n{latex}\n$$\n"]
        return []

    @staticmethod
    def _format_inline_equation(item: Dict) -> List[str]:
        """Format an inline equation as ``$...$``; empty latex is skipped."""
        latex = item.get('latex', '')
        if latex:
            return [f"${latex}$\n"]
        return []

    @staticmethod
    def _format_metadata(item: Dict, item_type: str) -> List[str]:
        """Format metadata (page number, header, footer) as an HTML comment.

        The comment is labelled via ``type_map`` (falling back to the raw
        ``item_type``). Items without text produce no output.
        """
        text = item.get('text', '')
        type_map = {
            'page_number': '页码',
            'header': '页眉',
            'footer': '页脚'
        }

        if text:
            return [f"<!-- {type_map.get(item_type, item_type)}: {text} -->\n"]
        return []

    @staticmethod
    def _format_reference(item: Dict) -> List[str]:
        """Format a reference entry as a Markdown block quote."""
        text = item.get('text', '')
        return [f"> {text}\n"]

    @staticmethod
    def _format_unknown(item: Dict) -> List[str]:
        """Format an item of unknown type as plain text; empty text is skipped."""
        text = item.get('text', '')
        if text:
            return [f"{text}\n"]
        return []