| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232 |
- """
- Markdown 生成模块
- 负责将合并后的数据生成 Markdown 文件
- """
- import shutil
- from pathlib import Path
- from typing import List, Dict, Optional
class MarkdownGenerator:
    """Render merged layout items as Markdown annotated with bbox comments.

    Every method is static; the class only serves as a namespace.  Each
    ``_format_*`` helper turns one item dict into a list of Markdown lines,
    and :meth:`generate_enhanced_markdown` joins those lines with ``"\\n"``.
    """

    # Markdown supports heading levels 1 through 6 only.
    _MAX_HEADING_LEVEL = 6

    # Labels written into the HTML comment emitted for page furniture.
    _METADATA_LABELS = {
        'page_number': '页码',
        'header': '页眉',
        'footer': '页脚'
    }

    @staticmethod
    def generate_enhanced_markdown(merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   mineru_file: Optional[str] = None) -> str:
        """
        Build the enhanced Markdown document (with bbox HTML comments).

        Args:
            merged_data: Item dicts; each item's ``'type'`` key selects the
                formatter.  Unrecognised types fall back to plain text.
            output_path: When given, the Markdown is also written to this
                file (UTF-8).  Missing parent directories are created.
            mineru_file: Path of the MinerU source file.  Image paths are
                resolved relative to its directory when copying images next
                to ``output_path``.

        Returns:
            The generated Markdown content.
        """
        # Formatters that only need the item itself.  Images and page
        # metadata take extra arguments and are handled explicitly below.
        formatters = {
            'title': MarkdownGenerator._format_title,
            'text': MarkdownGenerator._format_text,
            'list': MarkdownGenerator._format_list,
            'table': MarkdownGenerator._format_table,
            'equation': MarkdownGenerator._format_equation,
            'inline_equation': MarkdownGenerator._format_inline_equation,
            'reference': MarkdownGenerator._format_reference,
        }

        md_lines: List[str] = []
        for item in merged_data or []:
            item_type = item.get('type', '')

            if item_type == 'image':
                md_lines.extend(MarkdownGenerator._format_image(
                    item, output_path, mineru_file
                ))
            elif item_type in MarkdownGenerator._METADATA_LABELS:
                md_lines.extend(
                    MarkdownGenerator._format_metadata(item, item_type)
                )
            else:
                formatter = formatters.get(
                    item_type, MarkdownGenerator._format_unknown
                )
                md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)

        if output_path:
            out_file = Path(output_path)
            # Writing into a not-yet-existing directory used to fail with
            # FileNotFoundError; create it the same way _copy_image does.
            out_file.parent.mkdir(parents=True, exist_ok=True)
            with open(out_file, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

        return markdown_content

    @staticmethod
    def _as_list(value) -> List:
        """Normalise a caption/footnote/list field to a list.

        ``None`` becomes an empty list and a bare string becomes a
        one-element list, so callers never iterate a string per character.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        return list(value)

    @staticmethod
    def _coerce_level(value, default: int) -> int:
        """Return ``value`` as an int, or ``default`` if it is not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _heading_prefix(level: int) -> str:
        """Return the ``#`` run for ``level``, clamped to the range 1..6."""
        clamped = max(1, min(level, MarkdownGenerator._MAX_HEADING_LEVEL))
        return '#' * clamped

    @staticmethod
    def _bbox_lines(item: Dict) -> List[str]:
        """Return the bbox comment line for ``item``, or nothing if absent."""
        bbox = item.get('bbox', [])
        if bbox:
            return [MarkdownGenerator._add_bbox_comment(bbox)]
        return []

    @staticmethod
    def _add_bbox_comment(bbox: List) -> str:
        """Return the HTML comment that records ``bbox``."""
        return f"<!-- bbox: {bbox} -->"

    @staticmethod
    def _format_title(item: Dict) -> List[str]:
        """Format a title item as a Markdown heading.

        A missing, ``None`` or non-positive ``text_level`` is treated as
        level 1 so the line is always a real heading.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        text = item.get('text') or ''
        level = MarkdownGenerator._coerce_level(item.get('text_level'), 1)
        lines.append(f"{MarkdownGenerator._heading_prefix(level)} {text}\n")

        return lines

    @staticmethod
    def _format_text(item: Dict) -> List[str]:
        """Format a text item; a positive ``text_level`` makes it a heading."""
        lines = MarkdownGenerator._bbox_lines(item)

        text = item.get('text') or ''
        level = MarkdownGenerator._coerce_level(item.get('text_level'), 0)

        if level > 0:
            lines.append(
                f"{MarkdownGenerator._heading_prefix(level)} {text}\n"
            )
        else:
            lines.append(f"{text}\n")

        return lines

    @staticmethod
    def _format_list(item: Dict) -> List[str]:
        """Format a list item: one line per entry, then a blank line.

        Entries are written verbatim; no list marker is added here.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        entries = MarkdownGenerator._as_list(item.get('list_items'))
        lines.extend(f"{entry}\n" for entry in entries)

        lines.append("")
        return lines

    @staticmethod
    def _format_table(item: Dict) -> List[str]:
        """Format a table item: captions, body, then footnotes."""
        lines = MarkdownGenerator._bbox_lines(item)

        # Table captions (bold).
        for caption in MarkdownGenerator._as_list(item.get('table_caption')):
            if caption:
                lines.append(f"**{caption}**\n")

        # Table body: prefer the bbox-annotated variant, but fall back to
        # the plain body when that variant is present yet empty.
        table_body = (item.get('table_body_with_bbox')
                      or item.get('table_body')
                      or '')
        if table_body:
            lines.append(table_body)
            lines.append("")

        # Table footnotes (italic), followed by one separating blank line.
        footnotes = MarkdownGenerator._as_list(item.get('table_footnote'))
        for footnote in footnotes:
            if footnote:
                lines.append(f"*{footnote}*")
        if footnotes:
            lines.append("")

        return lines

    @staticmethod
    def _format_image(item: Dict, output_path: Optional[str],
                      mineru_file: Optional[str]) -> List[str]:
        """Format an image item: captions, image link, then footnotes.

        When ``img_path``, ``mineru_file`` and ``output_path`` are all set,
        the image file is also copied next to the output document.
        """
        lines = MarkdownGenerator._bbox_lines(item)

        img_path = item.get('img_path') or ''

        # Copy the image so the relative link below resolves from the
        # output document's directory.
        if img_path and mineru_file and output_path:
            MarkdownGenerator._copy_image(img_path, mineru_file, output_path)

        # Image captions (bold).
        for caption in MarkdownGenerator._as_list(item.get('image_caption')):
            if caption:
                lines.append(f"**{caption}**\n")

        # Reference the image itself; without a path only the spacing line
        # is kept so the surrounding layout is unchanged.
        if img_path:
            lines.append(f"![Image]({img_path})\n")
        else:
            lines.append("\n")

        # Image footnotes (italic), followed by one separating blank line.
        footnotes = MarkdownGenerator._as_list(item.get('image_footnote'))
        for footnote in footnotes:
            if footnote:
                lines.append(f"*{footnote}*")
        if footnotes:
            lines.append("")

        return lines

    @staticmethod
    def _copy_image(img_path: str, mineru_file: str, output_path: str):
        """Copy an image from the MinerU directory to the output directory.

        ``img_path`` is resolved against the directory of ``mineru_file``
        and reproduced under the directory of ``output_path``.  Nothing is
        copied when the source is not a regular file, or when source and
        destination are the same file (for example when the Markdown is
        written next to the MinerU file), which would otherwise raise
        ``shutil.SameFileError``.
        """
        source = Path(mineru_file).parent / img_path
        if not source.is_file():
            return

        destination = Path(output_path).parent / img_path
        if source.resolve() == destination.resolve():
            return

        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(source, destination)

    @staticmethod
    def _format_equation(item: Dict) -> List[str]:
        """Format a display equation as a ``$$`` block; empty if no LaTeX."""
        latex = item.get('latex') or ''
        if latex:
            return [f"$$\n{latex}\n$$\n"]
        return []

    @staticmethod
    def _format_inline_equation(item: Dict) -> List[str]:
        """Format an inline equation as ``$...$``; empty if no LaTeX."""
        latex = item.get('latex') or ''
        if latex:
            return [f"${latex}$\n"]
        return []

    @staticmethod
    def _format_metadata(item: Dict, item_type: str) -> List[str]:
        """Format page furniture (page number, header, footer) as a comment.

        The text is kept out of the rendered document by wrapping it in an
        HTML comment labelled by ``item_type``.
        """
        text = item.get('text') or ''
        if not text:
            return []

        label = MarkdownGenerator._METADATA_LABELS.get(item_type, item_type)
        # A literal "-->" would close the comment early and leak the rest
        # of the text into the rendered document.
        safe_text = str(text).replace('-->', '--&gt;')
        return [f"<!-- {label}: {safe_text} -->\n"]

    @staticmethod
    def _format_reference(item: Dict) -> List[str]:
        """Format a reference entry as a blockquote; empty if no text."""
        text = item.get('text') or ''
        if text:
            return [f"> {text}\n"]
        # Match the sibling formatters: no empty "> " line for missing text.
        return []

    @staticmethod
    def _format_unknown(item: Dict) -> List[str]:
        """Format an item of unrecognised type as plain text, if it has any."""
        text = item.get('text') or ''
        if text:
            return [f"{text}\n"]
        return []
|