| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460 |
- """
- Markdown 生成模块
- 负责将合并后的数据生成 Markdown 文件
- """
- import shutil
- from pathlib import Path
- from typing import List, Dict, Optional
class MarkdownGenerator:
    """Render merged layout/OCR blocks as Markdown annotated with bbox comments.

    Two input layouts are handled:

    * ``'mineru'``: items keyed by ``type`` / ``text`` / ``bbox`` ...
    * ``'paddleocr_vl'``: items keyed by ``block_label`` /
      ``block_content`` / ``block_bbox``.

    Every method is a ``@staticmethod``; the class is only a namespace.
    Formatter helpers return a list of Markdown fragments which the
    generators join with ``'\\n'``.
    """

    @staticmethod
    def detect_data_format(merged_data: List[Dict]) -> str:
        """Guess the data layout from the first item of ``merged_data``.

        Args:
            merged_data: Merged block dictionaries.

        Returns:
            ``'paddleocr_vl'`` when the first item carries both
            ``block_label`` and ``block_content``; ``'mineru'`` otherwise
            (including for empty input).
        """
        if not merged_data:
            return 'mineru'

        first_item = merged_data[0]

        # Only the PaddleOCR_VL-specific keys need checking: every other
        # shape is processed as MinerU, which is also the default.
        if 'block_label' in first_item and 'block_content' in first_item:
            return 'paddleocr_vl'

        return 'mineru'

    @staticmethod
    def generate_enhanced_markdown(merged_data: List[Dict],
                                   output_path: Optional[str] = None,
                                   source_file: Optional[str] = None,
                                   data_format: Optional[str] = None) -> str:
        """Generate Markdown whose blocks are preceded by bbox comments.

        Args:
            merged_data: Merged block dictionaries.
            output_path: When given, the Markdown is also written to this file.
            source_file: Path of the source file; its directory is used to
                resolve images that have to be copied next to the output.
            data_format: ``'mineru'`` or ``'paddleocr_vl'``; ``None`` means
                auto-detect via :meth:`detect_data_format`.

        Returns:
            The Markdown content.
        """
        # Auto-detect the layout when the caller did not specify one.
        if data_format is None:
            data_format = MarkdownGenerator.detect_data_format(merged_data)

        print(f"ℹ️ 检测到数据格式: {data_format}")

        # Any value other than 'paddleocr_vl' is processed as MinerU.
        if data_format == 'paddleocr_vl':
            return MarkdownGenerator._generate_paddleocr_vl_markdown(
                merged_data, output_path, source_file
            )
        return MarkdownGenerator._generate_mineru_markdown(
            merged_data, output_path, source_file
        )

    @staticmethod
    def _generate_mineru_markdown(merged_data: List[Dict],
                                  output_path: Optional[str] = None,
                                  source_file: Optional[str] = None) -> str:
        """Generate Markdown from MinerU-style items.

        Items are dispatched on their ``type`` value; unrecognised types go
        through :meth:`_format_unknown`. The result is written to
        ``output_path`` when one is given.
        """
        # Formatters that only need the item itself. 'image' is handled
        # separately because it also needs the paths for copying the file.
        formatters = {
            'title': MarkdownGenerator._format_mineru_title,
            'text': MarkdownGenerator._format_mineru_text,
            'list': MarkdownGenerator._format_mineru_list,
            'table': MarkdownGenerator._format_mineru_table,
            'equation': MarkdownGenerator._format_equation,
            'interline_equation': MarkdownGenerator._format_equation,
            'inline_equation': MarkdownGenerator._format_inline_equation,
            'header': MarkdownGenerator._format_mineru_header,
            'footer': MarkdownGenerator._format_mineru_footer,
            'page_number': MarkdownGenerator._format_mineru_page_number,
            'ref_text': MarkdownGenerator._format_reference,
        }

        md_lines: List[str] = []
        for item in merged_data:
            item_type = item.get('type', '')
            if item_type == 'image':
                md_lines.extend(MarkdownGenerator._format_mineru_image(
                    item, output_path, source_file
                ))
                continue
            formatter = formatters.get(item_type,
                                       MarkdownGenerator._format_unknown)
            md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)
        MarkdownGenerator._write_output(markdown_content, output_path)
        return markdown_content

    @staticmethod
    def _generate_paddleocr_vl_markdown(merged_data: List[Dict],
                                        output_path: Optional[str] = None,
                                        source_file: Optional[str] = None) -> str:
        """Generate Markdown from PaddleOCR_VL-style items.

        Items are dispatched on ``block_label``; any label containing
        ``'title'`` is rendered as a heading. ``source_file`` is accepted for
        signature parity with the MinerU generator and is not used here.
        The result is written to ``output_path`` when one is given.
        """
        formatters = {
            'text': MarkdownGenerator._format_paddleocr_vl_text,
            'table': MarkdownGenerator._format_paddleocr_vl_table,
            'image': MarkdownGenerator._format_paddleocr_vl_figure,
            'equation': MarkdownGenerator._format_paddleocr_vl_equation,
            'reference': MarkdownGenerator._format_paddleocr_vl_reference,
        }

        md_lines: List[str] = []
        for item in merged_data:
            # A None label would make the substring test below raise.
            block_label = item.get('block_label') or ''

            # The substring test must win over the exact-match table so that
            # e.g. 'paragraph_title' and 'figure_title' become headings.
            if 'title' in block_label:
                formatter = MarkdownGenerator._format_paddleocr_vl_title
            else:
                formatter = formatters.get(
                    block_label, MarkdownGenerator._format_paddleocr_vl_unknown
                )
            md_lines.extend(formatter(item))

        markdown_content = '\n'.join(md_lines)
        MarkdownGenerator._write_output(markdown_content, output_path)
        return markdown_content

    @staticmethod
    def _write_output(markdown_content: str, output_path: Optional[str]) -> None:
        """Write ``markdown_content`` to ``output_path`` as UTF-8, if a path is given.

        The parent directory is created when missing, mirroring what
        :meth:`_copy_image` already does for copied images.
        """
        if not output_path:
            return
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

    @staticmethod
    def _normalize_level(value, default: int) -> int:
        """Coerce a heading level to ``int``; return ``default`` if impossible."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _heading_prefix(level: int) -> str:
        """Return the ``#`` run for ``level``, clamped to the range 1..6."""
        return '#' * max(1, min(level, 6))

    # ================== MinerU formatters ==================

    @staticmethod
    def _format_mineru_title(item: Dict) -> List[str]:
        """Format a MinerU title as a heading (``text_level`` defaults to 1)."""
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('text', '')
        text_level = MarkdownGenerator._normalize_level(
            item.get('text_level', 1), default=1
        )
        heading = MarkdownGenerator._heading_prefix(text_level)
        lines.append(f"{heading} {text}\n")

        return lines

    @staticmethod
    def _format_mineru_text(item: Dict) -> List[str]:
        """Format a MinerU text block.

        A positive ``text_level`` turns the text into a heading of that
        level; otherwise it is emitted as a plain paragraph.
        """
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('text', '')
        text_level = MarkdownGenerator._normalize_level(
            item.get('text_level', 0), default=0
        )

        if text_level > 0:
            heading = MarkdownGenerator._heading_prefix(text_level)
            lines.append(f"{heading} {text}\n")
        else:
            lines.append(f"{text}\n")

        return lines

    @staticmethod
    def _format_mineru_list(item: Dict) -> List[str]:
        """Format a MinerU list: one fragment per entry of ``list_items``."""
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        lines.extend(f"{list_item}\n" for list_item in item.get('list_items') or [])

        lines.append("")
        return lines

    @staticmethod
    def _format_mineru_table(item: Dict) -> List[str]:
        """Format a MinerU table: captions, body, then footnotes.

        ``table_body_with_bbox`` is preferred over ``table_body`` when the
        item has it.
        """
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        # Captions (bold); empty entries are skipped.
        for caption in item.get('table_caption') or []:
            if caption:
                lines.append(f"**{caption}**\n")

        # Table body.
        table_body = item.get('table_body_with_bbox', item.get('table_body', ''))
        if table_body:
            lines.append(table_body)
            lines.append("")

        # Footnotes (italic), followed by a single separator line.
        table_footnote = item.get('table_footnote') or []
        for footnote in table_footnote:
            if footnote:
                lines.append(f"*{footnote}*")
        if table_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _format_mineru_image(item: Dict, output_path: Optional[str],
                             source_file: Optional[str]) -> List[str]:
        """Format a MinerU image: captions, image reference, then footnotes.

        When ``img_path``, ``source_file`` and ``output_path`` are all set,
        the image file is copied next to the output so the relative
        reference emitted here resolves.
        """
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        img_path = item.get('img_path', '')

        # Copy the image alongside the Markdown output.
        if img_path and source_file and output_path:
            MarkdownGenerator._copy_image(img_path, source_file, output_path)

        # Captions (bold); empty entries are skipped.
        for caption in item.get('image_caption') or []:
            if caption:
                lines.append(f"**{caption}**\n")

        # Reference the image itself; without a path only the blank
        # separator line is kept.
        lines.append(f"![Image]({img_path})\n" if img_path else "\n")

        # Footnotes (italic), followed by a single separator line.
        image_footnote = item.get('image_footnote') or []
        for footnote in image_footnote:
            if footnote:
                lines.append(f"*{footnote}*")
        if image_footnote:
            lines.append("")

        return lines

    @staticmethod
    def _format_mineru_comment(item: Dict, label: str) -> List[str]:
        """Emit ``item['text']`` as an HTML comment prefixed with ``label``."""
        lines = []
        bbox = item.get('bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('text', '')
        lines.append(f"<!-- {label}: {text} -->\n")
        return lines

    @staticmethod
    def _format_mineru_header(item: Dict) -> List[str]:
        """Format a MinerU page header as an HTML comment."""
        return MarkdownGenerator._format_mineru_comment(item, '页眉')

    @staticmethod
    def _format_mineru_footer(item: Dict) -> List[str]:
        """Format a MinerU page footer as an HTML comment."""
        return MarkdownGenerator._format_mineru_comment(item, '页脚')

    @staticmethod
    def _format_mineru_page_number(item: Dict) -> List[str]:
        """Format a MinerU page number as an HTML comment."""
        return MarkdownGenerator._format_mineru_comment(item, '页码')

    # ================== PaddleOCR_VL formatters ==================

    @staticmethod
    def _format_paddleocr_vl_title(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL title as a heading.

        The heading level is looked up from ``block_label``; labels not in
        the map are rendered at level 1.
        """
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('block_content', '')
        block_label = item.get('block_label', '')

        # Heading level per block label.
        level_map = {
            'paragraph_title': 1,
            'figure_title': 2,
            'title': 1
        }
        text_level = level_map.get(block_label, 1)

        heading = MarkdownGenerator._heading_prefix(text_level)
        lines.append(f"{heading} {text}\n")

        return lines

    @staticmethod
    def _format_paddleocr_vl_text(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL text block as a plain paragraph."""
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('block_content', '')
        lines.append(f"{text}\n")

        return lines

    @staticmethod
    def _format_paddleocr_vl_table(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL table.

        ``block_content_with_bbox`` is preferred over ``block_content``
        when the item has it.
        """
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        table_content = item.get('block_content_with_bbox',
                                 item.get('block_content', ''))
        if table_content:
            lines.append(table_content)
            lines.append("")

        return lines

    @staticmethod
    def _format_paddleocr_vl_figure(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL image block.

        The image information is read from ``block_content`` and emitted as
        a Markdown image reference; with empty content only the blank
        separator line is kept.
        """
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        content = item.get('block_content', '')
        lines.append(f"![Image]({content})\n" if content else "\n")

        return lines

    @staticmethod
    def _format_paddleocr_vl_equation(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL equation as a ``$$`` display-math block."""
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        latex = item.get('block_content', '')
        if latex:
            lines.append(f"$$\n{latex}\n$$\n")

        return lines

    @staticmethod
    def _format_paddleocr_vl_reference(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL reference entry as a block quote."""
        text = item.get('block_content', '')
        return [f"> {text}\n"]

    @staticmethod
    def _format_paddleocr_vl_unknown(item: Dict) -> List[str]:
        """Format a PaddleOCR_VL block of unrecognised label as plain text."""
        lines = []
        bbox = item.get('block_bbox', [])
        if bbox:
            lines.append(MarkdownGenerator._add_bbox_comment(bbox))

        text = item.get('block_content', '')
        if text:
            lines.append(f"{text}\n")

        return lines

    # ================== Shared helpers ==================

    @staticmethod
    def _add_bbox_comment(bbox: List) -> str:
        """Return the HTML comment that records ``bbox``."""
        return f"<!-- bbox: {bbox} -->"

    @staticmethod
    def _format_equation(item: Dict) -> List[str]:
        """Format ``item['latex']`` as a ``$$`` display-math block (empty -> nothing)."""
        latex = item.get('latex', '')
        if latex:
            return [f"$$\n{latex}\n$$\n"]
        return []

    @staticmethod
    def _format_inline_equation(item: Dict) -> List[str]:
        """Format ``item['latex']`` as ``$...$`` inline math (empty -> nothing)."""
        latex = item.get('latex', '')
        if latex:
            return [f"${latex}$\n"]
        return []

    @staticmethod
    def _format_metadata(item: Dict, item_type: str) -> List[str]:
        """Format page metadata as an HTML comment (empty text -> nothing).

        ``item_type`` is mapped to a label for ``page_number``, ``header``
        and ``footer``; any other value is used as the label verbatim.
        """
        text = item.get('text', '')
        type_map = {
            'page_number': '页码',
            'header': '页眉',
            'footer': '页脚'
        }
        if text:
            return [f"<!-- {type_map.get(item_type, item_type)}: {text} -->\n"]
        return []

    @staticmethod
    def _format_reference(item: Dict) -> List[str]:
        """Format a MinerU reference entry as a block quote."""
        text = item.get('text', '')
        return [f"> {text}\n"]

    @staticmethod
    def _format_unknown(item: Dict) -> List[str]:
        """Format a MinerU item of unrecognised type as plain text."""
        text = item.get('text', '')
        if text:
            return [f"{text}\n"]
        return []

    @staticmethod
    def _copy_image(img_path: str, source_file: str, output_path: str):
        """Copy an image next to the Markdown output, keeping its relative path.

        Args:
            img_path: Image path relative to the directory of ``source_file``.
            source_file: Source file whose directory holds the image.
            output_path: Markdown output file; the image is copied under its
                directory at the same relative ``img_path``.

        A missing source image is silently skipped.
        """
        source_dir = Path(source_file).parent
        img_full_path = source_dir / img_path
        if not img_full_path.exists():
            return

        output_img_path = Path(output_path).parent / img_path
        output_img_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy(img_full_path, output_img_path)
        except shutil.SameFileError:
            # Output directory is the source directory: the image is
            # already where the Markdown expects it.
            pass
|