# markdown_generator.py
"""
Markdown generation module.

Responsible for rendering the merged data as a Markdown file.
"""
  5. import shutil
  6. from pathlib import Path
  7. from typing import List, Dict, Optional
  8. class MarkdownGenerator:
  9. """Markdown 生成器"""
  10. @staticmethod
  11. def generate_enhanced_markdown(merged_data: List[Dict],
  12. output_path: Optional[str] = None,
  13. mineru_file: Optional[str] = None) -> str:
  14. """
  15. 生成增强的 Markdown(包含 bbox 信息的注释)
  16. Args:
  17. merged_data: 合并后的数据
  18. output_path: 输出路径
  19. mineru_file: MinerU 源文件路径(用于复制图片)
  20. Returns:
  21. Markdown 内容
  22. """
  23. md_lines = []
  24. for item in merged_data:
  25. item_type = item.get('type', '')
  26. if item_type == 'title':
  27. md_lines.extend(MarkdownGenerator._format_title(item))
  28. elif item_type == 'text':
  29. md_lines.extend(MarkdownGenerator._format_text(item))
  30. elif item_type == 'list':
  31. md_lines.extend(MarkdownGenerator._format_list(item))
  32. elif item_type == 'table':
  33. md_lines.extend(MarkdownGenerator._format_table(item))
  34. elif item_type == 'image':
  35. md_lines.extend(MarkdownGenerator._format_image(
  36. item, output_path, mineru_file
  37. ))
  38. elif item_type == 'equation':
  39. md_lines.extend(MarkdownGenerator._format_equation(item))
  40. elif item_type == 'inline_equation':
  41. md_lines.extend(MarkdownGenerator._format_inline_equation(item))
  42. elif item_type in ['page_number', 'header', 'footer']:
  43. md_lines.extend(MarkdownGenerator._format_metadata(item, item_type))
  44. elif item_type == 'reference':
  45. md_lines.extend(MarkdownGenerator._format_reference(item))
  46. else:
  47. md_lines.extend(MarkdownGenerator._format_unknown(item))
  48. markdown_content = '\n'.join(md_lines)
  49. if output_path:
  50. with open(output_path, 'w', encoding='utf-8') as f:
  51. f.write(markdown_content)
  52. return markdown_content
  53. @staticmethod
  54. def _add_bbox_comment(bbox: List) -> str:
  55. """添加 bbox 注释"""
  56. return f"<!-- bbox: {bbox} -->"
  57. @staticmethod
  58. def _format_title(item: Dict) -> List[str]:
  59. """格式化标题"""
  60. lines = []
  61. bbox = item.get('bbox', [])
  62. if bbox:
  63. lines.append(MarkdownGenerator._add_bbox_comment(bbox))
  64. text = item.get('text', '')
  65. text_level = item.get('text_level', 1)
  66. heading = '#' * min(text_level, 6)
  67. lines.append(f"{heading} {text}\n")
  68. return lines
  69. @staticmethod
  70. def _format_text(item: Dict) -> List[str]:
  71. """格式化文本"""
  72. lines = []
  73. bbox = item.get('bbox', [])
  74. if bbox:
  75. lines.append(MarkdownGenerator._add_bbox_comment(bbox))
  76. text = item.get('text', '')
  77. text_level = item.get('text_level', 0)
  78. if text_level > 0:
  79. heading = '#' * min(text_level, 6)
  80. lines.append(f"{heading} {text}\n")
  81. else:
  82. lines.append(f"{text}\n")
  83. return lines
  84. @staticmethod
  85. def _format_list(item: Dict) -> List[str]:
  86. """格式化列表"""
  87. lines = []
  88. bbox = item.get('bbox', [])
  89. if bbox:
  90. lines.append(MarkdownGenerator._add_bbox_comment(bbox))
  91. list_items = item.get('list_items', [])
  92. for list_item in list_items:
  93. lines.append(f"{list_item}\n")
  94. lines.append("")
  95. return lines
  96. @staticmethod
  97. def _format_table(item: Dict) -> List[str]:
  98. """格式化表格"""
  99. lines = []
  100. bbox = item.get('bbox', [])
  101. if bbox:
  102. lines.append(MarkdownGenerator._add_bbox_comment(bbox))
  103. # 表格标题
  104. table_caption = item.get('table_caption', [])
  105. for caption in table_caption:
  106. if caption:
  107. lines.append(f"**{caption}**\n")
  108. # 表格内容
  109. table_body = item.get('table_body_with_bbox', item.get('table_body', ''))
  110. if table_body:
  111. lines.append(table_body)
  112. lines.append("")
  113. # 表格脚注
  114. table_footnote = item.get('table_footnote', [])
  115. for footnote in table_footnote:
  116. if footnote:
  117. lines.append(f"*{footnote}*")
  118. if table_footnote:
  119. lines.append("")
  120. return lines
  121. @staticmethod
  122. def _format_image(item: Dict, output_path: Optional[str],
  123. mineru_file: Optional[str]) -> List[str]:
  124. """格式化图片"""
  125. lines = []
  126. bbox = item.get('bbox', [])
  127. if bbox:
  128. lines.append(MarkdownGenerator._add_bbox_comment(bbox))
  129. img_path = item.get('img_path', '')
  130. # 复制图片
  131. if img_path and mineru_file and output_path:
  132. MarkdownGenerator._copy_image(img_path, mineru_file, output_path)
  133. # 图片标题
  134. image_caption = item.get('image_caption', [])
  135. for caption in image_caption:
  136. if caption:
  137. lines.append(f"**{caption}**\n")
  138. lines.append(f"![Image]({img_path})\n")
  139. # 图片脚注
  140. image_footnote = item.get('image_footnote', [])
  141. for footnote in image_footnote:
  142. if footnote:
  143. lines.append(f"*{footnote}*")
  144. if image_footnote:
  145. lines.append("")
  146. return lines
  147. @staticmethod
  148. def _copy_image(img_path: str, mineru_file: str, output_path: str):
  149. """复制图片到输出目录"""
  150. mineru_dir = Path(mineru_file).parent
  151. img_full_path = mineru_dir / img_path
  152. if img_full_path.exists():
  153. output_img_path = Path(output_path).parent / img_path
  154. output_img_path.parent.mkdir(parents=True, exist_ok=True)
  155. shutil.copy(img_full_path, output_img_path)
  156. @staticmethod
  157. def _format_equation(item: Dict) -> List[str]:
  158. """格式化公式"""
  159. latex = item.get('latex', '')
  160. if latex:
  161. return [f"$$\n{latex}\n$$\n"]
  162. return []
  163. @staticmethod
  164. def _format_inline_equation(item: Dict) -> List[str]:
  165. """格式化行内公式"""
  166. latex = item.get('latex', '')
  167. if latex:
  168. return [f"${latex}$\n"]
  169. return []
  170. @staticmethod
  171. def _format_metadata(item: Dict, item_type: str) -> List[str]:
  172. """格式化元数据(页码、页眉、页脚)"""
  173. text = item.get('text', '')
  174. type_map = {
  175. 'page_number': '页码',
  176. 'header': '页眉',
  177. 'footer': '页脚'
  178. }
  179. if text:
  180. return [f"<!-- {type_map.get(item_type, item_type)}: {text} -->\n"]
  181. return []
  182. @staticmethod
  183. def _format_reference(item: Dict) -> List[str]:
  184. """格式化参考文献"""
  185. text = item.get('text', '')
  186. return [f"> {text}\n"]
  187. @staticmethod
  188. def _format_unknown(item: Dict) -> List[str]:
  189. """格式化未知类型"""
  190. text = item.get('text', '')
  191. if text:
  192. return [f"{text}\n"]
  193. return []