markdown_generator.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
"""
Markdown generator module.

Provides Markdown output:
- whole-document Markdown generation
- per-page Markdown generation
- MinerU union_make integration
"""
  8. import sys
  9. from pathlib import Path
  10. from typing import Dict, Any, List
  11. from loguru import logger
  12. # 导入 MinerU 组件
  13. mineru_path = Path(__file__).parents[3]
  14. if str(mineru_path) not in sys.path:
  15. sys.path.insert(0, str(mineru_path))
  16. try:
  17. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
  18. from mineru.utils.enum_class import MakeMode
  19. MINERU_AVAILABLE = True
  20. except ImportError:
  21. MINERU_AVAILABLE = False
  22. vlm_union_make = None
  23. class MakeMode:
  24. MM_MD = 'mm_md'
  25. NLP_MD = 'nlp_md'
  26. class MarkdownGenerator:
  27. """Markdown 生成器类"""
  28. @staticmethod
  29. def save_markdown(
  30. results: Dict[str, Any],
  31. middle_json: Dict[str, Any],
  32. output_dir: Path,
  33. doc_name: str,
  34. use_mineru_union: bool = False
  35. ) -> Path:
  36. """
  37. 保存 Markdown 文件
  38. 默认使用自定义实现,确保所有元素类型(包括 table_caption 等)都被正确处理
  39. 可选使用 MinerU union_make(但它不处理 table_caption 等独立元素)
  40. Args:
  41. results: 处理结果
  42. middle_json: middle.json 格式数据
  43. output_dir: 输出目录
  44. doc_name: 文档名称
  45. use_mineru_union: 是否使用 MinerU union_make(默认 False)
  46. Returns:
  47. Markdown 文件路径
  48. """
  49. md_path = output_dir / f"{doc_name}.md"
  50. if use_mineru_union and MINERU_AVAILABLE and vlm_union_make is not None:
  51. try:
  52. img_bucket_path = "images"
  53. markdown_content = vlm_union_make(
  54. middle_json['pdf_info'],
  55. MakeMode.MM_MD,
  56. img_bucket_path
  57. )
  58. if markdown_content:
  59. if isinstance(markdown_content, list):
  60. markdown_content = '\n\n'.join(markdown_content)
  61. header = MarkdownGenerator._generate_header(results)
  62. markdown_content = header + str(markdown_content)
  63. with open(md_path, 'w', encoding='utf-8') as f:
  64. f.write(markdown_content)
  65. logger.info(f"📝 Markdown saved (MinerU format): {md_path}")
  66. return md_path
  67. except Exception as e:
  68. logger.warning(f"MinerU union_make failed: {e}, falling back to custom implementation")
  69. # 使用自定义实现,确保所有元素类型都被处理
  70. markdown_content = MarkdownGenerator._generate_full_markdown(results)
  71. with open(md_path, 'w', encoding='utf-8') as f:
  72. f.write(markdown_content)
  73. logger.info(f"📝 Markdown saved (custom format): {md_path}")
  74. return md_path
  75. @staticmethod
  76. def save_page_markdowns(
  77. results: Dict[str, Any],
  78. output_dir: Path,
  79. doc_name: str
  80. ) -> List[str]:
  81. """
  82. 按页保存 Markdown 文件
  83. Args:
  84. results: 处理结果
  85. output_dir: 输出目录
  86. doc_name: 文档名称
  87. Returns:
  88. 保存的 Markdown 文件路径列表
  89. """
  90. saved_paths = []
  91. for page in results.get('pages', []):
  92. page_idx = page.get('page_idx', 0)
  93. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  94. # 生成单页 Markdown
  95. md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
  96. # 保存
  97. md_path = output_dir / f"{page_name}.md"
  98. with open(md_path, 'w', encoding='utf-8') as f:
  99. f.write(md_content)
  100. saved_paths.append(str(md_path))
  101. logger.debug(f"📝 Page Markdown saved: {md_path}")
  102. if saved_paths:
  103. logger.info(f"📝 {len(saved_paths)} page Markdowns saved")
  104. return saved_paths
  105. @staticmethod
  106. def _generate_header(results: Dict[str, Any]) -> str:
  107. """生成 Markdown 文件头"""
  108. return f"""---
  109. scene: {results.get('scene', 'unknown')}
  110. document: {results.get('document_path', '')}
  111. pages: {len(results.get('pages', []))}
  112. ---
  113. """
  114. @staticmethod
  115. def _generate_full_markdown(results: Dict[str, Any]) -> str:
  116. """
  117. 生成完整文档的 Markdown(自定义实现)
  118. 确保所有元素类型都被正确处理,包括 table_caption、table_footnote 等
  119. Args:
  120. results: 处理结果
  121. Returns:
  122. Markdown 内容字符串
  123. """
  124. md_lines = [
  125. f"---",
  126. f"scene: {results.get('scene', 'unknown')}",
  127. f"document: {results.get('document_path', '')}",
  128. f"pages: {len(results.get('pages', []))}",
  129. f"---",
  130. "",
  131. ]
  132. for page in results.get('pages', []):
  133. # 按阅读顺序处理元素
  134. for element in page.get('elements', []):
  135. elem_type = element.get('type', '')
  136. content = element.get('content', {})
  137. if elem_type == 'title':
  138. text = content.get('text', '') if isinstance(content, dict) else str(content)
  139. level = element.get('level', 1)
  140. if text:
  141. md_lines.append(f"{'#' * min(level, 6)} {text}")
  142. md_lines.append("")
  143. elif elem_type in ['text', 'ocr_text', 'ref_text']:
  144. text = content.get('text', '') if isinstance(content, dict) else str(content)
  145. if text:
  146. md_lines.append(text)
  147. md_lines.append("")
  148. elif elem_type in ['table', 'table_body']:
  149. html = content.get('html', '')
  150. if html:
  151. md_lines.append(f"\n{html}\n")
  152. md_lines.append("")
  153. elif elem_type in ['image', 'image_body', 'figure']:
  154. img_filename = content.get('image_path', '')
  155. if img_filename:
  156. md_lines.append(f"![](images/{img_filename})")
  157. md_lines.append("")
  158. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  159. latex = content.get('latex', '')
  160. if latex:
  161. md_lines.append(f"$$\n{latex}\n$$")
  162. md_lines.append("")
  163. elif elem_type in ['table_caption', 'table_footnote']:
  164. text = content.get('text', '') if isinstance(content, dict) else str(content)
  165. if text:
  166. if elem_type == 'table_caption':
  167. md_lines.append(f"**{text}**")
  168. else:
  169. md_lines.append(f"*{text}*")
  170. md_lines.append("")
  171. elif elem_type in ['image_caption', 'image_footnote']:
  172. text = content.get('text', '') if isinstance(content, dict) else str(content)
  173. if text:
  174. if elem_type == 'image_caption':
  175. md_lines.append(f"**{text}**")
  176. else:
  177. md_lines.append(f"*{text}*")
  178. md_lines.append("")
  179. return '\n'.join(md_lines)
  180. @staticmethod
  181. def _generate_fallback(results: Dict[str, Any]) -> str:
  182. """降级方案:自定义 Markdown 生成"""
  183. md_lines = [
  184. f"---",
  185. f"scene: {results.get('scene', 'unknown')}",
  186. f"document: {results.get('document_path', '')}",
  187. f"pages: {len(results.get('pages', []))}",
  188. f"---",
  189. "",
  190. ]
  191. for page in results.get('pages', []):
  192. for element in page.get('elements', []):
  193. elem_type = element.get('type', '')
  194. content = element.get('content', {})
  195. bbox = element.get('bbox', [])
  196. # 添加 bbox 注释
  197. if bbox:
  198. md_lines.append(f"<!-- bbox: {bbox} -->")
  199. if elem_type == 'title':
  200. text = content.get('text', '') if isinstance(content, dict) else str(content)
  201. level = element.get('level', 1)
  202. md_lines.append(f"{'#' * min(level, 6)} {text}")
  203. md_lines.append("")
  204. elif elem_type in ['text', 'ocr_text', 'ref_text']:
  205. text = content.get('text', '') if isinstance(content, dict) else str(content)
  206. if text:
  207. md_lines.append(text)
  208. md_lines.append("")
  209. elif elem_type in ['table', 'table_body']:
  210. # 表格标题
  211. table_captions = content.get('table_caption', [])
  212. if isinstance(table_captions, str):
  213. table_captions = [table_captions] if table_captions else []
  214. for caption in table_captions:
  215. md_lines.append(f"**{caption}**")
  216. html = content.get('html', '')
  217. if html:
  218. md_lines.append(f"\n{html}\n")
  219. md_lines.append("")
  220. elif elem_type in ['image', 'image_body', 'figure']:
  221. img_filename = content.get('image_path', '')
  222. if img_filename:
  223. md_lines.append(f"![](images/{img_filename})")
  224. md_lines.append("")
  225. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  226. latex = content.get('latex', '')
  227. if latex:
  228. md_lines.append(f"$$\n{latex}\n$$")
  229. md_lines.append("")
  230. elif elem_type in ['table_caption', 'table_footnote']:
  231. text = content.get('text', '') if isinstance(content, dict) else str(content)
  232. if text:
  233. if elem_type == 'table_caption':
  234. md_lines.append(f"**{text}**")
  235. else:
  236. md_lines.append(f"*{text}*")
  237. md_lines.append("")
  238. elif elem_type in ['image_caption', 'image_footnote']:
  239. text = content.get('text', '') if isinstance(content, dict) else str(content)
  240. if text:
  241. if elem_type == 'image_caption':
  242. md_lines.append(f"**{text}**")
  243. else:
  244. md_lines.append(f"*{text}*")
  245. md_lines.append("")
  246. return '\n'.join(md_lines)
  247. @staticmethod
  248. def _generate_page_markdown(
  249. page: Dict[str, Any],
  250. doc_name: str,
  251. page_idx: int
  252. ) -> str:
  253. """
  254. 生成单页的 Markdown 内容
  255. Args:
  256. page: 页面数据
  257. doc_name: 文档名称
  258. page_idx: 页码索引
  259. Returns:
  260. Markdown 内容字符串
  261. """
  262. md_lines = [
  263. f"---",
  264. f"document: {doc_name}",
  265. f"page: {page_idx + 1}",
  266. f"angle: {page.get('angle', 0)}",
  267. f"---",
  268. "",
  269. ]
  270. for element in page.get('elements', []):
  271. elem_type = element.get('type', '')
  272. content = element.get('content', {})
  273. bbox = element.get('bbox', [])
  274. reading_order = element.get('reading_order', 0)
  275. # 添加元素注释
  276. md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
  277. if elem_type == 'title':
  278. text = content.get('text', '') if isinstance(content, dict) else str(content)
  279. level = element.get('level', 1)
  280. md_lines.append(f"{'#' * min(level, 6)} {text}")
  281. md_lines.append("")
  282. elif elem_type in ['text', 'ocr_text', 'ref_text']:
  283. text = content.get('text', '') if isinstance(content, dict) else str(content)
  284. if text:
  285. md_lines.append(text)
  286. md_lines.append("")
  287. elif elem_type in ['table', 'table_body']:
  288. table_captions = content.get('table_caption', [])
  289. if isinstance(table_captions, str):
  290. table_captions = [table_captions] if table_captions else []
  291. for caption in table_captions:
  292. md_lines.append(f"**{caption}**")
  293. html = content.get('html', '')
  294. if html:
  295. md_lines.append(f"\n{html}\n")
  296. md_lines.append("")
  297. elif elem_type in ['image', 'image_body', 'figure']:
  298. img_filename = content.get('image_path', '')
  299. if img_filename:
  300. md_lines.append(f"![](images/{img_filename})")
  301. md_lines.append("")
  302. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  303. latex = content.get('latex', '')
  304. if latex:
  305. md_lines.append(f"$$\n{latex}\n$$")
  306. md_lines.append("")
  307. elif elem_type in ['table_caption', 'table_footnote']:
  308. text = content.get('text', '') if isinstance(content, dict) else str(content)
  309. if text:
  310. # 表格标题加粗,表格脚注斜体
  311. if elem_type == 'table_caption':
  312. md_lines.append(f"**{text}**")
  313. else:
  314. md_lines.append(f"*{text}*")
  315. md_lines.append("")
  316. elif elem_type in ['image_caption', 'image_footnote']:
  317. text = content.get('text', '') if isinstance(content, dict) else str(content)
  318. if text:
  319. # 图片标题加粗,图片脚注斜体
  320. if elem_type == 'image_caption':
  321. md_lines.append(f"**{text}**")
  322. else:
  323. md_lines.append(f"*{text}*")
  324. md_lines.append("")
  325. elif elem_type == 'discarded':
  326. text = content.get('text', '') if isinstance(content, dict) else ''
  327. if text:
  328. md_lines.append(f"<!-- [discarded: {element.get('original_category', 'unknown')}] {text} -->")
  329. md_lines.append("")
  330. # 处理丢弃元素
  331. for element in page.get('discarded_blocks', []):
  332. content = element.get('content', {})
  333. bbox = element.get('bbox', [])
  334. reading_order = element.get('reading_order', 0)
  335. original_category = element.get('original_category', 'unknown')
  336. md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
  337. text = content.get('text', '') if isinstance(content, dict) else ''
  338. if text:
  339. md_lines.append(f"<!-- [discarded: {original_category}] {text} -->")
  340. else:
  341. md_lines.append(f"<!-- [discarded: {original_category}] (no text) -->")
  342. md_lines.append("")
  343. return '\n'.join(md_lines)