markdown_generator.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. """
  2. Markdown 生成器模块
  3. 提供 Markdown 输出功能:
  4. - 完整文档 Markdown 生成
  5. - 按页 Markdown 生成
  6. - MinerU union_make 集成
  7. - 金额数字标准化(全角→半角)
  8. """
  9. import sys
  10. from pathlib import Path
  11. from typing import Dict, Any, List, Tuple, Optional
  12. from loguru import logger
  13. # 导入 MinerU 组件
  14. mineru_path = Path(__file__).parents[3]
  15. if str(mineru_path) not in sys.path:
  16. sys.path.insert(0, str(mineru_path))
  17. try:
  18. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
  19. from mineru.utils.enum_class import MakeMode
  20. MINERU_AVAILABLE = True
  21. except ImportError:
  22. MINERU_AVAILABLE = False
  23. vlm_union_make = None
  24. class MakeMode:
  25. MM_MD = 'mm_md'
  26. NLP_MD = 'nlp_md'
  27. class MarkdownGenerator:
  28. """Markdown 生成器类"""
  29. @staticmethod
  30. def save_markdown(
  31. results: Dict[str, Any],
  32. middle_json: Dict[str, Any],
  33. output_dir: Path,
  34. doc_name: str,
  35. use_mineru_union: bool = False,
  36. normalize_numbers: bool = True
  37. ) -> Tuple[Path, Optional[Path]]:
  38. """
  39. 保存 Markdown 文件
  40. 默认使用自定义实现,确保所有元素类型(包括 table_caption 等)都被正确处理
  41. 可选使用 MinerU union_make(但它不处理 table_caption 等独立元素)
  42. Args:
  43. results: 处理结果
  44. middle_json: middle.json 格式数据
  45. output_dir: 输出目录
  46. doc_name: 文档名称
  47. use_mineru_union: 是否使用 MinerU union_make(默认 False)
  48. normalize_numbers: 是否标准化金额数字(全角→半角)
  49. Returns:
  50. (Markdown 文件路径, 原始文件路径 或 None)
  51. """
  52. md_path = output_dir / f"{doc_name}.md"
  53. original_path = None
  54. if use_mineru_union and MINERU_AVAILABLE and vlm_union_make is not None:
  55. try:
  56. img_bucket_path = "images"
  57. markdown_content = vlm_union_make(
  58. middle_json['pdf_info'],
  59. MakeMode.MM_MD,
  60. img_bucket_path
  61. )
  62. if markdown_content:
  63. if isinstance(markdown_content, list):
  64. markdown_content = '\n\n'.join(markdown_content)
  65. header = MarkdownGenerator._generate_header(results)
  66. markdown_content = header + str(markdown_content)
  67. # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
  68. with open(md_path, 'w', encoding='utf-8') as f:
  69. f.write(markdown_content)
  70. logger.info(f"📝 Markdown saved (MinerU format): {md_path}")
  71. return md_path, original_path
  72. except Exception as e:
  73. logger.warning(f"MinerU union_make failed: {e}, falling back to custom implementation")
  74. # 使用自定义实现,确保所有元素类型都被处理
  75. markdown_content = MarkdownGenerator._generate_full_markdown(results)
  76. # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
  77. with open(md_path, 'w', encoding='utf-8') as f:
  78. f.write(markdown_content)
  79. logger.info(f"📝 Markdown saved (custom format): {md_path}")
  80. return md_path, original_path
  81. @staticmethod
  82. def save_page_markdowns(
  83. results: Dict[str, Any],
  84. output_dir: Path,
  85. doc_name: str,
  86. is_pdf: bool = True,
  87. normalize_numbers: bool = True
  88. ) -> List[str]:
  89. """
  90. 按页保存 Markdown 文件
  91. 命名规则:
  92. - PDF输入: 文件名_page_001.md
  93. - 图片输入(单页): 文件名.md(跳过,因为已有完整版)
  94. Args:
  95. results: 处理结果
  96. output_dir: 输出目录
  97. doc_name: 文档名称
  98. is_pdf: 是否为 PDF 输入
  99. normalize_numbers: 是否标准化金额数字(全角→半角)
  100. Returns:
  101. 保存的 Markdown 文件路径列表
  102. """
  103. saved_paths = []
  104. total_pages = len(results.get('pages', []))
  105. # 单个图片输入时,跳过按页保存(因为已有完整版 doc_name.md)
  106. if not is_pdf and total_pages == 1:
  107. logger.debug("📝 Single image input, skipping page markdown (full version exists)")
  108. return saved_paths
  109. for page in results.get('pages', []):
  110. page_idx = page.get('page_idx', 0)
  111. # 根据输入类型决定命名
  112. if is_pdf or total_pages > 1:
  113. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  114. else:
  115. page_name = doc_name
  116. # 生成单页 Markdown(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
  117. md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
  118. # 保存
  119. md_path = output_dir / f"{page_name}.md"
  120. with open(md_path, 'w', encoding='utf-8') as f:
  121. f.write(md_content)
  122. saved_paths.append(str(md_path))
  123. logger.debug(f"📝 Page Markdown saved: {md_path}")
  124. if saved_paths:
  125. logger.info(f"📝 {len(saved_paths)} page Markdowns saved")
  126. return saved_paths
  127. @staticmethod
  128. def _generate_header(results: Dict[str, Any]) -> str:
  129. """生成 Markdown 文件头"""
  130. return f"""<!--
  131. scene: {results.get('scene', 'unknown')}
  132. document: {results.get('document_path', '')}
  133. pages: {len(results.get('pages', []))}
  134. -->
  135. """
  136. @staticmethod
  137. def _generate_full_markdown(results: Dict[str, Any]) -> str:
  138. """
  139. 生成完整文档的 Markdown(自定义实现)
  140. 确保所有元素类型都被正确处理,包括 table_caption、table_footnote 等
  141. Args:
  142. results: 处理结果
  143. Returns:
  144. Markdown 内容字符串
  145. """
  146. md_lines = [
  147. f"<!-- ",
  148. f"scene: {results.get('scene', 'unknown')}",
  149. f"document: {results.get('document_path', '')}",
  150. f"pages: {len(results.get('pages', []))}",
  151. f"-->",
  152. "",
  153. ]
  154. for page in results.get('pages', []):
  155. # 按阅读顺序处理元素
  156. for element in page.get('elements', []):
  157. elem_type = element.get('type', '')
  158. content = element.get('content', {})
  159. if elem_type == 'title':
  160. text = content.get('text', '') if isinstance(content, dict) else str(content)
  161. level = element.get('level', 1)
  162. if text:
  163. md_lines.append(f"{'#' * min(level, 6)} {text}")
  164. md_lines.append("")
  165. elif elem_type in ['text', 'ocr_text', 'ref_text']:
  166. text = content.get('text', '') if isinstance(content, dict) else str(content)
  167. if text:
  168. md_lines.append(text)
  169. md_lines.append("")
  170. elif elem_type in ['table', 'table_body']:
  171. html = content.get('html', '')
  172. if html:
  173. md_lines.append(f"\n{html}\n")
  174. changes = content.get('number_normalization_changes', [])
  175. if changes:
  176. md_lines.append("")
  177. md_lines.append("<!-- 数字标准化说明:")
  178. for ch in changes:
  179. md_lines.append(f" - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
  180. md_lines.append("-->")
  181. md_lines.append("")
  182. elif elem_type in ['image', 'image_body', 'figure']:
  183. img_filename = content.get('image_path', '')
  184. if img_filename:
  185. md_lines.append(f"![](images/{img_filename})")
  186. md_lines.append("")
  187. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  188. latex = content.get('latex', '')
  189. if latex:
  190. md_lines.append(f"$$\n{latex}\n$$")
  191. md_lines.append("")
  192. elif elem_type in ['table_caption', 'table_footnote']:
  193. text = content.get('text', '') if isinstance(content, dict) else str(content)
  194. if text:
  195. if elem_type == 'table_caption':
  196. md_lines.append(f"**{text}**")
  197. else:
  198. md_lines.append(f"*{text}*")
  199. md_lines.append("")
  200. elif elem_type in ['image_caption', 'image_footnote']:
  201. text = content.get('text', '') if isinstance(content, dict) else str(content)
  202. if text:
  203. if elem_type == 'image_caption':
  204. md_lines.append(f"**{text}**")
  205. else:
  206. md_lines.append(f"*{text}*")
  207. md_lines.append("")
  208. elif elem_type == 'seal':
  209. text = content.get('text', '') if isinstance(content, dict) else str(content)
  210. if text:
  211. md_lines.append(f"🔖 **[印章]** {text}")
  212. md_lines.append("")
  213. return '\n'.join(md_lines)
  214. @staticmethod
  215. def _generate_page_markdown(
  216. page: Dict[str, Any],
  217. doc_name: str,
  218. page_idx: int
  219. ) -> str:
  220. """
  221. 生成单页的 Markdown 内容
  222. Args:
  223. page: 页面数据
  224. doc_name: 文档名称
  225. page_idx: 页码索引
  226. Returns:
  227. Markdown 内容字符串
  228. """
  229. md_lines = [
  230. f"<!--",
  231. f"document: {doc_name}",
  232. f"page: {page_idx + 1}",
  233. f"angle: {page.get('angle', 0)}",
  234. f"-->",
  235. "",
  236. ]
  237. md_lines.append("")
  238. for element in page.get('elements', []):
  239. elem_type = element.get('type', '')
  240. content = element.get('content', {})
  241. bbox = element.get('bbox', [])
  242. reading_order = element.get('reading_order', 0)
  243. # 添加元素注释
  244. md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
  245. if elem_type == 'title':
  246. text = content.get('text', '') if isinstance(content, dict) else str(content)
  247. level = element.get('level', 1)
  248. md_lines.append(f"{'#' * min(level, 6)} {text}")
  249. md_lines.append("")
  250. elif elem_type in ['text', 'ocr_text', 'ref_text']:
  251. text = content.get('text', '') if isinstance(content, dict) else str(content)
  252. if text:
  253. md_lines.append(text)
  254. md_lines.append("")
  255. elif elem_type in ['table', 'table_body']:
  256. table_captions = content.get('table_caption', [])
  257. if isinstance(table_captions, str):
  258. table_captions = [table_captions] if table_captions else []
  259. for caption in table_captions:
  260. md_lines.append(f"**{caption}**")
  261. html = content.get('html', '')
  262. if html:
  263. md_lines.append(f"\n{html}\n")
  264. # 金额标准化说明(来自 element_processors._normalize_table_content)
  265. changes = content.get('number_normalization_changes', [])
  266. if changes:
  267. md_lines.append("")
  268. md_lines.append("<!-- 数字标准化说明:")
  269. for ch in changes:
  270. md_lines.append(f" - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
  271. md_lines.append("-->")
  272. md_lines.append("")
  273. elif elem_type in ['image', 'image_body', 'figure']:
  274. img_filename = content.get('image_path', '')
  275. if img_filename:
  276. md_lines.append(f"![](images/{img_filename})")
  277. md_lines.append("")
  278. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  279. latex = content.get('latex', '')
  280. if latex:
  281. md_lines.append(f"$$\n{latex}\n$$")
  282. md_lines.append("")
  283. elif elem_type in ['table_caption', 'table_footnote']:
  284. text = content.get('text', '') if isinstance(content, dict) else str(content)
  285. if text:
  286. # 表格标题加粗,表格脚注斜体
  287. if elem_type == 'table_caption':
  288. md_lines.append(f"**{text}**")
  289. else:
  290. md_lines.append(f"*{text}*")
  291. md_lines.append("")
  292. elif elem_type in ['image_caption', 'image_footnote']:
  293. text = content.get('text', '') if isinstance(content, dict) else str(content)
  294. if text:
  295. # 图片标题加粗,图片脚注斜体
  296. if elem_type == 'image_caption':
  297. md_lines.append(f"**{text}**")
  298. else:
  299. md_lines.append(f"*{text}*")
  300. md_lines.append("")
  301. elif elem_type == 'seal':
  302. text = content.get('text', '') if isinstance(content, dict) else str(content)
  303. if text:
  304. confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
  305. # md_lines.append(f"🔖 **[印章]** {text} _(置信度: {confidence:.2f})_")
  306. md_lines.append(f"🔖 **[印章]** {text}")
  307. md_lines.append("")
  308. elif elem_type == 'discarded':
  309. text = content.get('text', '') if isinstance(content, dict) else ''
  310. if text:
  311. md_lines.append(f"<!-- [discarded: {element.get('original_category', 'unknown')}] {text} -->")
  312. md_lines.append("")
  313. # 处理丢弃元素
  314. for element in page.get('discarded_blocks', []):
  315. content = element.get('content', {})
  316. bbox = element.get('bbox', [])
  317. reading_order = element.get('reading_order', 0)
  318. original_category = element.get('original_category', 'unknown')
  319. md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
  320. text = content.get('text', '') if isinstance(content, dict) else ''
  321. if text:
  322. md_lines.append(f"<!-- [discarded: {original_category}] {text} -->")
  323. else:
  324. md_lines.append(f"<!-- [discarded: {original_category}] (no text) -->")
  325. md_lines.append("")
  326. return '\n'.join(md_lines)