json_formatters.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
"""
JSON formatting utilities.

Provides JSON output formatting:
- conversion to the MinerU middle.json format
- conversion to the mineru_vllm_results_cell_bbox format
- table cell formatting
"""
  8. import json
  9. from pathlib import Path
  10. from typing import Dict, Any, List, Optional
  11. from loguru import logger
  12. class JSONFormatters:
  13. """JSON 格式化工具类"""
  14. @staticmethod
  15. def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
  16. """
  17. 转换为 MinerU 标准 middle.json 格式
  18. 用于 vlm_union_make 生成 Markdown
  19. Args:
  20. results: 处理结果
  21. Returns:
  22. MinerU middle.json 格式的字典
  23. """
  24. middle_json = {
  25. "pdf_info": [],
  26. "_backend": "vlm",
  27. "_scene": results.get('scene', 'unknown'),
  28. "_version_name": "2.5.0"
  29. }
  30. for page in results.get('pages', []):
  31. page_info = {
  32. 'page_idx': page['page_idx'],
  33. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
  34. 'angle': page.get('angle', 0),
  35. 'para_blocks': [],
  36. 'discarded_blocks': []
  37. }
  38. # 处理普通元素
  39. for element in page.get('elements', []):
  40. block = JSONFormatters._element_to_middle_block(element)
  41. if block:
  42. elem_type = element.get('type', '')
  43. if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
  44. page_info['discarded_blocks'].append(block)
  45. else:
  46. page_info['para_blocks'].append(block)
  47. # 处理丢弃元素(从 discarded_blocks 字段)
  48. for element in page.get('discarded_blocks', []):
  49. block = JSONFormatters._element_to_middle_block(element)
  50. if block:
  51. page_info['discarded_blocks'].append(block)
  52. middle_json['pdf_info'].append(page_info)
  53. return middle_json
  54. @staticmethod
  55. def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  56. """
  57. 将元素转换为 MinerU middle.json block 格式
  58. MinerU 期望的嵌套结构:
  59. - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
  60. - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }
  61. """
  62. elem_type = element.get('type', '')
  63. bbox = element.get('bbox', [0, 0, 0, 0])
  64. content = element.get('content', {})
  65. block = {
  66. 'type': elem_type,
  67. 'bbox': bbox,
  68. 'angle': element.get('angle', 0),
  69. 'reading_order': element.get('reading_order', 0),
  70. 'lines': []
  71. }
  72. # 文本类型
  73. if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
  74. text = content.get('text', '') if isinstance(content, dict) else str(content)
  75. if text:
  76. block['lines'] = [{
  77. 'bbox': bbox,
  78. 'spans': [{
  79. 'bbox': bbox,
  80. 'type': 'text',
  81. 'content': text
  82. }]
  83. }]
  84. # 表格类型 - 嵌套结构
  85. elif elem_type in ['table', 'table_body']:
  86. table_html = content.get('html', '')
  87. cells = content.get('cells', [])
  88. block['type'] = 'table'
  89. block['blocks'] = [{
  90. 'type': 'table_body',
  91. 'bbox': bbox,
  92. 'angle': 0,
  93. 'lines': [{
  94. 'bbox': bbox,
  95. 'spans': [{
  96. 'bbox': bbox,
  97. 'type': 'table',
  98. 'html': table_html,
  99. 'cells': cells
  100. }]
  101. }]
  102. }]
  103. # 图片类型 - 嵌套结构
  104. elif elem_type in ['image', 'image_body', 'figure']:
  105. block['type'] = 'image'
  106. block['blocks'] = [{
  107. 'type': 'image_body',
  108. 'bbox': bbox,
  109. 'angle': element.get('angle', 0),
  110. 'lines': [{
  111. 'bbox': bbox,
  112. 'spans': [{
  113. 'bbox': bbox,
  114. 'type': 'image',
  115. 'image_path': content.get('image_path', ''),
  116. 'description': content.get('description', '')
  117. }]
  118. }]
  119. }]
  120. # 公式类型
  121. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  122. latex = content.get('latex', '')
  123. block['lines'] = [{
  124. 'bbox': bbox,
  125. 'spans': [{
  126. 'bbox': bbox,
  127. 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
  128. 'content': latex
  129. }]
  130. }]
  131. # 表格/图片附属文本
  132. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  133. text = content.get('text', '') if isinstance(content, dict) else str(content)
  134. if text:
  135. block['lines'] = [{
  136. 'bbox': bbox,
  137. 'spans': [{
  138. 'bbox': bbox,
  139. 'type': 'text',
  140. 'content': text
  141. }]
  142. }]
  143. # 丢弃类型
  144. elif elem_type in ['abandon', 'discarded']:
  145. block['type'] = 'abandon'
  146. text = content.get('text', '') if isinstance(content, dict) else str(content)
  147. if text:
  148. block['lines'] = [{
  149. 'bbox': bbox,
  150. 'spans': [{
  151. 'bbox': bbox,
  152. 'type': 'text',
  153. 'content': text
  154. }]
  155. }]
  156. return block
  157. @staticmethod
  158. def save_page_jsons(
  159. results: Dict[str, Any],
  160. output_dir: Path,
  161. doc_name: str
  162. ) -> List[str]:
  163. """
  164. 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式)
  165. Args:
  166. results: 处理结果
  167. output_dir: 输出目录
  168. doc_name: 文档名称
  169. Returns:
  170. 保存的文件路径列表
  171. """
  172. saved_paths = []
  173. for page in results.get('pages', []):
  174. page_idx = page.get('page_idx', 0)
  175. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  176. # 转换为 mineru_vllm_results_cell_bbox 格式
  177. page_elements = []
  178. for element in page.get('elements', []):
  179. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
  180. if converted:
  181. page_elements.append(converted)
  182. # 添加丢弃元素
  183. for element in page.get('discarded_blocks', []):
  184. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
  185. if converted:
  186. page_elements.append(converted)
  187. # 保存 JSON
  188. json_path = output_dir / f"{page_name}.json"
  189. with open(json_path, 'w', encoding='utf-8') as f:
  190. json.dump(page_elements, f, ensure_ascii=False, indent=2)
  191. saved_paths.append(str(json_path))
  192. logger.debug(f"📄 Page JSON saved: {json_path}")
  193. if saved_paths:
  194. logger.info(f"📄 {len(saved_paths)} page JSONs saved")
  195. return saved_paths
  196. @staticmethod
  197. def _element_to_cell_bbox_format(
  198. element: Dict[str, Any],
  199. page_idx: int
  200. ) -> Optional[Dict[str, Any]]:
  201. """
  202. 将元素转换为 mineru_vllm_results_cell_bbox 格式
  203. """
  204. elem_type = element.get('type', '')
  205. bbox = element.get('bbox', [0, 0, 0, 0])
  206. content = element.get('content', {})
  207. # 确保 bbox 是整数列表
  208. bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]
  209. result = {
  210. 'bbox': bbox,
  211. 'page_idx': page_idx,
  212. 'reading_order': element.get('reading_order', 0)
  213. }
  214. # 文本类型
  215. if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
  216. text = content.get('text', '') if isinstance(content, dict) else str(content)
  217. result['type'] = 'text' if elem_type != 'title' else 'title'
  218. result['text'] = text
  219. if elem_type == 'title':
  220. result['text_level'] = element.get('level', 1)
  221. # 表格类型
  222. elif elem_type in ['table', 'table_body']:
  223. result['type'] = 'table'
  224. result['img_path'] = content.get('table_image_path', '')
  225. result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
  226. result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
  227. result['table_body'] = content.get('html', '')
  228. # 关键:table_cells 数组
  229. cells = content.get('cells', [])
  230. if cells:
  231. result['table_cells'] = JSONFormatters.format_table_cells(cells)
  232. # 旋转和倾斜信息
  233. if 'table_angle' in content:
  234. result['image_rotation_angle'] = float(content['table_angle'])
  235. if 'skew_angle' in content:
  236. result['skew_angle'] = float(content['skew_angle'])
  237. # 图片类型
  238. elif elem_type in ['image', 'image_body', 'figure']:
  239. result['type'] = 'image'
  240. image_filename = content.get('image_path', '')
  241. result['img_path'] = f"images/{image_filename}" if image_filename else ''
  242. result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
  243. result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))
  244. # 公式类型
  245. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  246. result['type'] = 'equation'
  247. result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
  248. result['text_format'] = 'latex'
  249. # 列表类型
  250. elif elem_type == 'list':
  251. result['type'] = 'list'
  252. result['sub_type'] = 'text'
  253. result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []
  254. # 页眉页脚
  255. elif elem_type in ['header', 'footer']:
  256. result['type'] = elem_type
  257. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  258. # 表格/图片附属文本
  259. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  260. result['type'] = elem_type
  261. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  262. # 丢弃元素
  263. elif elem_type in ['discarded', 'abandon']:
  264. result['type'] = 'discarded'
  265. result['original_category'] = element.get('original_category', 'unknown')
  266. result['text'] = content.get('text', '') if isinstance(content, dict) else ''
  267. else:
  268. return None
  269. return result
  270. @staticmethod
  271. def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
  272. """
  273. 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式
  274. 输出格式:
  275. {
  276. "type": "table_cell",
  277. "text": "单元格内容",
  278. "matched_text": "OCR匹配文本",
  279. "bbox": [x1, y1, x2, y2],
  280. "row": 1,
  281. "col": 1,
  282. "score": 100.0,
  283. "paddle_bbox_indices": [0, 1]
  284. }
  285. """
  286. formatted_cells = []
  287. for cell in cells:
  288. formatted_cell = {
  289. 'type': 'table_cell',
  290. 'text': cell.get('text', ''),
  291. 'matched_text': cell.get('matched_text', cell.get('text', '')),
  292. 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
  293. 'row': cell.get('row', 0),
  294. 'col': cell.get('col', 0),
  295. 'score': float(cell.get('score', 100.0)),
  296. 'paddle_bbox_indices': cell.get('paddle_bbox_indices',
  297. cell.get('paddle_indices', []))
  298. }
  299. formatted_cells.append(formatted_cell)
  300. return formatted_cells
  301. @staticmethod
  302. def _ensure_list(value) -> List:
  303. """确保值是列表"""
  304. if value is None:
  305. return []
  306. if isinstance(value, str):
  307. return [value] if value else []
  308. if isinstance(value, list):
  309. return value
  310. return [str(value)]