json_formatters.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. """
  2. JSON 格式化工具模块
  3. 提供 JSON 输出格式化功能:
  4. - MinerU middle.json 格式转换
  5. - mineru_vllm_results_cell_bbox 格式转换
  6. - 表格单元格格式化
  7. - 金额数字标准化(全角→半角)
  8. """
  9. import json
  10. import sys
  11. import numpy as np
  12. from pathlib import Path
  13. from typing import Dict, Any, List, Optional
  14. from loguru import logger
  15. class NumpyEncoder(json.JSONEncoder):
  16. """自定义JSON编码器,处理numpy类型"""
  17. def default(self, obj):
  18. if isinstance(obj, np.integer):
  19. return int(obj)
  20. elif isinstance(obj, np.floating):
  21. return float(obj)
  22. elif isinstance(obj, np.ndarray):
  23. return obj.tolist()
  24. return super().default(obj)
  25. class JSONFormatters:
  26. """JSON 格式化工具类"""
  27. @staticmethod
  28. def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
  29. """
  30. 转换为 MinerU 标准 middle.json 格式
  31. 用于 vlm_union_make 生成 Markdown
  32. Args:
  33. results: 处理结果
  34. Returns:
  35. MinerU middle.json 格式的字典
  36. """
  37. middle_json = {
  38. "pdf_info": [],
  39. "_backend": "vlm",
  40. "_scene": results.get('scene', 'unknown'),
  41. "_version_name": "2.5.0"
  42. }
  43. for page in results.get('pages', []):
  44. page_info = {
  45. 'page_idx': page['page_idx'],
  46. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
  47. 'angle': page.get('angle', 0),
  48. 'para_blocks': [],
  49. 'discarded_blocks': []
  50. }
  51. # 处理普通元素
  52. for element in page.get('elements', []):
  53. block = JSONFormatters._element_to_middle_block(element)
  54. if block:
  55. elem_type = element.get('type', '')
  56. if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
  57. page_info['discarded_blocks'].append(block)
  58. else:
  59. page_info['para_blocks'].append(block)
  60. # 处理丢弃元素(从 discarded_blocks 字段)
  61. for element in page.get('discarded_blocks', []):
  62. block = JSONFormatters._element_to_middle_block(element)
  63. if block:
  64. page_info['discarded_blocks'].append(block)
  65. middle_json['pdf_info'].append(page_info)
  66. return middle_json
  67. @staticmethod
  68. def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  69. """
  70. 将元素转换为 MinerU middle.json block 格式
  71. MinerU 期望的嵌套结构:
  72. - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
  73. - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }
  74. """
  75. elem_type = element.get('type', '')
  76. bbox = element.get('bbox', [0, 0, 0, 0])
  77. content = element.get('content', {})
  78. block = {
  79. 'type': elem_type,
  80. 'bbox': bbox,
  81. 'angle': element.get('angle', 0),
  82. 'reading_order': element.get('reading_order', 0),
  83. 'lines': []
  84. }
  85. # 文本类型
  86. if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
  87. text = content.get('text', '') if isinstance(content, dict) else str(content)
  88. if text:
  89. block['lines'] = [{
  90. 'bbox': bbox,
  91. 'spans': [{
  92. 'bbox': bbox,
  93. 'type': 'text',
  94. 'content': text
  95. }]
  96. }]
  97. # 表格类型 - 嵌套结构
  98. elif elem_type in ['table', 'table_body']:
  99. table_html = content.get('html', '')
  100. cells = content.get('cells', [])
  101. block['type'] = 'table'
  102. block['blocks'] = [{
  103. 'type': 'table_body',
  104. 'bbox': bbox,
  105. 'angle': 0,
  106. 'lines': [{
  107. 'bbox': bbox,
  108. 'spans': [{
  109. 'bbox': bbox,
  110. 'type': 'table',
  111. 'html': table_html,
  112. 'cells': cells
  113. }]
  114. }]
  115. }]
  116. # 图片类型 - 嵌套结构
  117. elif elem_type in ['image', 'image_body', 'figure']:
  118. block['type'] = 'image'
  119. block['blocks'] = [{
  120. 'type': 'image_body',
  121. 'bbox': bbox,
  122. 'angle': element.get('angle', 0),
  123. 'lines': [{
  124. 'bbox': bbox,
  125. 'spans': [{
  126. 'bbox': bbox,
  127. 'type': 'image',
  128. 'image_path': content.get('image_path', ''),
  129. 'description': content.get('description', '')
  130. }]
  131. }]
  132. }]
  133. # 公式类型
  134. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  135. latex = content.get('latex', '')
  136. block['lines'] = [{
  137. 'bbox': bbox,
  138. 'spans': [{
  139. 'bbox': bbox,
  140. 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
  141. 'content': latex
  142. }]
  143. }]
  144. # 表格/图片附属文本
  145. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  146. text = content.get('text', '') if isinstance(content, dict) else str(content)
  147. if text:
  148. block['lines'] = [{
  149. 'bbox': bbox,
  150. 'spans': [{
  151. 'bbox': bbox,
  152. 'type': 'text',
  153. 'content': text
  154. }]
  155. }]
  156. # 印章类型
  157. elif elem_type == 'seal':
  158. text = content.get('text', '') if isinstance(content, dict) else str(content)
  159. confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
  160. block['lines'] = [{
  161. 'bbox': bbox,
  162. 'spans': [{
  163. 'bbox': bbox,
  164. 'type': 'seal',
  165. 'content': text,
  166. 'confidence': confidence
  167. }]
  168. }]
  169. # 丢弃类型
  170. elif elem_type in ['abandon', 'discarded']:
  171. block['type'] = 'abandon'
  172. text = content.get('text', '') if isinstance(content, dict) else str(content)
  173. if text:
  174. block['lines'] = [{
  175. 'bbox': bbox,
  176. 'spans': [{
  177. 'bbox': bbox,
  178. 'type': 'text',
  179. 'content': text
  180. }]
  181. }]
  182. return block
  183. @staticmethod
  184. def save_page_jsons(
  185. results: Dict[str, Any],
  186. output_dir: Path,
  187. doc_name: str,
  188. is_pdf: bool = True,
  189. normalize_numbers: bool = True
  190. ) -> List[str]:
  191. """
  192. 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式)
  193. 命名规则:
  194. - PDF输入: 文件名_page_001.json
  195. - 图片输入(单页): 文件名.json
  196. Args:
  197. results: 处理结果
  198. output_dir: 输出目录
  199. doc_name: 文档名称
  200. is_pdf: 是否为 PDF 输入
  201. normalize_numbers: 是否标准化金额数字(全角→半角)
  202. Returns:
  203. 保存的文件路径列表
  204. """
  205. saved_paths = []
  206. total_pages = len(results.get('pages', []))
  207. for page in results.get('pages', []):
  208. page_idx = page.get('page_idx', 0)
  209. # 根据输入类型决定命名
  210. if is_pdf or total_pages > 1:
  211. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  212. else:
  213. page_name = doc_name
  214. # 获取页面旋转角度
  215. page_rotation_angle = float(page.get('angle', 0))
  216. # 转换为 mineru_vllm_results_cell_bbox 格式
  217. page_elements = []
  218. for element in page.get('elements', []):
  219. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  220. if converted:
  221. page_elements.append(converted)
  222. # 添加丢弃元素
  223. for element in page.get('discarded_blocks', []):
  224. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  225. if converted:
  226. page_elements.append(converted)
  227. # 转换为 JSON 字符串(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
  228. json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
  229. # 保存 JSON
  230. json_path = output_dir / f"{page_name}.json"
  231. with open(json_path, 'w', encoding='utf-8') as f:
  232. f.write(json_content)
  233. saved_paths.append(str(json_path))
  234. logger.debug(f"📄 Page JSON saved: {json_path}")
  235. if saved_paths:
  236. logger.info(f"📄 {len(saved_paths)} page JSONs saved")
  237. return saved_paths
  238. @staticmethod
  239. def _element_to_cell_bbox_format(
  240. element: Dict[str, Any],
  241. page_idx: int,
  242. page_rotation_angle: float = 0.0
  243. ) -> Optional[Dict[str, Any]]:
  244. """
  245. 将元素转换为 mineru_vllm_results_cell_bbox 格式
  246. Args:
  247. element: 元素字典
  248. page_idx: 页面索引
  249. page_rotation_angle: 页面旋转角度(0, 90, 180, 270)
  250. """
  251. elem_type = element.get('type', '')
  252. bbox = element.get('bbox', [0, 0, 0, 0])
  253. content = element.get('content', {})
  254. # 确保 bbox 是整数列表
  255. bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]
  256. result = {
  257. 'bbox': bbox,
  258. 'page_idx': page_idx,
  259. 'page_rotation_angle': page_rotation_angle,
  260. 'reading_order': element.get('reading_order', 0)
  261. }
  262. # 文本类型
  263. if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
  264. text = content.get('text', '') if isinstance(content, dict) else str(content)
  265. result['type'] = 'text' if elem_type != 'title' else 'title'
  266. result['text'] = text
  267. if elem_type == 'title':
  268. result['text_level'] = element.get('level', 1)
  269. # 表格类型
  270. elif elem_type in ['table', 'table_body']:
  271. result['type'] = 'table'
  272. result['img_path'] = content.get('table_image_path', '')
  273. result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
  274. result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
  275. result['table_body'] = content.get('html', '')
  276. # 关键:table_cells 数组
  277. cells = content.get('cells', [])
  278. if cells:
  279. result['table_cells'] = JSONFormatters.format_table_cells(cells)
  280. # 金额标准化变更记录(来自 element_processors._normalize_table_content)
  281. changes = content.get('number_normalization_changes', [])
  282. if changes:
  283. result['number_normalization_changes'] = changes
  284. # 旋转和倾斜信息
  285. if 'table_angle' in content:
  286. result['image_rotation_angle'] = float(content['table_angle'])
  287. if 'skew_angle' in content:
  288. result['skew_angle'] = float(content['skew_angle'])
  289. # 图片类型
  290. elif elem_type in ['image', 'image_body', 'figure']:
  291. result['type'] = 'image'
  292. image_filename = content.get('image_path', '')
  293. result['img_path'] = f"images/{image_filename}" if image_filename else ''
  294. result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
  295. result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))
  296. # 公式类型
  297. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  298. result['type'] = 'equation'
  299. result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
  300. result['text_format'] = 'latex'
  301. # 列表类型
  302. elif elem_type == 'list':
  303. result['type'] = 'list'
  304. result['sub_type'] = 'text'
  305. result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []
  306. # 页眉页脚
  307. elif elem_type in ['header', 'footer']:
  308. result['type'] = elem_type
  309. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  310. # 表格/图片附属文本
  311. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  312. result['type'] = elem_type
  313. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  314. # 印章类型
  315. elif elem_type == 'seal':
  316. result['type'] = 'seal'
  317. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  318. result['confidence'] = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
  319. # 丢弃元素
  320. elif elem_type in ['discarded', 'abandon']:
  321. result['type'] = 'discarded'
  322. result['original_category'] = element.get('original_category', 'unknown')
  323. result['text'] = content.get('text', '') if isinstance(content, dict) else ''
  324. else:
  325. return None
  326. return result
  327. @staticmethod
  328. def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
  329. """
  330. 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式
  331. 输出格式:
  332. {
  333. "type": "table_cell",
  334. "text": "单元格内容",
  335. "matched_text": "OCR匹配文本",
  336. "bbox": [x1, y1, x2, y2],
  337. "row": 1,
  338. "col": 1,
  339. "score": 100.0,
  340. "paddle_bbox_indices": [0, 1]
  341. }
  342. """
  343. formatted_cells = []
  344. for cell in cells:
  345. formatted_cell = {
  346. 'type': 'table_cell',
  347. 'text': cell.get('text', ''),
  348. 'matched_text': cell.get('matched_text', cell.get('text', '')),
  349. 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
  350. 'row': cell.get('row', 0),
  351. 'col': cell.get('col', 0),
  352. 'score': float(cell.get('score', 100.0)),
  353. 'paddle_bbox_indices': cell.get('paddle_bbox_indices',
  354. cell.get('paddle_indices', []))
  355. }
  356. formatted_cells.append(formatted_cell)
  357. return formatted_cells
  358. @staticmethod
  359. def _ensure_list(value) -> List:
  360. """确保值是列表"""
  361. if value is None:
  362. return []
  363. if isinstance(value, str):
  364. return [value] if value else []
  365. if isinstance(value, list):
  366. return value
  367. return [str(value)]