json_formatters.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. """
  2. JSON 格式化工具模块
  3. 提供 JSON 输出格式化功能:
  4. - MinerU middle.json 格式转换
  5. - mineru_vllm_results_cell_bbox 格式转换
  6. - 表格单元格格式化
  7. - 金额数字标准化(全角→半角)
  8. """
  9. import json
  10. import sys
  11. from pathlib import Path
  12. from typing import Dict, Any, List, Optional
  13. from loguru import logger
  14. # 导入数字标准化工具
  15. from .normalize_financial_numbers import normalize_json_table
  16. class JSONFormatters:
  17. """JSON 格式化工具类"""
  18. @staticmethod
  19. def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
  20. """
  21. 转换为 MinerU 标准 middle.json 格式
  22. 用于 vlm_union_make 生成 Markdown
  23. Args:
  24. results: 处理结果
  25. Returns:
  26. MinerU middle.json 格式的字典
  27. """
  28. middle_json = {
  29. "pdf_info": [],
  30. "_backend": "vlm",
  31. "_scene": results.get('scene', 'unknown'),
  32. "_version_name": "2.5.0"
  33. }
  34. for page in results.get('pages', []):
  35. page_info = {
  36. 'page_idx': page['page_idx'],
  37. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
  38. 'angle': page.get('angle', 0),
  39. 'para_blocks': [],
  40. 'discarded_blocks': []
  41. }
  42. # 处理普通元素
  43. for element in page.get('elements', []):
  44. block = JSONFormatters._element_to_middle_block(element)
  45. if block:
  46. elem_type = element.get('type', '')
  47. if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
  48. page_info['discarded_blocks'].append(block)
  49. else:
  50. page_info['para_blocks'].append(block)
  51. # 处理丢弃元素(从 discarded_blocks 字段)
  52. for element in page.get('discarded_blocks', []):
  53. block = JSONFormatters._element_to_middle_block(element)
  54. if block:
  55. page_info['discarded_blocks'].append(block)
  56. middle_json['pdf_info'].append(page_info)
  57. return middle_json
  58. @staticmethod
  59. def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  60. """
  61. 将元素转换为 MinerU middle.json block 格式
  62. MinerU 期望的嵌套结构:
  63. - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
  64. - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }
  65. """
  66. elem_type = element.get('type', '')
  67. bbox = element.get('bbox', [0, 0, 0, 0])
  68. content = element.get('content', {})
  69. block = {
  70. 'type': elem_type,
  71. 'bbox': bbox,
  72. 'angle': element.get('angle', 0),
  73. 'reading_order': element.get('reading_order', 0),
  74. 'lines': []
  75. }
  76. # 文本类型
  77. if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
  78. text = content.get('text', '') if isinstance(content, dict) else str(content)
  79. if text:
  80. block['lines'] = [{
  81. 'bbox': bbox,
  82. 'spans': [{
  83. 'bbox': bbox,
  84. 'type': 'text',
  85. 'content': text
  86. }]
  87. }]
  88. # 表格类型 - 嵌套结构
  89. elif elem_type in ['table', 'table_body']:
  90. table_html = content.get('html', '')
  91. cells = content.get('cells', [])
  92. block['type'] = 'table'
  93. block['blocks'] = [{
  94. 'type': 'table_body',
  95. 'bbox': bbox,
  96. 'angle': 0,
  97. 'lines': [{
  98. 'bbox': bbox,
  99. 'spans': [{
  100. 'bbox': bbox,
  101. 'type': 'table',
  102. 'html': table_html,
  103. 'cells': cells
  104. }]
  105. }]
  106. }]
  107. # 图片类型 - 嵌套结构
  108. elif elem_type in ['image', 'image_body', 'figure']:
  109. block['type'] = 'image'
  110. block['blocks'] = [{
  111. 'type': 'image_body',
  112. 'bbox': bbox,
  113. 'angle': element.get('angle', 0),
  114. 'lines': [{
  115. 'bbox': bbox,
  116. 'spans': [{
  117. 'bbox': bbox,
  118. 'type': 'image',
  119. 'image_path': content.get('image_path', ''),
  120. 'description': content.get('description', '')
  121. }]
  122. }]
  123. }]
  124. # 公式类型
  125. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  126. latex = content.get('latex', '')
  127. block['lines'] = [{
  128. 'bbox': bbox,
  129. 'spans': [{
  130. 'bbox': bbox,
  131. 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
  132. 'content': latex
  133. }]
  134. }]
  135. # 表格/图片附属文本
  136. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  137. text = content.get('text', '') if isinstance(content, dict) else str(content)
  138. if text:
  139. block['lines'] = [{
  140. 'bbox': bbox,
  141. 'spans': [{
  142. 'bbox': bbox,
  143. 'type': 'text',
  144. 'content': text
  145. }]
  146. }]
  147. # 丢弃类型
  148. elif elem_type in ['abandon', 'discarded']:
  149. block['type'] = 'abandon'
  150. text = content.get('text', '') if isinstance(content, dict) else str(content)
  151. if text:
  152. block['lines'] = [{
  153. 'bbox': bbox,
  154. 'spans': [{
  155. 'bbox': bbox,
  156. 'type': 'text',
  157. 'content': text
  158. }]
  159. }]
  160. return block
  161. @staticmethod
  162. def save_page_jsons(
  163. results: Dict[str, Any],
  164. output_dir: Path,
  165. doc_name: str,
  166. is_pdf: bool = True,
  167. normalize_numbers: bool = True
  168. ) -> List[str]:
  169. """
  170. 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式)
  171. 命名规则:
  172. - PDF输入: 文件名_page_001.json
  173. - 图片输入(单页): 文件名.json
  174. Args:
  175. results: 处理结果
  176. output_dir: 输出目录
  177. doc_name: 文档名称
  178. is_pdf: 是否为 PDF 输入
  179. normalize_numbers: 是否标准化金额数字(全角→半角)
  180. Returns:
  181. 保存的文件路径列表
  182. """
  183. saved_paths = []
  184. total_pages = len(results.get('pages', []))
  185. for page in results.get('pages', []):
  186. page_idx = page.get('page_idx', 0)
  187. # 根据输入类型决定命名
  188. if is_pdf or total_pages > 1:
  189. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  190. else:
  191. page_name = doc_name
  192. # 获取页面旋转角度
  193. page_rotation_angle = float(page.get('angle', 0))
  194. # 转换为 mineru_vllm_results_cell_bbox 格式
  195. page_elements = []
  196. for element in page.get('elements', []):
  197. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  198. if converted:
  199. page_elements.append(converted)
  200. # 添加丢弃元素
  201. for element in page.get('discarded_blocks', []):
  202. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  203. if converted:
  204. page_elements.append(converted)
  205. # 转换为 JSON 字符串
  206. json_content = json.dumps(page_elements, ensure_ascii=False, indent=2)
  207. # 金额数字标准化
  208. if normalize_numbers:
  209. original_content = json_content
  210. json_content = normalize_json_table(json_content)
  211. if json_content != original_content:
  212. original_path = output_dir / f"{page_name}_original.json"
  213. with open(original_path, 'w', encoding='utf-8') as f:
  214. f.write(original_content)
  215. logger.debug(f"📄 Original page JSON saved: {original_path}")
  216. # 保存 JSON
  217. json_path = output_dir / f"{page_name}.json"
  218. with open(json_path, 'w', encoding='utf-8') as f:
  219. f.write(json_content)
  220. saved_paths.append(str(json_path))
  221. logger.debug(f"📄 Page JSON saved: {json_path}")
  222. if saved_paths:
  223. logger.info(f"📄 {len(saved_paths)} page JSONs saved")
  224. return saved_paths
  225. @staticmethod
  226. def _element_to_cell_bbox_format(
  227. element: Dict[str, Any],
  228. page_idx: int,
  229. page_rotation_angle: float = 0.0
  230. ) -> Optional[Dict[str, Any]]:
  231. """
  232. 将元素转换为 mineru_vllm_results_cell_bbox 格式
  233. Args:
  234. element: 元素字典
  235. page_idx: 页面索引
  236. page_rotation_angle: 页面旋转角度(0, 90, 180, 270)
  237. """
  238. elem_type = element.get('type', '')
  239. bbox = element.get('bbox', [0, 0, 0, 0])
  240. content = element.get('content', {})
  241. # 确保 bbox 是整数列表
  242. bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]
  243. result = {
  244. 'bbox': bbox,
  245. 'page_idx': page_idx,
  246. 'page_rotation_angle': page_rotation_angle,
  247. 'reading_order': element.get('reading_order', 0)
  248. }
  249. # 文本类型
  250. if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
  251. text = content.get('text', '') if isinstance(content, dict) else str(content)
  252. result['type'] = 'text' if elem_type != 'title' else 'title'
  253. result['text'] = text
  254. if elem_type == 'title':
  255. result['text_level'] = element.get('level', 1)
  256. # 表格类型
  257. elif elem_type in ['table', 'table_body']:
  258. result['type'] = 'table'
  259. result['img_path'] = content.get('table_image_path', '')
  260. result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
  261. result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
  262. result['table_body'] = content.get('html', '')
  263. # 关键:table_cells 数组
  264. cells = content.get('cells', [])
  265. if cells:
  266. result['table_cells'] = JSONFormatters.format_table_cells(cells)
  267. # 旋转和倾斜信息
  268. if 'table_angle' in content:
  269. result['image_rotation_angle'] = float(content['table_angle'])
  270. if 'skew_angle' in content:
  271. result['skew_angle'] = float(content['skew_angle'])
  272. # 图片类型
  273. elif elem_type in ['image', 'image_body', 'figure']:
  274. result['type'] = 'image'
  275. image_filename = content.get('image_path', '')
  276. result['img_path'] = f"images/{image_filename}" if image_filename else ''
  277. result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
  278. result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))
  279. # 公式类型
  280. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  281. result['type'] = 'equation'
  282. result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
  283. result['text_format'] = 'latex'
  284. # 列表类型
  285. elif elem_type == 'list':
  286. result['type'] = 'list'
  287. result['sub_type'] = 'text'
  288. result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []
  289. # 页眉页脚
  290. elif elem_type in ['header', 'footer']:
  291. result['type'] = elem_type
  292. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  293. # 表格/图片附属文本
  294. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  295. result['type'] = elem_type
  296. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  297. # 丢弃元素
  298. elif elem_type in ['discarded', 'abandon']:
  299. result['type'] = 'discarded'
  300. result['original_category'] = element.get('original_category', 'unknown')
  301. result['text'] = content.get('text', '') if isinstance(content, dict) else ''
  302. else:
  303. return None
  304. return result
  305. @staticmethod
  306. def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
  307. """
  308. 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式
  309. 输出格式:
  310. {
  311. "type": "table_cell",
  312. "text": "单元格内容",
  313. "matched_text": "OCR匹配文本",
  314. "bbox": [x1, y1, x2, y2],
  315. "row": 1,
  316. "col": 1,
  317. "score": 100.0,
  318. "paddle_bbox_indices": [0, 1]
  319. }
  320. """
  321. formatted_cells = []
  322. for cell in cells:
  323. formatted_cell = {
  324. 'type': 'table_cell',
  325. 'text': cell.get('text', ''),
  326. 'matched_text': cell.get('matched_text', cell.get('text', '')),
  327. 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
  328. 'row': cell.get('row', 0),
  329. 'col': cell.get('col', 0),
  330. 'score': float(cell.get('score', 100.0)),
  331. 'paddle_bbox_indices': cell.get('paddle_bbox_indices',
  332. cell.get('paddle_indices', []))
  333. }
  334. formatted_cells.append(formatted_cell)
  335. return formatted_cells
  336. @staticmethod
  337. def _ensure_list(value) -> List:
  338. """确保值是列表"""
  339. if value is None:
  340. return []
  341. if isinstance(value, str):
  342. return [value] if value else []
  343. if isinstance(value, list):
  344. return value
  345. return [str(value)]