json_formatters.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. """
  2. JSON 格式化工具模块
  3. 提供 JSON 输出格式化功能:
  4. - MinerU middle.json 格式转换
  5. - mineru_vllm_results_cell_bbox 格式转换
  6. - 表格单元格格式化
  7. - 金额数字标准化(全角→半角)
  8. """
  9. import json
  10. import sys
  11. import numpy as np
  12. from pathlib import Path
  13. from typing import Dict, Any, List, Optional
  14. from loguru import logger
  15. # 导入数字标准化工具
  16. from .normalize_financial_numbers import normalize_json_table
  17. class NumpyEncoder(json.JSONEncoder):
  18. """自定义JSON编码器,处理numpy类型"""
  19. def default(self, obj):
  20. if isinstance(obj, np.integer):
  21. return int(obj)
  22. elif isinstance(obj, np.floating):
  23. return float(obj)
  24. elif isinstance(obj, np.ndarray):
  25. return obj.tolist()
  26. return super().default(obj)
  27. class JSONFormatters:
  28. """JSON 格式化工具类"""
  29. @staticmethod
  30. def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
  31. """
  32. 转换为 MinerU 标准 middle.json 格式
  33. 用于 vlm_union_make 生成 Markdown
  34. Args:
  35. results: 处理结果
  36. Returns:
  37. MinerU middle.json 格式的字典
  38. """
  39. middle_json = {
  40. "pdf_info": [],
  41. "_backend": "vlm",
  42. "_scene": results.get('scene', 'unknown'),
  43. "_version_name": "2.5.0"
  44. }
  45. for page in results.get('pages', []):
  46. page_info = {
  47. 'page_idx': page['page_idx'],
  48. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
  49. 'angle': page.get('angle', 0),
  50. 'para_blocks': [],
  51. 'discarded_blocks': []
  52. }
  53. # 处理普通元素
  54. for element in page.get('elements', []):
  55. block = JSONFormatters._element_to_middle_block(element)
  56. if block:
  57. elem_type = element.get('type', '')
  58. if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
  59. page_info['discarded_blocks'].append(block)
  60. else:
  61. page_info['para_blocks'].append(block)
  62. # 处理丢弃元素(从 discarded_blocks 字段)
  63. for element in page.get('discarded_blocks', []):
  64. block = JSONFormatters._element_to_middle_block(element)
  65. if block:
  66. page_info['discarded_blocks'].append(block)
  67. middle_json['pdf_info'].append(page_info)
  68. return middle_json
  69. @staticmethod
  70. def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  71. """
  72. 将元素转换为 MinerU middle.json block 格式
  73. MinerU 期望的嵌套结构:
  74. - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
  75. - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }
  76. """
  77. elem_type = element.get('type', '')
  78. bbox = element.get('bbox', [0, 0, 0, 0])
  79. content = element.get('content', {})
  80. block = {
  81. 'type': elem_type,
  82. 'bbox': bbox,
  83. 'angle': element.get('angle', 0),
  84. 'reading_order': element.get('reading_order', 0),
  85. 'lines': []
  86. }
  87. # 文本类型
  88. if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
  89. text = content.get('text', '') if isinstance(content, dict) else str(content)
  90. if text:
  91. block['lines'] = [{
  92. 'bbox': bbox,
  93. 'spans': [{
  94. 'bbox': bbox,
  95. 'type': 'text',
  96. 'content': text
  97. }]
  98. }]
  99. # 表格类型 - 嵌套结构
  100. elif elem_type in ['table', 'table_body']:
  101. table_html = content.get('html', '')
  102. cells = content.get('cells', [])
  103. block['type'] = 'table'
  104. block['blocks'] = [{
  105. 'type': 'table_body',
  106. 'bbox': bbox,
  107. 'angle': 0,
  108. 'lines': [{
  109. 'bbox': bbox,
  110. 'spans': [{
  111. 'bbox': bbox,
  112. 'type': 'table',
  113. 'html': table_html,
  114. 'cells': cells
  115. }]
  116. }]
  117. }]
  118. # 图片类型 - 嵌套结构
  119. elif elem_type in ['image', 'image_body', 'figure']:
  120. block['type'] = 'image'
  121. block['blocks'] = [{
  122. 'type': 'image_body',
  123. 'bbox': bbox,
  124. 'angle': element.get('angle', 0),
  125. 'lines': [{
  126. 'bbox': bbox,
  127. 'spans': [{
  128. 'bbox': bbox,
  129. 'type': 'image',
  130. 'image_path': content.get('image_path', ''),
  131. 'description': content.get('description', '')
  132. }]
  133. }]
  134. }]
  135. # 公式类型
  136. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  137. latex = content.get('latex', '')
  138. block['lines'] = [{
  139. 'bbox': bbox,
  140. 'spans': [{
  141. 'bbox': bbox,
  142. 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
  143. 'content': latex
  144. }]
  145. }]
  146. # 表格/图片附属文本
  147. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  148. text = content.get('text', '') if isinstance(content, dict) else str(content)
  149. if text:
  150. block['lines'] = [{
  151. 'bbox': bbox,
  152. 'spans': [{
  153. 'bbox': bbox,
  154. 'type': 'text',
  155. 'content': text
  156. }]
  157. }]
  158. # 印章类型
  159. elif elem_type == 'seal':
  160. text = content.get('text', '') if isinstance(content, dict) else str(content)
  161. confidence = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
  162. block['lines'] = [{
  163. 'bbox': bbox,
  164. 'spans': [{
  165. 'bbox': bbox,
  166. 'type': 'seal',
  167. 'content': text,
  168. 'confidence': confidence
  169. }]
  170. }]
  171. # 丢弃类型
  172. elif elem_type in ['abandon', 'discarded']:
  173. block['type'] = 'abandon'
  174. text = content.get('text', '') if isinstance(content, dict) else str(content)
  175. if text:
  176. block['lines'] = [{
  177. 'bbox': bbox,
  178. 'spans': [{
  179. 'bbox': bbox,
  180. 'type': 'text',
  181. 'content': text
  182. }]
  183. }]
  184. return block
  185. @staticmethod
  186. def save_page_jsons(
  187. results: Dict[str, Any],
  188. output_dir: Path,
  189. doc_name: str,
  190. is_pdf: bool = True,
  191. normalize_numbers: bool = True
  192. ) -> List[str]:
  193. """
  194. 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式)
  195. 命名规则:
  196. - PDF输入: 文件名_page_001.json
  197. - 图片输入(单页): 文件名.json
  198. Args:
  199. results: 处理结果
  200. output_dir: 输出目录
  201. doc_name: 文档名称
  202. is_pdf: 是否为 PDF 输入
  203. normalize_numbers: 是否标准化金额数字(全角→半角)
  204. Returns:
  205. 保存的文件路径列表
  206. """
  207. saved_paths = []
  208. total_pages = len(results.get('pages', []))
  209. for page in results.get('pages', []):
  210. page_idx = page.get('page_idx', 0)
  211. # 根据输入类型决定命名
  212. if is_pdf or total_pages > 1:
  213. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  214. else:
  215. page_name = doc_name
  216. # 获取页面旋转角度
  217. page_rotation_angle = float(page.get('angle', 0))
  218. # 转换为 mineru_vllm_results_cell_bbox 格式
  219. page_elements = []
  220. for element in page.get('elements', []):
  221. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  222. if converted:
  223. page_elements.append(converted)
  224. # 添加丢弃元素
  225. for element in page.get('discarded_blocks', []):
  226. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  227. if converted:
  228. page_elements.append(converted)
  229. # 转换为 JSON 字符串
  230. json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
  231. # 金额数字标准化
  232. if normalize_numbers:
  233. original_content = json_content
  234. json_content = normalize_json_table(json_content)
  235. if json_content != original_content:
  236. original_path = output_dir / f"{page_name}_original.json"
  237. with open(original_path, 'w', encoding='utf-8') as f:
  238. f.write(original_content)
  239. logger.debug(f"📄 Original page JSON saved: {original_path}")
  240. # 保存 JSON
  241. json_path = output_dir / f"{page_name}.json"
  242. with open(json_path, 'w', encoding='utf-8') as f:
  243. f.write(json_content)
  244. saved_paths.append(str(json_path))
  245. logger.debug(f"📄 Page JSON saved: {json_path}")
  246. if saved_paths:
  247. logger.info(f"📄 {len(saved_paths)} page JSONs saved")
  248. return saved_paths
  249. @staticmethod
  250. def _element_to_cell_bbox_format(
  251. element: Dict[str, Any],
  252. page_idx: int,
  253. page_rotation_angle: float = 0.0
  254. ) -> Optional[Dict[str, Any]]:
  255. """
  256. 将元素转换为 mineru_vllm_results_cell_bbox 格式
  257. Args:
  258. element: 元素字典
  259. page_idx: 页面索引
  260. page_rotation_angle: 页面旋转角度(0, 90, 180, 270)
  261. """
  262. elem_type = element.get('type', '')
  263. bbox = element.get('bbox', [0, 0, 0, 0])
  264. content = element.get('content', {})
  265. # 确保 bbox 是整数列表
  266. bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]
  267. result = {
  268. 'bbox': bbox,
  269. 'page_idx': page_idx,
  270. 'page_rotation_angle': page_rotation_angle,
  271. 'reading_order': element.get('reading_order', 0)
  272. }
  273. # 文本类型
  274. if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
  275. text = content.get('text', '') if isinstance(content, dict) else str(content)
  276. result['type'] = 'text' if elem_type != 'title' else 'title'
  277. result['text'] = text
  278. if elem_type == 'title':
  279. result['text_level'] = element.get('level', 1)
  280. # 表格类型
  281. elif elem_type in ['table', 'table_body']:
  282. result['type'] = 'table'
  283. result['img_path'] = content.get('table_image_path', '')
  284. result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
  285. result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
  286. result['table_body'] = content.get('html', '')
  287. # 关键:table_cells 数组
  288. cells = content.get('cells', [])
  289. if cells:
  290. result['table_cells'] = JSONFormatters.format_table_cells(cells)
  291. # 旋转和倾斜信息
  292. if 'table_angle' in content:
  293. result['image_rotation_angle'] = float(content['table_angle'])
  294. if 'skew_angle' in content:
  295. result['skew_angle'] = float(content['skew_angle'])
  296. # 图片类型
  297. elif elem_type in ['image', 'image_body', 'figure']:
  298. result['type'] = 'image'
  299. image_filename = content.get('image_path', '')
  300. result['img_path'] = f"images/{image_filename}" if image_filename else ''
  301. result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
  302. result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))
  303. # 公式类型
  304. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  305. result['type'] = 'equation'
  306. result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
  307. result['text_format'] = 'latex'
  308. # 列表类型
  309. elif elem_type == 'list':
  310. result['type'] = 'list'
  311. result['sub_type'] = 'text'
  312. result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []
  313. # 页眉页脚
  314. elif elem_type in ['header', 'footer']:
  315. result['type'] = elem_type
  316. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  317. # 表格/图片附属文本
  318. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  319. result['type'] = elem_type
  320. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  321. # 印章类型
  322. elif elem_type == 'seal':
  323. result['type'] = 'seal'
  324. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  325. result['confidence'] = content.get('confidence', 0.0) if isinstance(content, dict) else 0.0
  326. # 丢弃元素
  327. elif elem_type in ['discarded', 'abandon']:
  328. result['type'] = 'discarded'
  329. result['original_category'] = element.get('original_category', 'unknown')
  330. result['text'] = content.get('text', '') if isinstance(content, dict) else ''
  331. else:
  332. return None
  333. return result
  334. @staticmethod
  335. def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
  336. """
  337. 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式
  338. 输出格式:
  339. {
  340. "type": "table_cell",
  341. "text": "单元格内容",
  342. "matched_text": "OCR匹配文本",
  343. "bbox": [x1, y1, x2, y2],
  344. "row": 1,
  345. "col": 1,
  346. "score": 100.0,
  347. "paddle_bbox_indices": [0, 1]
  348. }
  349. """
  350. formatted_cells = []
  351. for cell in cells:
  352. formatted_cell = {
  353. 'type': 'table_cell',
  354. 'text': cell.get('text', ''),
  355. 'matched_text': cell.get('matched_text', cell.get('text', '')),
  356. 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
  357. 'row': cell.get('row', 0),
  358. 'col': cell.get('col', 0),
  359. 'score': float(cell.get('score', 100.0)),
  360. 'paddle_bbox_indices': cell.get('paddle_bbox_indices',
  361. cell.get('paddle_indices', []))
  362. }
  363. formatted_cells.append(formatted_cell)
  364. return formatted_cells
  365. @staticmethod
  366. def _ensure_list(value) -> List:
  367. """确保值是列表"""
  368. if value is None:
  369. return []
  370. if isinstance(value, str):
  371. return [value] if value else []
  372. if isinstance(value, list):
  373. return value
  374. return [str(value)]