# json_formatters.py
"""
JSON formatting utilities.

Provides JSON output formatting:
- MinerU middle.json conversion
- mineru_vllm_results_cell_bbox conversion
- table cell formatting
- monetary number normalization (full-width → half-width)
"""
  9. import json
  10. import sys
  11. import numpy as np
  12. from pathlib import Path
  13. from typing import Dict, Any, List, Optional
  14. from loguru import logger
# Import the number-normalization helper
  16. from .normalize_financial_numbers import normalize_json_table
  17. class NumpyEncoder(json.JSONEncoder):
  18. """自定义JSON编码器,处理numpy类型"""
  19. def default(self, obj):
  20. if isinstance(obj, np.integer):
  21. return int(obj)
  22. elif isinstance(obj, np.floating):
  23. return float(obj)
  24. elif isinstance(obj, np.ndarray):
  25. return obj.tolist()
  26. return super().default(obj)
  27. class JSONFormatters:
  28. """JSON 格式化工具类"""
  29. @staticmethod
  30. def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
  31. """
  32. 转换为 MinerU 标准 middle.json 格式
  33. 用于 vlm_union_make 生成 Markdown
  34. Args:
  35. results: 处理结果
  36. Returns:
  37. MinerU middle.json 格式的字典
  38. """
  39. middle_json = {
  40. "pdf_info": [],
  41. "_backend": "vlm",
  42. "_scene": results.get('scene', 'unknown'),
  43. "_version_name": "2.5.0"
  44. }
  45. for page in results.get('pages', []):
  46. page_info = {
  47. 'page_idx': page['page_idx'],
  48. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
  49. 'angle': page.get('angle', 0),
  50. 'para_blocks': [],
  51. 'discarded_blocks': []
  52. }
  53. # 处理普通元素
  54. for element in page.get('elements', []):
  55. block = JSONFormatters._element_to_middle_block(element)
  56. if block:
  57. elem_type = element.get('type', '')
  58. if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
  59. page_info['discarded_blocks'].append(block)
  60. else:
  61. page_info['para_blocks'].append(block)
  62. # 处理丢弃元素(从 discarded_blocks 字段)
  63. for element in page.get('discarded_blocks', []):
  64. block = JSONFormatters._element_to_middle_block(element)
  65. if block:
  66. page_info['discarded_blocks'].append(block)
  67. middle_json['pdf_info'].append(page_info)
  68. return middle_json
  69. @staticmethod
  70. def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  71. """
  72. 将元素转换为 MinerU middle.json block 格式
  73. MinerU 期望的嵌套结构:
  74. - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
  75. - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }
  76. """
  77. elem_type = element.get('type', '')
  78. bbox = element.get('bbox', [0, 0, 0, 0])
  79. content = element.get('content', {})
  80. block = {
  81. 'type': elem_type,
  82. 'bbox': bbox,
  83. 'angle': element.get('angle', 0),
  84. 'reading_order': element.get('reading_order', 0),
  85. 'lines': []
  86. }
  87. # 文本类型
  88. if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
  89. text = content.get('text', '') if isinstance(content, dict) else str(content)
  90. if text:
  91. block['lines'] = [{
  92. 'bbox': bbox,
  93. 'spans': [{
  94. 'bbox': bbox,
  95. 'type': 'text',
  96. 'content': text
  97. }]
  98. }]
  99. # 表格类型 - 嵌套结构
  100. elif elem_type in ['table', 'table_body']:
  101. table_html = content.get('html', '')
  102. cells = content.get('cells', [])
  103. block['type'] = 'table'
  104. block['blocks'] = [{
  105. 'type': 'table_body',
  106. 'bbox': bbox,
  107. 'angle': 0,
  108. 'lines': [{
  109. 'bbox': bbox,
  110. 'spans': [{
  111. 'bbox': bbox,
  112. 'type': 'table',
  113. 'html': table_html,
  114. 'cells': cells
  115. }]
  116. }]
  117. }]
  118. # 图片类型 - 嵌套结构
  119. elif elem_type in ['image', 'image_body', 'figure']:
  120. block['type'] = 'image'
  121. block['blocks'] = [{
  122. 'type': 'image_body',
  123. 'bbox': bbox,
  124. 'angle': element.get('angle', 0),
  125. 'lines': [{
  126. 'bbox': bbox,
  127. 'spans': [{
  128. 'bbox': bbox,
  129. 'type': 'image',
  130. 'image_path': content.get('image_path', ''),
  131. 'description': content.get('description', '')
  132. }]
  133. }]
  134. }]
  135. # 公式类型
  136. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  137. latex = content.get('latex', '')
  138. block['lines'] = [{
  139. 'bbox': bbox,
  140. 'spans': [{
  141. 'bbox': bbox,
  142. 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
  143. 'content': latex
  144. }]
  145. }]
  146. # 表格/图片附属文本
  147. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  148. text = content.get('text', '') if isinstance(content, dict) else str(content)
  149. if text:
  150. block['lines'] = [{
  151. 'bbox': bbox,
  152. 'spans': [{
  153. 'bbox': bbox,
  154. 'type': 'text',
  155. 'content': text
  156. }]
  157. }]
  158. # 丢弃类型
  159. elif elem_type in ['abandon', 'discarded']:
  160. block['type'] = 'abandon'
  161. text = content.get('text', '') if isinstance(content, dict) else str(content)
  162. if text:
  163. block['lines'] = [{
  164. 'bbox': bbox,
  165. 'spans': [{
  166. 'bbox': bbox,
  167. 'type': 'text',
  168. 'content': text
  169. }]
  170. }]
  171. return block
  172. @staticmethod
  173. def save_page_jsons(
  174. results: Dict[str, Any],
  175. output_dir: Path,
  176. doc_name: str,
  177. is_pdf: bool = True,
  178. normalize_numbers: bool = True
  179. ) -> List[str]:
  180. """
  181. 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式)
  182. 命名规则:
  183. - PDF输入: 文件名_page_001.json
  184. - 图片输入(单页): 文件名.json
  185. Args:
  186. results: 处理结果
  187. output_dir: 输出目录
  188. doc_name: 文档名称
  189. is_pdf: 是否为 PDF 输入
  190. normalize_numbers: 是否标准化金额数字(全角→半角)
  191. Returns:
  192. 保存的文件路径列表
  193. """
  194. saved_paths = []
  195. total_pages = len(results.get('pages', []))
  196. for page in results.get('pages', []):
  197. page_idx = page.get('page_idx', 0)
  198. # 根据输入类型决定命名
  199. if is_pdf or total_pages > 1:
  200. page_name = f"{doc_name}_page_{page_idx + 1:03d}"
  201. else:
  202. page_name = doc_name
  203. # 获取页面旋转角度
  204. page_rotation_angle = float(page.get('angle', 0))
  205. # 转换为 mineru_vllm_results_cell_bbox 格式
  206. page_elements = []
  207. for element in page.get('elements', []):
  208. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  209. if converted:
  210. page_elements.append(converted)
  211. # 添加丢弃元素
  212. for element in page.get('discarded_blocks', []):
  213. converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx, page_rotation_angle)
  214. if converted:
  215. page_elements.append(converted)
  216. # 转换为 JSON 字符串
  217. json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
  218. # 金额数字标准化
  219. if normalize_numbers:
  220. original_content = json_content
  221. json_content = normalize_json_table(json_content)
  222. if json_content != original_content:
  223. original_path = output_dir / f"{page_name}_original.json"
  224. with open(original_path, 'w', encoding='utf-8') as f:
  225. f.write(original_content)
  226. logger.debug(f"📄 Original page JSON saved: {original_path}")
  227. # 保存 JSON
  228. json_path = output_dir / f"{page_name}.json"
  229. with open(json_path, 'w', encoding='utf-8') as f:
  230. f.write(json_content)
  231. saved_paths.append(str(json_path))
  232. logger.debug(f"📄 Page JSON saved: {json_path}")
  233. if saved_paths:
  234. logger.info(f"📄 {len(saved_paths)} page JSONs saved")
  235. return saved_paths
  236. @staticmethod
  237. def _element_to_cell_bbox_format(
  238. element: Dict[str, Any],
  239. page_idx: int,
  240. page_rotation_angle: float = 0.0
  241. ) -> Optional[Dict[str, Any]]:
  242. """
  243. 将元素转换为 mineru_vllm_results_cell_bbox 格式
  244. Args:
  245. element: 元素字典
  246. page_idx: 页面索引
  247. page_rotation_angle: 页面旋转角度(0, 90, 180, 270)
  248. """
  249. elem_type = element.get('type', '')
  250. bbox = element.get('bbox', [0, 0, 0, 0])
  251. content = element.get('content', {})
  252. # 确保 bbox 是整数列表
  253. bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]
  254. result = {
  255. 'bbox': bbox,
  256. 'page_idx': page_idx,
  257. 'page_rotation_angle': page_rotation_angle,
  258. 'reading_order': element.get('reading_order', 0)
  259. }
  260. # 文本类型
  261. if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
  262. text = content.get('text', '') if isinstance(content, dict) else str(content)
  263. result['type'] = 'text' if elem_type != 'title' else 'title'
  264. result['text'] = text
  265. if elem_type == 'title':
  266. result['text_level'] = element.get('level', 1)
  267. # 表格类型
  268. elif elem_type in ['table', 'table_body']:
  269. result['type'] = 'table'
  270. result['img_path'] = content.get('table_image_path', '')
  271. result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
  272. result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
  273. result['table_body'] = content.get('html', '')
  274. # 关键:table_cells 数组
  275. cells = content.get('cells', [])
  276. if cells:
  277. result['table_cells'] = JSONFormatters.format_table_cells(cells)
  278. # 旋转和倾斜信息
  279. if 'table_angle' in content:
  280. result['image_rotation_angle'] = float(content['table_angle'])
  281. if 'skew_angle' in content:
  282. result['skew_angle'] = float(content['skew_angle'])
  283. # 图片类型
  284. elif elem_type in ['image', 'image_body', 'figure']:
  285. result['type'] = 'image'
  286. image_filename = content.get('image_path', '')
  287. result['img_path'] = f"images/{image_filename}" if image_filename else ''
  288. result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
  289. result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))
  290. # 公式类型
  291. elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
  292. result['type'] = 'equation'
  293. result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
  294. result['text_format'] = 'latex'
  295. # 列表类型
  296. elif elem_type == 'list':
  297. result['type'] = 'list'
  298. result['sub_type'] = 'text'
  299. result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []
  300. # 页眉页脚
  301. elif elem_type in ['header', 'footer']:
  302. result['type'] = elem_type
  303. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  304. # 表格/图片附属文本
  305. elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
  306. result['type'] = elem_type
  307. result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)
  308. # 丢弃元素
  309. elif elem_type in ['discarded', 'abandon']:
  310. result['type'] = 'discarded'
  311. result['original_category'] = element.get('original_category', 'unknown')
  312. result['text'] = content.get('text', '') if isinstance(content, dict) else ''
  313. else:
  314. return None
  315. return result
  316. @staticmethod
  317. def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
  318. """
  319. 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式
  320. 输出格式:
  321. {
  322. "type": "table_cell",
  323. "text": "单元格内容",
  324. "matched_text": "OCR匹配文本",
  325. "bbox": [x1, y1, x2, y2],
  326. "row": 1,
  327. "col": 1,
  328. "score": 100.0,
  329. "paddle_bbox_indices": [0, 1]
  330. }
  331. """
  332. formatted_cells = []
  333. for cell in cells:
  334. formatted_cell = {
  335. 'type': 'table_cell',
  336. 'text': cell.get('text', ''),
  337. 'matched_text': cell.get('matched_text', cell.get('text', '')),
  338. 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
  339. 'row': cell.get('row', 0),
  340. 'col': cell.get('col', 0),
  341. 'score': float(cell.get('score', 100.0)),
  342. 'paddle_bbox_indices': cell.get('paddle_bbox_indices',
  343. cell.get('paddle_indices', []))
  344. }
  345. formatted_cells.append(formatted_cell)
  346. return formatted_cells
  347. @staticmethod
  348. def _ensure_list(value) -> List:
  349. """确保值是列表"""
  350. if value is None:
  351. return []
  352. if isinstance(value, str):
  353. return [value] if value else []
  354. if isinstance(value, list):
  355. return value
  356. return [str(value)]