""" JSON 格式化工具模块 提供 JSON 输出格式化功能: - MinerU middle.json 格式转换 - mineru_vllm_results_cell_bbox 格式转换 - 表格单元格格式化 """ import json from pathlib import Path from typing import Dict, Any, List, Optional from loguru import logger class JSONFormatters: """JSON 格式化工具类""" @staticmethod def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]: """ 转换为 MinerU 标准 middle.json 格式 用于 vlm_union_make 生成 Markdown Args: results: 处理结果 Returns: MinerU middle.json 格式的字典 """ middle_json = { "pdf_info": [], "_backend": "vlm", "_scene": results.get('scene', 'unknown'), "_version_name": "2.5.0" } for page in results.get('pages', []): page_info = { 'page_idx': page['page_idx'], 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]), 'angle': page.get('angle', 0), 'para_blocks': [], 'discarded_blocks': [] } # 处理普通元素 for element in page.get('elements', []): block = JSONFormatters._element_to_middle_block(element) if block: elem_type = element.get('type', '') if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']: page_info['discarded_blocks'].append(block) else: page_info['para_blocks'].append(block) # 处理丢弃元素(从 discarded_blocks 字段) for element in page.get('discarded_blocks', []): block = JSONFormatters._element_to_middle_block(element) if block: page_info['discarded_blocks'].append(block) middle_json['pdf_info'].append(page_info) return middle_json @staticmethod def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]: """ 将元素转换为 MinerU middle.json block 格式 MinerU 期望的嵌套结构: - image 类型: { type: "image", blocks: [{ type: "image_body", lines: [...] }] } - table 类型: { type: "table", blocks: [{ type: "table_body", lines: [...] }] } """ elem_type = element.get('type', '') bbox = element.get('bbox', [0, 0, 0, 0]) content = element.get('content', {}) block = { 'type': elem_type, 'bbox': bbox, 'angle': element.get('angle', 0), 'reading_order': element.get('reading_order', 0), 'lines': [] } # 文本类型 if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']: text = content.get('text', '') if isinstance(content, dict) else str(content) if text: block['lines'] = [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'text', 'content': text }] }] # 表格类型 - 嵌套结构 elif elem_type in ['table', 'table_body']: table_html = content.get('html', '') cells = content.get('cells', []) block['type'] = 'table' block['blocks'] = [{ 'type': 'table_body', 'bbox': bbox, 'angle': 0, 'lines': [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'table', 'html': table_html, 'cells': cells }] }] }] # 图片类型 - 嵌套结构 elif elem_type in ['image', 'image_body', 'figure']: block['type'] = 'image' block['blocks'] = [{ 'type': 'image_body', 'bbox': bbox, 'angle': element.get('angle', 0), 'lines': [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'image', 'image_path': content.get('image_path', ''), 'description': content.get('description', '') }] }] }] # 公式类型 elif elem_type in ['interline_equation', 'inline_equation', 'equation']: latex = content.get('latex', '') block['lines'] = [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation', 'content': latex }] }] # 表格/图片附属文本 elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']: text = content.get('text', '') if isinstance(content, dict) else str(content) if text: block['lines'] = [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'text', 'content': text }] }] # 丢弃类型 elif elem_type in ['abandon', 'discarded']: block['type'] = 'abandon' text = content.get('text', '') if isinstance(content, dict) else str(content) if text: block['lines'] = [{ 'bbox': bbox, 'spans': [{ 'bbox': bbox, 'type': 'text', 'content': text }] }] return block @staticmethod def save_page_jsons( results: Dict[str, Any], output_dir: Path, doc_name: str ) -> List[str]: """ 保存每页独立的 JSON(mineru_vllm_results_cell_bbox 格式) Args: results: 处理结果 output_dir: 输出目录 doc_name: 文档名称 Returns: 保存的文件路径列表 """ saved_paths = [] for page in results.get('pages', []): page_idx = page.get('page_idx', 0) page_name = f"{doc_name}_page_{page_idx + 1:03d}" # 转换为 mineru_vllm_results_cell_bbox 格式 page_elements = [] for element in page.get('elements', []): converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx) if converted: page_elements.append(converted) # 添加丢弃元素 for element in page.get('discarded_blocks', []): converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx) if converted: page_elements.append(converted) # 保存 JSON json_path = output_dir / f"{page_name}.json" with open(json_path, 'w', encoding='utf-8') as f: json.dump(page_elements, f, ensure_ascii=False, indent=2) saved_paths.append(str(json_path)) logger.debug(f"📄 Page JSON saved: {json_path}") if saved_paths: logger.info(f"📄 {len(saved_paths)} page JSONs saved") return saved_paths @staticmethod def _element_to_cell_bbox_format( element: Dict[str, Any], page_idx: int ) -> Optional[Dict[str, Any]]: """ 将元素转换为 mineru_vllm_results_cell_bbox 格式 """ elem_type = element.get('type', '') bbox = element.get('bbox', [0, 0, 0, 0]) content = element.get('content', {}) # 确保 bbox 是整数列表 bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0] result = { 'bbox': bbox, 'page_idx': page_idx, 'reading_order': element.get('reading_order', 0) } # 文本类型 if elem_type in ['text', 'title', 'ref_text', 'ocr_text']: text = content.get('text', '') if isinstance(content, dict) else str(content) result['type'] = 'text' if elem_type != 'title' else 'title' result['text'] = text if elem_type == 'title': result['text_level'] = element.get('level', 1) # 表格类型 elif elem_type in ['table', 'table_body']: result['type'] = 'table' result['img_path'] = content.get('table_image_path', '') result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', [])) result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', [])) result['table_body'] = content.get('html', '') # 关键:table_cells 数组 cells = content.get('cells', []) if cells: result['table_cells'] = JSONFormatters.format_table_cells(cells) # 旋转和倾斜信息 if 'table_angle' in content: result['image_rotation_angle'] = float(content['table_angle']) if 'skew_angle' in content: result['skew_angle'] = float(content['skew_angle']) # 图片类型 elif elem_type in ['image', 'image_body', 'figure']: result['type'] = 'image' image_filename = content.get('image_path', '') result['img_path'] = f"images/{image_filename}" if image_filename else '' result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', [])) result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', [])) # 公式类型 elif elem_type in ['interline_equation', 'inline_equation', 'equation']: result['type'] = 'equation' result['text'] = content.get('latex', '') if isinstance(content, dict) else '' result['text_format'] = 'latex' # 列表类型 elif elem_type == 'list': result['type'] = 'list' result['sub_type'] = 'text' result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else [] # 页眉页脚 elif elem_type in ['header', 'footer']: result['type'] = elem_type result['text'] = content.get('text', '') if isinstance(content, dict) else str(content) # 表格/图片附属文本 elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']: result['type'] = elem_type result['text'] = content.get('text', '') if isinstance(content, dict) else str(content) # 丢弃元素 elif elem_type in ['discarded', 'abandon']: result['type'] = 'discarded' result['original_category'] = element.get('original_category', 'unknown') result['text'] = content.get('text', '') if isinstance(content, dict) else '' else: return None return result @staticmethod def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]: """ 格式化表格单元格为 mineru_vllm_results_cell_bbox 格式 输出格式: { "type": "table_cell", "text": "单元格内容", "matched_text": "OCR匹配文本", "bbox": [x1, y1, x2, y2], "row": 1, "col": 1, "score": 100.0, "paddle_bbox_indices": [0, 1] } """ formatted_cells = [] for cell in cells: formatted_cell = { 'type': 'table_cell', 'text': cell.get('text', ''), 'matched_text': cell.get('matched_text', cell.get('text', '')), 'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]], 'row': cell.get('row', 0), 'col': cell.get('col', 0), 'score': float(cell.get('score', 100.0)), 'paddle_bbox_indices': cell.get('paddle_bbox_indices', cell.get('paddle_indices', [])) } formatted_cells.append(formatted_cell) return formatted_cells @staticmethod def _ensure_list(value) -> List: """确保值是列表""" if value is None: return [] if isinstance(value, str): return [value] if value else [] if isinstance(value, list): return value return [str(value)]