||
- """
- JSON 格式化工具模块
- 提供 JSON 输出格式化功能:
- - MinerU middle.json 格式转换
- - mineru_vllm_results_cell_bbox 格式转换
- - 表格单元格格式化
- """
- import json
- from pathlib import Path
- from typing import Dict, Any, List, Optional
- from loguru import logger
class JSONFormatters:
    """Static helpers that reshape processing results into JSON output formats.

    Two output shapes are produced:

    - a MinerU-style ``middle.json`` dictionary (``convert_to_middle_json``);
    - per-page ``mineru_vllm_results_cell_bbox`` element lists
      (``save_page_jsons``), including formatted table cells.

    All methods are static; the class is only a namespace.
    """

    # Element types that are routed to a page's ``discarded_blocks`` list
    # when building middle.json.
    _DISCARDED_TYPES = frozenset({
        'header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded',
    })

    # Element types whose payload is plain text in middle.json.
    _MIDDLE_TEXT_TYPES = frozenset({
        'text', 'title', 'ref_text', 'header', 'footer', 'ocr_text',
        'table_caption', 'table_footnote', 'image_caption', 'image_footnote',
    })

    _TABLE_TYPES = frozenset({'table', 'table_body'})
    _IMAGE_TYPES = frozenset({'image', 'image_body', 'figure'})
    _EQUATION_TYPES = frozenset({'interline_equation', 'inline_equation', 'equation'})

    @staticmethod
    def _content_dict(content: Any) -> Dict[str, Any]:
        """Return ``content`` if it is a dict, otherwise an empty dict.

        Lets branches that need keyed access (html, cells, latex, ...) run
        safely when an element carries ``None`` or a bare string as content.
        """
        return content if isinstance(content, dict) else {}

    @staticmethod
    def _content_text(content: Any) -> Any:
        """Extract the text payload of an element's ``content``.

        A dict yields its ``'text'`` entry (default ``''``), ``None`` yields
        ``''`` (rather than the literal string ``'None'``), and anything else
        is stringified.
        """
        if content is None:
            return ''
        if isinstance(content, dict):
            return content.get('text', '')
        return str(content)

    @staticmethod
    def _single_span_lines(bbox: Any, span_fields: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build a one-line, one-span ``lines`` list sharing the given bbox."""
        return [{
            'bbox': bbox,
            'spans': [{'bbox': bbox, **span_fields}]
        }]

    @staticmethod
    def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
        """Convert processing results into a MinerU-style ``middle.json`` dict.

        Args:
            results: Processing results. ``results['pages']`` is a list of page
                dicts; each page may carry ``page_idx``, ``image_shape``,
                ``angle``, ``elements`` and ``discarded_blocks``.

        Returns:
            A dict with ``pdf_info`` (one entry per page) plus the
            ``_backend``, ``_scene`` and ``_version_name`` metadata keys.
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for position, page in enumerate(results.get('pages') or []):
            shape = page.get('image_shape')
            if shape is None:
                shape = [0, 0]

            page_info = {
                # Fall back to the page's position when no index was recorded.
                'page_idx': page.get('page_idx', position),
                # The first two entries of image_shape are reversed to form
                # page_size.
                'page_size': list(shape[:2][::-1]),
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            # Regular elements: split between kept and discarded by type.
            for element in page.get('elements') or []:
                block = JSONFormatters._element_to_middle_block(element)
                if not block:
                    continue
                if element.get('type', '') in JSONFormatters._DISCARDED_TYPES:
                    page_info['discarded_blocks'].append(block)
                else:
                    page_info['para_blocks'].append(block)

            # Elements the page already lists as discarded.
            for element in page.get('discarded_blocks') or []:
                block = JSONFormatters._element_to_middle_block(element)
                if block:
                    page_info['discarded_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Convert one element into a ``middle.json`` block.

        Tables and images use a nested layout:

        - image: ``{type: "image", blocks: [{type: "image_body", lines: [...]}]}``
        - table: ``{type: "table", blocks: [{type: "table_body", lines: [...]}]}``

        Text-like, equation and discarded elements store their payload in the
        block's own ``lines``. Unrecognised types produce a block with empty
        ``lines``.

        Args:
            element: Element dict with ``type``, ``bbox``, ``content`` and
                optionally ``angle`` / ``reading_order``.

        Returns:
            The block dict.
        """
        elem_type = element.get('type', '')
        bbox = element.get('bbox')
        if bbox is None:
            bbox = [0, 0, 0, 0]
        content = element.get('content', {})
        # Keyed view of the content; empty when content is None or a string.
        data = JSONFormatters._content_dict(content)

        block = {
            'type': elem_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),
            'reading_order': element.get('reading_order', 0),
            'lines': []
        }

        if elem_type in JSONFormatters._MIDDLE_TEXT_TYPES:
            # Plain text and table/image captions and footnotes.
            text = JSONFormatters._content_text(content)
            if text:
                block['lines'] = JSONFormatters._single_span_lines(
                    bbox, {'type': 'text', 'content': text})

        elif elem_type in JSONFormatters._TABLE_TYPES:
            block['type'] = 'table'
            block['blocks'] = [{
                'type': 'table_body',
                'bbox': bbox,
                'angle': 0,
                'lines': JSONFormatters._single_span_lines(bbox, {
                    'type': 'table',
                    'html': data.get('html', ''),
                    'cells': data.get('cells', [])
                })
            }]

        elif elem_type in JSONFormatters._IMAGE_TYPES:
            block['type'] = 'image'
            block['blocks'] = [{
                'type': 'image_body',
                'bbox': bbox,
                'angle': element.get('angle', 0),
                'lines': JSONFormatters._single_span_lines(bbox, {
                    'type': 'image',
                    'image_path': data.get('image_path', ''),
                    'description': data.get('description', '')
                })
            }]

        elif elem_type in JSONFormatters._EQUATION_TYPES:
            # Only types containing "interline" map to interline_equation;
            # a bare "equation" is emitted as inline_equation.
            span_type = 'interline_equation' if 'interline' in elem_type else 'inline_equation'
            block['lines'] = JSONFormatters._single_span_lines(
                bbox, {'type': span_type, 'content': data.get('latex', '')})

        elif elem_type in ('abandon', 'discarded'):
            # Both spellings are normalised to "abandon".
            block['type'] = 'abandon'
            text = JSONFormatters._content_text(content)
            if text:
                block['lines'] = JSONFormatters._single_span_lines(
                    bbox, {'type': 'text', 'content': text})

        return block

    @staticmethod
    def save_page_jsons(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> List[str]:
        """Write one JSON file per page in ``mineru_vllm_results_cell_bbox`` format.

        Each file is named ``{doc_name}_page_{NNN}.json`` where ``NNN`` is the
        1-based, zero-padded page number, and holds the list of converted
        elements followed by the converted discarded blocks.

        Args:
            results: Processing results containing a ``pages`` list.
            output_dir: Directory to write into; created if missing.
            doc_name: Document name used as the file-name prefix.

        Returns:
            Paths (as strings) of the files written.
        """
        saved_paths = []
        pages = results.get('pages') or []

        # Accept a plain string path and make sure the target exists, but do
        # not create a directory when there is nothing to write.
        output_dir = Path(output_dir)
        if pages:
            output_dir.mkdir(parents=True, exist_ok=True)

        for position, page in enumerate(pages):
            # Falling back to the position keeps pages without an index from
            # all being written to the same file.
            page_idx = page.get('page_idx', position)
            page_name = f"{doc_name}_page_{page_idx + 1:03d}"

            # Regular elements first, then discarded ones.
            sources = list(page.get('elements') or []) + list(page.get('discarded_blocks') or [])
            page_elements = []
            for element in sources:
                converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
                if converted:
                    page_elements.append(converted)

            json_path = output_dir / f"{page_name}.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(page_elements, f, ensure_ascii=False, indent=2)

            saved_paths.append(str(json_path))
            logger.debug(f"📄 Page JSON saved: {json_path}")

        if saved_paths:
            logger.info(f"📄 {len(saved_paths)} page JSONs saved")

        return saved_paths

    @staticmethod
    def _element_to_cell_bbox_format(
        element: Dict[str, Any],
        page_idx: int
    ) -> Optional[Dict[str, Any]]:
        """Convert one element into ``mineru_vllm_results_cell_bbox`` format.

        Args:
            element: Element dict with ``type``, ``bbox`` and ``content``.
            page_idx: Index of the page the element belongs to.

        Returns:
            The converted dict, or ``None`` when the element type is not one
            of the recognised types.
        """
        elem_type = element.get('type', '')
        content = element.get('content', {})
        # Keyed view of the content; empty when content is None or a string.
        data = JSONFormatters._content_dict(content)

        # Coordinates are truncated to integers; a missing or empty bbox
        # becomes all zeros.
        raw_bbox = element.get('bbox', [0, 0, 0, 0])
        if raw_bbox is not None and len(raw_bbox) > 0:
            bbox = [int(x) for x in raw_bbox[:4]]
        else:
            bbox = [0, 0, 0, 0]

        result = {
            'bbox': bbox,
            'page_idx': page_idx,
            'reading_order': element.get('reading_order', 0)
        }

        if elem_type in ('text', 'title', 'ref_text', 'ocr_text'):
            # Every text-like type except title is reported as "text".
            result['type'] = 'title' if elem_type == 'title' else 'text'
            result['text'] = JSONFormatters._content_text(content)
            if elem_type == 'title':
                result['text_level'] = element.get('level', 1)

        elif elem_type in JSONFormatters._TABLE_TYPES:
            result['type'] = 'table'
            result['img_path'] = data.get('table_image_path', '')
            result['table_caption'] = JSONFormatters._ensure_list(data.get('table_caption', []))
            result['table_footnote'] = JSONFormatters._ensure_list(data.get('table_footnote', []))
            result['table_body'] = data.get('html', '')

            # table_cells is only emitted when cells are present.
            cells = data.get('cells', [])
            if cells:
                result['table_cells'] = JSONFormatters.format_table_cells(cells)

            # Rotation and skew are optional.
            if 'table_angle' in data:
                result['image_rotation_angle'] = float(data['table_angle'])
            if 'skew_angle' in data:
                result['skew_angle'] = float(data['skew_angle'])

        elif elem_type in JSONFormatters._IMAGE_TYPES:
            result['type'] = 'image'
            image_filename = data.get('image_path', '')
            result['img_path'] = f"images/{image_filename}" if image_filename else ''
            result['image_caption'] = JSONFormatters._ensure_list(data.get('caption', []))
            result['image_footnote'] = JSONFormatters._ensure_list(data.get('footnote', []))

        elif elem_type in JSONFormatters._EQUATION_TYPES:
            result['type'] = 'equation'
            result['text'] = data.get('latex', '')
            result['text_format'] = 'latex'

        elif elem_type == 'list':
            result['type'] = 'list'
            result['sub_type'] = 'text'
            result['list_items'] = data.get('list_items', [])

        elif elem_type in ('header', 'footer',
                           'table_caption', 'table_footnote',
                           'image_caption', 'image_footnote'):
            # Headers, footers and table/image captions keep their own type.
            result['type'] = elem_type
            result['text'] = JSONFormatters._content_text(content)

        elif elem_type in ('discarded', 'abandon'):
            result['type'] = 'discarded'
            result['original_category'] = element.get('original_category', 'unknown')
            # Non-dict content yields an empty text here (it is not stringified).
            result['text'] = data.get('text', '')

        else:
            return None

        return result

    @staticmethod
    def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
        """Format table cells for ``mineru_vllm_results_cell_bbox`` output.

        Each output cell has the shape::

            {
                "type": "table_cell",
                "text": "cell content",
                "matched_text": "OCR matched text",
                "bbox": [x1, y1, x2, y2],
                "row": 1,
                "col": 1,
                "score": 100.0,
                "paddle_bbox_indices": [0, 1]
            }

        Missing values get defaults: ``matched_text`` falls back to ``text``,
        ``bbox`` to zeros, ``score`` to ``100.0`` and ``paddle_bbox_indices``
        to ``paddle_indices`` or an empty list. A ``None`` bbox or score is
        treated as missing.

        Args:
            cells: Raw cell dicts.

        Returns:
            The formatted cell dicts, in input order.
        """
        formatted_cells = []

        for cell in cells:
            raw_bbox = cell.get('bbox')
            if raw_bbox is None:
                raw_bbox = [0, 0, 0, 0]
            raw_score = cell.get('score')

            formatted_cells.append({
                'type': 'table_cell',
                'text': cell.get('text', ''),
                'matched_text': cell.get('matched_text', cell.get('text', '')),
                'bbox': [float(x) for x in raw_bbox[:4]],
                'row': cell.get('row', 0),
                'col': cell.get('col', 0),
                'score': 100.0 if raw_score is None else float(raw_score),
                'paddle_bbox_indices': cell.get('paddle_bbox_indices',
                                                cell.get('paddle_indices', []))
            })

        return formatted_cells

    @staticmethod
    def _ensure_list(value) -> List:
        """Coerce ``value`` to a list.

        ``None`` and ``''`` give ``[]``; a non-empty string gives a
        one-element list; a list is returned as-is; a tuple is converted
        element-wise; any other value is stringified into a one-element list.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            # Keep the items separate instead of stringifying the whole tuple.
            return list(value)
        return [str(value)]
|