| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- """
- 统一输出格式转换器
- 将不同OCR工具的结果转换为标准的MinerU格式
- """
- from typing import List, Dict
- from bs4 import BeautifulSoup
class UnifiedOutputConverter:
    """Convert merged OCR results from different tools into MinerU-style items.

    Two input layouts are recognised:

    * ``'paddleocr_vl'`` - items carrying a ``block_label`` key; these are
      rewritten into new dictionaries keyed by ``type``.
    * ``'mineru'`` - items are assumed to already be in the target layout and
      are returned untouched.
    """

    # PaddleOCR-VL ``block_label`` -> MinerU ``type``.
    # Labels missing from this table are treated as plain 'text'.
    _TYPE_MAPPING: Dict[str, str] = {
        'header': 'header',
        'footer': 'footer',
        'page_number': 'page_number',
        'paragraph_title': 'title',
        'doc_title': 'title',
        'abstract_title': 'title',
        'reference_title': 'title',
        'content_title': 'title',
        'figure_title': 'title',
        'table_title': 'title',
        'text': 'text',
        'table': 'table',
        'figure': 'image',
        'chart': 'image',
        'seal': 'image',
        'equation': 'interline_equation',
        'reference': 'ref_text',
    }

    # Heading level assigned to each title-like ``block_label``.
    # Title labels missing from this table get level 1.
    _TITLE_LEVELS: Dict[str, int] = {
        'doc_title': 1,
        'paragraph_title': 2,
        'abstract_title': 2,
        'reference_title': 2,
        'content_title': 3,
        'figure_title': 4,
        'table_title': 4,
    }

    # MinerU types whose only type-specific field is 'text'.
    _PLAIN_TEXT_TYPES = ('text', 'header', 'footer', 'page_number', 'ref_text')

    def __init__(self):
        """Create a converter; no instance state is kept."""

    def convert_to_mineru_format(self, merged_data: List[Dict],
                                 data_source: str = 'auto') -> List[Dict]:
        """Convert merged data into the MinerU layout.

        Args:
            merged_data: Merged OCR items, one dictionary per block.
            data_source: ``'paddleocr_vl'``, ``'mineru'`` or ``'auto'``.
                With ``'auto'`` the layout is detected from ``merged_data``.

        Returns:
            A new list of converted items for ``'paddleocr_vl'`` input. For
            ``'mineru'`` input the very same ``merged_data`` object is
            returned (no copy is made).

        Raises:
            ValueError: If ``data_source`` is none of the supported values.
        """
        if data_source == 'auto':
            data_source = self._detect_data_source(merged_data)

        if data_source == 'paddleocr_vl':
            return self._convert_paddleocr_vl_to_mineru(merged_data)
        if data_source == 'mineru':
            # Already in the target layout: pass through unchanged.
            return merged_data
        raise ValueError(f"Unsupported data source: {data_source}")

    def _detect_data_source(self, merged_data: List[Dict]) -> str:
        """Guess the layout of ``merged_data`` from its first item only.

        Returns ``'paddleocr_vl'`` when the first item has a ``block_label``
        key. Everything else, including empty input, is reported as
        ``'mineru'``.
        """
        if merged_data and 'block_label' in merged_data[0]:
            return 'paddleocr_vl'
        return 'mineru'

    @staticmethod
    def _block_text(item: Dict) -> str:
        """Return ``block_content``, else ``text``, else an empty string."""
        return item.get('block_content', item.get('text', ''))

    def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
        """Build one MinerU-style dictionary per PaddleOCR-VL item.

        Every output item has ``type``, ``bbox`` and ``page_idx``; the
        remaining keys depend on the mapped type. Input items are not
        modified.
        """
        mineru_data = []

        for item in merged_data:
            block_label = item.get('block_label', '')
            mineru_type = self._TYPE_MAPPING.get(block_label, 'text')

            # Fields shared by every block type.
            mineru_item = {
                'type': mineru_type,
                'bbox': item.get('block_bbox', item.get('bbox', [])),
                'page_idx': item.get('page_idx', 0),
            }

            if mineru_type == 'title':
                mineru_item['text'] = self._block_text(item)
                mineru_item['text_level'] = self._TITLE_LEVELS.get(block_label, 1)

            elif mineru_type in self._PLAIN_TEXT_TYPES:
                mineru_item['text'] = self._block_text(item)

            elif mineru_type == 'table':
                # Prefer the bbox-annotated HTML; fall back to plain content.
                # NOTE(review): 'table_body' also receives the bbox-annotated
                # HTML when it is present - confirm this is intended.
                table_html = item.get('block_content_with_bbox',
                                      item.get('block_content', ''))
                mineru_item['table_body'] = table_html
                mineru_item['table_body_with_bbox'] = table_html
                mineru_item['table_caption'] = []
                mineru_item['table_footnote'] = []

                # Cell details are forwarded only when present and non-empty.
                if item.get('table_cells'):
                    mineru_item['table_cells'] = item['table_cells']

            elif mineru_type == 'image':
                mineru_item['img_path'] = item.get('img_path', '')
                mineru_item['image_caption'] = []
                mineru_item['image_footnote'] = []

            elif mineru_type == 'interline_equation':
                mineru_item['text'] = self._block_text(item)
                mineru_item['text_format'] = 'latex'

            mineru_data.append(mineru_item)

        return mineru_data
|