|
|
@@ -0,0 +1,137 @@
|
|
|
+"""
|
|
|
+统一输出格式转换器
|
|
|
+将不同OCR工具的结果转换为标准的MinerU格式
|
|
|
+"""
|
|
|
+from typing import List, Dict
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
class UnifiedOutputConverter:
    """Convert merged OCR results from different tools into MinerU-style items.

    Two input layouts are recognised:

    * ``'paddleocr_vl'`` -- items carrying a ``block_label`` key; these are
      rewritten into MinerU-style dictionaries.
    * ``'mineru'`` -- items assumed to already be in the target layout; they
      are returned untouched.
    """

    # PaddleOCR-VL ``block_label`` -> MinerU ``type``.
    # Labels that are not listed here are converted as plain ``'text'``.
    _TYPE_MAPPING: Dict[str, str] = {
        'header': 'header',
        'footer': 'footer',
        'page_number': 'page_number',
        'paragraph_title': 'title',
        'doc_title': 'title',
        'abstract_title': 'title',
        'reference_title': 'title',
        'content_title': 'title',
        'figure_title': 'title',
        'table_title': 'title',
        'text': 'text',
        'table': 'table',
        'figure': 'image',
        'chart': 'image',
        'seal': 'image',
        'equation': 'interline_equation',
        'reference': 'ref_text',
    }

    # Heading level assigned to each title-like ``block_label``.
    # Title labels missing from this table get level 1.
    _TITLE_LEVELS: Dict[str, int] = {
        'doc_title': 1,
        'paragraph_title': 2,
        'abstract_title': 2,
        'reference_title': 2,
        'content_title': 3,
        'figure_title': 4,
        'table_title': 4,
    }

    # MinerU types whose payload is copied into the ``'text'`` field.
    _TEXT_BEARING_TYPES = frozenset({
        'title',
        'text',
        'header',
        'footer',
        'page_number',
        'ref_text',
        'interline_equation',
    })

    def convert_to_mineru_format(self, merged_data: List[Dict],
                                 data_source: str = 'auto') -> List[Dict]:
        """Convert merged data into the MinerU item layout.

        Args:
            merged_data: Merged OCR items (a list of dictionaries).
            data_source: Layout of ``merged_data``: ``'paddleocr_vl'``,
                ``'mineru'``, or ``'auto'`` to detect it from the first item.

        Returns:
            A list of MinerU-style dictionaries. For ``'mineru'`` input the
            very same ``merged_data`` object is returned (no copy is made).

        Raises:
            ValueError: If ``data_source`` is not one of the supported values.
        """
        if data_source == 'auto':
            data_source = self._detect_data_source(merged_data)

        if data_source == 'paddleocr_vl':
            return self._convert_paddleocr_vl_to_mineru(merged_data)
        if data_source == 'mineru':
            # Already in the target layout; hand it back as-is.
            return merged_data
        raise ValueError(f"Unsupported data source: {data_source}")

    def _detect_data_source(self, merged_data: List[Dict]) -> str:
        """Guess the layout of ``merged_data`` by inspecting its first item.

        Returns ``'paddleocr_vl'`` when the first item has a ``block_label``
        key; everything else, including empty input, is reported as
        ``'mineru'``.
        """
        if not merged_data:
            return 'mineru'

        # Only the first item is examined; the list is assumed homogeneous.
        if 'block_label' in merged_data[0]:
            return 'paddleocr_vl'

        return 'mineru'

    def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
        """Rewrite PaddleOCR-VL items as MinerU-style items.

        Every output item has ``type``, ``bbox`` and ``page_idx``; the
        remaining keys depend on the resulting ``type``. Input items are not
        modified, but values such as ``table_cells`` are shared by reference
        with the output.
        """
        mineru_data = []

        for item in merged_data:
            block_label = item.get('block_label', '')
            mineru_type = self._TYPE_MAPPING.get(block_label, 'text')

            # Fields common to every item. ``block_bbox`` takes precedence
            # over ``bbox`` when both are present.
            mineru_item = {
                'type': mineru_type,
                'bbox': item.get('block_bbox', item.get('bbox', [])),
                'page_idx': item.get('page_idx', 0),
            }

            if mineru_type in self._TEXT_BEARING_TYPES:
                # ``block_content`` takes precedence over ``text``.
                mineru_item['text'] = item.get('block_content', item.get('text', ''))
                if mineru_type == 'title':
                    mineru_item['text_level'] = self._TITLE_LEVELS.get(block_label, 1)
                elif mineru_type == 'interline_equation':
                    mineru_item['text_format'] = 'latex'

            elif mineru_type == 'table':
                # NOTE(review): both table fields receive the same HTML,
                # preferring the bbox-annotated variant -- confirm that
                # 'table_body' is really meant to carry the bbox markup.
                table_html = item.get('block_content_with_bbox',
                                      item.get('block_content', ''))
                mineru_item['table_body'] = table_html
                mineru_item['table_body_with_bbox'] = table_html
                mineru_item['table_caption'] = []
                mineru_item['table_footnote'] = []

                # Cell information is forwarded only when present and non-empty.
                if item.get('table_cells'):
                    mineru_item['table_cells'] = item['table_cells']

            elif mineru_type == 'image':
                mineru_item['img_path'] = item.get('img_path', '')
                mineru_item['image_caption'] = []
                mineru_item['image_footnote'] = []

            mineru_data.append(mineru_item)

        return mineru_data
|