4 veckor sedan · 7cf744a8dc
--- a/merger/unified_output_converter.py
+++ b/merger/unified_output_converter.py
@@ -0,0 +1,137 @@
 
				+"""
			
 
				+统一输出格式转换器
			
 
				+将不同OCR工具的结果转换为标准的MinerU格式
			
 
				+"""
			
 
				+from typing import List, Dict
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				+class UnifiedOutputConverter:
			
 
				+    """统一输出格式转换器"""
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        # self.text_processor = TextProcessor()
			
 
				+        pass
			
 
				+    
			
 
				+    def convert_to_mineru_format(self, merged_data: List[Dict], 
			
 
				+                                 data_source: str = 'auto') -> List[Dict]:
			
 
				+        """
			
 
				+        将合并后的数据转换为MinerU标准格式
			
 
				+        
			
 
				+        Args:
			
 
				+            merged_data: 合并后的数据
			
 
				+            data_source: 数据来源 ('paddleocr_vl', 'mineru', 'auto')
			
 
				+        
			
 
				+        Returns:
			
 
				+            MinerU格式的数据列表
			
 
				+        """
			
 
				+        # 自动检测数据格式
			
 
				+        if data_source == 'auto':
			
 
				+            data_source = self._detect_data_source(merged_data)
			
 
				+        
			
 
				+        if data_source == 'paddleocr_vl':
			
 
				+            return self._convert_paddleocr_vl_to_mineru(merged_data)
			
 
				+        elif data_source == 'mineru':
			
 
				+            return merged_data  # 已经是MinerU格式
			
 
				+        else:
			
 
				+            raise ValueError(f"Unsupported data source: {data_source}")
			
 
				+    
			
 
				+    def _detect_data_source(self, merged_data: List[Dict]) -> str:
			
 
				+        """检测数据来源"""
			
 
				+        if not merged_data:
			
 
				+            return 'mineru'
			
 
				+        
			
 
				+        first_item = merged_data[0]
			
 
				+        
			
 
				+        # 检查PaddleOCR_VL特征
			
 
				+        if 'block_label' in first_item:
			
 
				+            return 'paddleocr_vl'
			
 
				+        
			
 
				+        # 检查MinerU特征
			
 
				+        if 'type' in first_item:
			
 
				+            return 'mineru'
			
 
				+        
			
 
				+        return 'mineru'  # 默认
			
 
				+    
			
 
				+    def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
			
 
				+        """将PaddleOCR_VL格式转换为MinerU格式"""
			
 
				+        mineru_data = []
			
 
				+        
			
 
				+        for item in merged_data:
			
 
				+            block_label = item.get('block_label', '')
			
 
				+            
			
 
				+            # 映射block_label到MinerU的type
			
 
				+            type_mapping = {
			
 
				+                'header': 'header',
			
 
				+                'footer': 'footer',
			
 
				+                'page_number': 'page_number',
			
 
				+                'paragraph_title': 'title',
			
 
				+                'doc_title': 'title',
			
 
				+                'abstract_title': 'title',
			
 
				+                'reference_title': 'title',
			
 
				+                'content_title': 'title',
			
 
				+                'figure_title': 'title',
			
 
				+                'table_title': 'title',
			
 
				+                'text': 'text',
			
 
				+                'table': 'table',
			
 
				+                'figure': 'image',
			
 
				+                'chart': 'image',
			
 
				+                'seal': 'image',
			
 
				+                'equation': 'interline_equation',
			
 
				+                'reference': 'ref_text',
			
 
				+            }
			
 
				+            
			
 
				+            mineru_type = type_mapping.get(block_label, 'text')
			
 
				+            
			
 
				+            # 构建MinerU格式的数据项
			
 
				+            mineru_item = {
			
 
				+                'type': mineru_type,
			
 
				+                'bbox': item.get('block_bbox', item.get('bbox', [])),
			
 
				+                'page_idx': item.get('page_idx', 0)
			
 
				+            }
			
 
				+            
			
 
				+            # 根据类型添加特定字段
			
 
				+            if mineru_type == 'title':
			
 
				+                mineru_item['text'] = item.get('block_content', item.get('text', ''))
			
 
				+                # 根据block_label确定标题级别
			
 
				+                level_map = {
			
 
				+                    'doc_title': 1,
			
 
				+                    'paragraph_title': 2,
			
 
				+                    'abstract_title': 2,
			
 
				+                    'reference_title': 2,
			
 
				+                    'content_title': 3,
			
 
				+                    'figure_title': 4,
			
 
				+                    'table_title': 4,
			
 
				+                }
			
 
				+                mineru_item['text_level'] = level_map.get(block_label, 1)
			
 
				+            
			
 
				+            elif mineru_type == 'text':
			
 
				+                mineru_item['text'] = item.get('block_content', item.get('text', ''))
			
 
				+            
			
 
				+            elif mineru_type in ['header', 'footer', 'page_number', 'ref_text']:
			
 
				+                mineru_item['text'] = item.get('block_content', item.get('text', ''))
			
 
				+            
			
 
				+            elif mineru_type == 'table':
			
 
				+                # 处理表格
			
 
				+                table_html = item.get('block_content_with_bbox', 
			
 
				+                                    item.get('block_content', ''))
			
 
				+                mineru_item['table_body'] = table_html
			
 
				+                mineru_item['table_body_with_bbox'] = table_html
			
 
				+                mineru_item['table_caption'] = []
			
 
				+                mineru_item['table_footnote'] = []
			
 
				+                
			
 
				+                # 提取表格单元格信息
			
 
				+                if item.get('table_cells'):
			
 
				+                    mineru_item['table_cells'] = item['table_cells']
			
 
				+            
			
 
				+            elif mineru_type == 'image':
			
 
				+                mineru_item['img_path'] = item.get('img_path', '')
			
 
				+                mineru_item['image_caption'] = []
			
 
				+                mineru_item['image_footnote'] = []
			
 
				+            
			
 
				+            elif mineru_type == 'interline_equation':
			
 
				+                mineru_item['text'] = item.get('block_content', item.get('text', ''))
			
 
				+                mineru_item['text_format'] = 'latex'
			
 
				+            
			
 
				+            mineru_data.append(mineru_item)
			
 
				+        
			
 
				+        return mineru_data