Bläddra i källkod

feat: 添加UnifiedOutputConverter类,实现不同OCR工具结果转换为MinerU格式

zhch158_admin 4 veckor sedan
förälder
incheckning
7cf744a8dc
1 ändrade filer med 137 tillägg och 0 borttagningar
  1. 137 0
      merger/unified_output_converter.py

+ 137 - 0
merger/unified_output_converter.py

@@ -0,0 +1,137 @@
+"""
+统一输出格式转换器
+将不同OCR工具的结果转换为标准的MinerU格式
+"""
+from typing import List, Dict
+from bs4 import BeautifulSoup
+
+class UnifiedOutputConverter:
+    """Convert merged OCR results into the MinerU block format.
+
+    Two input flavours are recognised: PaddleOCR_VL blocks (dicts carrying a
+    ``block_label`` key), which are converted, and data that is already in
+    MinerU format, which is passed through untouched.
+    """
+
+    # PaddleOCR_VL ``block_label`` -> MinerU ``type``. Labels that are not
+    # listed here fall back to plain 'text'.
+    _TYPE_MAPPING = {
+        'header': 'header',
+        'footer': 'footer',
+        'page_number': 'page_number',
+        'paragraph_title': 'title',
+        'doc_title': 'title',
+        'abstract_title': 'title',
+        'reference_title': 'title',
+        'content_title': 'title',
+        'figure_title': 'title',
+        'table_title': 'title',
+        'text': 'text',
+        'table': 'table',
+        'figure': 'image',
+        'chart': 'image',
+        'seal': 'image',
+        'equation': 'interline_equation',
+        'reference': 'ref_text',
+    }
+
+    # Heading level written to ``text_level`` for each title-like label.
+    _TITLE_LEVELS = {
+        'doc_title': 1,
+        'paragraph_title': 2,
+        'abstract_title': 2,
+        'reference_title': 2,
+        'content_title': 3,
+        'figure_title': 4,
+        'table_title': 4,
+    }
+
+    def __init__(self):
+        """Create a converter; no per-instance state is kept."""
+
+    def convert_to_mineru_format(self, merged_data: List[Dict],
+                                 data_source: str = 'auto') -> List[Dict]:
+        """Convert merged data to the MinerU format.
+
+        Args:
+            merged_data: The merged block dicts to convert.
+            data_source: One of 'paddleocr_vl', 'mineru' or 'auto'. With
+                'auto' the format is guessed from the first item.
+
+        Returns:
+            A list of MinerU-format dicts. For 'mineru' input the very same
+            ``merged_data`` object is returned (no copy is made).
+
+        Raises:
+            ValueError: If ``data_source`` is not a supported value.
+        """
+        if data_source == 'auto':
+            data_source = self._detect_data_source(merged_data)
+
+        if data_source == 'paddleocr_vl':
+            return self._convert_paddleocr_vl_to_mineru(merged_data)
+        if data_source == 'mineru':
+            # Already in MinerU format; hand it back as-is.
+            return merged_data
+        raise ValueError(f"Unsupported data source: {data_source}")
+
+    def _detect_data_source(self, merged_data: List[Dict]) -> str:
+        """Guess the source format by looking at the first item only.
+
+        Returns 'paddleocr_vl' when the first item has a ``block_label`` key.
+        Everything else, including empty input, is reported as 'mineru'.
+        """
+        if merged_data and 'block_label' in merged_data[0]:
+            return 'paddleocr_vl'
+        return 'mineru'
+
+    def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
+        """Convert every PaddleOCR_VL block in ``merged_data``, keeping order."""
+        return [self._convert_paddleocr_vl_item(item) for item in merged_data]
+
+    @staticmethod
+    def _block_text(item: Dict):
+        """Return ``block_content``, else ``text``, else an empty string."""
+        return item.get('block_content', item.get('text', ''))
+
+    def _convert_paddleocr_vl_item(self, item: Dict) -> Dict:
+        """Build the MinerU dict for a single PaddleOCR_VL block."""
+        block_label = item.get('block_label', '')
+        mineru_type = self._TYPE_MAPPING.get(block_label, 'text')
+
+        # Fields shared by every MinerU block type.
+        mineru_item = {
+            'type': mineru_type,
+            'bbox': item.get('block_bbox', item.get('bbox', [])),
+            'page_idx': item.get('page_idx', 0),
+        }
+
+        if mineru_type == 'table':
+            # NOTE(review): both table fields receive the same HTML (the
+            # bbox-annotated one when present) - confirm that 'table_body'
+            # is not expected to hold the plain 'block_content' instead.
+            table_html = item.get('block_content_with_bbox',
+                                  item.get('block_content', ''))
+            mineru_item['table_body'] = table_html
+            mineru_item['table_body_with_bbox'] = table_html
+            mineru_item['table_caption'] = []
+            mineru_item['table_footnote'] = []
+
+            # Cell details are forwarded only when present and non-empty.
+            if item.get('table_cells'):
+                mineru_item['table_cells'] = item['table_cells']
+
+        elif mineru_type == 'image':
+            mineru_item['img_path'] = item.get('img_path', '')
+            mineru_item['image_caption'] = []
+            mineru_item['image_footnote'] = []
+
+        else:
+            # All remaining types (text, title, header, footer, page_number,
+            # ref_text, interline_equation) carry their content in 'text'.
+            mineru_item['text'] = self._block_text(item)
+            if mineru_type == 'title':
+                mineru_item['text_level'] = self._TITLE_LEVELS.get(block_label, 1)
+            elif mineru_type == 'interline_equation':
+                mineru_item['text_format'] = 'latex'
+
+        return mineru_item