zhengchun
/
ocr_verify


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
							"""
Unified output format converter.
Converts the results of different OCR tools into the standard MinerU format.
"""
from typing import List, Dict
from bs4 import BeautifulSoup

class UnifiedOutputConverter:
    """Convert merged OCR results from different tools into MinerU-style records.

    Two input flavours are recognised:

    * ``'paddleocr_vl'`` -- blocks keyed by ``block_label`` / ``block_content`` /
      ``block_bbox``; each block is rewritten into a MinerU-style dict.
    * ``'mineru'`` -- already in the target layout and passed through untouched.
    """

    # PaddleOCR-VL ``block_label`` -> MinerU ``type``.
    # Labels that are not listed here are treated as plain 'text'.
    _TYPE_MAPPING: Dict[str, str] = {
        'header': 'header',
        'footer': 'footer',
        'page_number': 'page_number',
        'paragraph_title': 'title',
        'doc_title': 'title',
        'abstract_title': 'title',
        'reference_title': 'title',
        'content_title': 'title',
        'figure_title': 'title',
        'table_title': 'title',
        'text': 'text',
        'table': 'table',
        'figure': 'image',
        'chart': 'image',
        'seal': 'image',
        'equation': 'interline_equation',
        'reference': 'ref_text',
    }

    # ``text_level`` assigned to each title-like ``block_label``.
    _TITLE_LEVELS: Dict[str, int] = {
        'doc_title': 1,
        'paragraph_title': 2,
        'abstract_title': 2,
        'reference_title': 2,
        'content_title': 3,
        'figure_title': 4,
        'table_title': 4,
    }

    # MinerU types whose only type-specific field is 'text'.
    _PLAIN_TEXT_TYPES = frozenset(
        {'text', 'header', 'footer', 'page_number', 'ref_text'}
    )

    def __init__(self):
        """Create a converter; no per-instance state is kept."""

    def convert_to_mineru_format(self, merged_data: List[Dict],
                                 data_source: str = 'auto') -> List[Dict]:
        """Convert merged data into the MinerU layout.

        Args:
            merged_data: The merged block records.
            data_source: Origin of the data: ``'paddleocr_vl'``, ``'mineru'``
                or ``'auto'`` (guess from the first record).

        Returns:
            A list of MinerU-style dicts. For ``'mineru'`` input the very same
            list object is returned, not a copy.

        Raises:
            ValueError: If ``data_source`` is not one of the supported values.
        """
        if data_source == 'auto':
            data_source = self._detect_data_source(merged_data)

        if data_source == 'paddleocr_vl':
            return self._convert_paddleocr_vl_to_mineru(merged_data)
        if data_source == 'mineru':
            # Already in the target layout: hand it back unchanged.
            return merged_data
        raise ValueError(f"Unsupported data source: {data_source}")

    def _detect_data_source(self, merged_data: List[Dict]) -> str:
        """Guess the data source by inspecting only the first record.

        A ``block_label`` key marks PaddleOCR-VL data; everything else,
        including empty input, is reported as ``'mineru'``.
        """
        if merged_data and 'block_label' in merged_data[0]:
            return 'paddleocr_vl'
        return 'mineru'

    def _convert_paddleocr_vl_to_mineru(self, merged_data: List[Dict]) -> List[Dict]:
        """Convert every PaddleOCR-VL block in ``merged_data`` to a MinerU-style dict."""
        return [self._convert_paddleocr_vl_block(item) for item in merged_data]

    def _convert_paddleocr_vl_block(self, item: Dict) -> Dict:
        """Build the MinerU-style dict for a single PaddleOCR-VL block."""
        block_label = item.get('block_label', '')
        mineru_type = self._TYPE_MAPPING.get(block_label, 'text')

        # Fields shared by every block type. The non-prefixed 'bbox' key is
        # accepted as a fallback when 'block_bbox' is absent.
        mineru_item = {
            'type': mineru_type,
            'bbox': item.get('block_bbox', item.get('bbox', [])),
            'page_idx': item.get('page_idx', 0),
        }

        if mineru_type == 'title':
            mineru_item['text'] = self._block_text(item)
            mineru_item['text_level'] = self._TITLE_LEVELS.get(block_label, 1)

        elif mineru_type in self._PLAIN_TEXT_TYPES:
            mineru_item['text'] = self._block_text(item)

        elif mineru_type == 'table':
            # The bbox-annotated HTML is preferred and written to BOTH table
            # fields; the plain 'block_content' is used only when it is missing.
            # NOTE(review): 'table_body' therefore carries the bbox-annotated
            # markup when available -- confirm this is what consumers expect.
            table_html = item.get('block_content_with_bbox',
                                  item.get('block_content', ''))
            mineru_item['table_body'] = table_html
            mineru_item['table_body_with_bbox'] = table_html
            mineru_item['table_caption'] = []
            mineru_item['table_footnote'] = []

            # Carry cell-level details over only when present and non-empty.
            if item.get('table_cells'):
                mineru_item['table_cells'] = item['table_cells']

        elif mineru_type == 'image':
            mineru_item['img_path'] = item.get('img_path', '')
            mineru_item['image_caption'] = []
            mineru_item['image_footnote'] = []

        elif mineru_type == 'interline_equation':
            mineru_item['text'] = self._block_text(item)
            mineru_item['text_format'] = 'latex'

        return mineru_item

    @staticmethod
    def _block_text(item: Dict):
        """Return the block's 'block_content', else its 'text', else ''."""
        return item.get('block_content', item.get('text', ''))