# zhengchun/ocr_verify
"""
Data processing module.

Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox information.
"""
from typing import List, Dict, Tuple, Optional
from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
    from .table_cell_matcher import TableCellMatcher
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor
    from table_cell_matcher import TableCellMatcher


class DataProcessor:
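    """Data processor that unifies OCR/VL outputs into MinerU format.

    1. Processes MinerU / PaddleOCR_VL / DotsOCR data and adds table-cell
       bbox information; every other block type keeps the bbox supplied by
       the VL model.
    2. Because each OCR tool has its own output format, there is one
       processing method per tool (MinerU, DotsOCR, PaddleOCR_VL). Each one
       first converts its input to MinerU format and then adds the
       table-cell bbox information.
    3. TextMatcher is used for text matching and TableCellMatcher for
       table-cell matching.
    4. The final output is always MinerU-format data.

    MinerU and DotsOCR report coordinates in the original image, not in the
    rotated image. PaddleOCR_VL reports coordinates after the rotation
    transform, and the OCR text boxes produced by ppstructure are also in
    rotated coordinates. Therefore, when processing VL data:

    1. The VL table bbox is first converted to rotated coordinates, using
       the ppstructure rotation angle and the original image size.
    2. TableCellMatcher matches the table cells.
    3. The matched cell bboxes are converted back to original-image
       coordinates and stored in the final MinerU-format output.
    4. All other block types keep the bbox supplied by the VL model.
    """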
    
    def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10, x_tolerance: int = 3, y_tolerance: int = 10):
        """Store the matching parameters and build the table-cell matcher.

        Args:
            text_matcher: Text matcher; also handed to the table-cell matcher.
            look_ahead_window: Look-ahead window passed to
                ``text_matcher.find_matching_bbox`` by the text/list handlers.
            x_tolerance: X-axis tolerance, used to decide whether text boxes
                belong to the same column.
            y_tolerance: Y-axis tolerance, used for row grouping.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance

        # The table-cell matcher shares the text matcher and both tolerances.
        matcher_options = {
            'text_matcher': text_matcher,
            'x_tolerance': x_tolerance,
            'y_tolerance': y_tolerance,
        }
        self.table_cell_matcher = TableCellMatcher(**matcher_options)
    
    def process_mineru_data(self, mineru_data: List[Dict], 
                           paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        处理 MinerU 数据，添加 bbox 信息
        
        Args:
            mineru_data: MinerU 数据
            paddle_text_boxes: PaddleOCR 文字框列表
        
        Returns:
            合并后的数据, table cell使用paddle的bbox，其他类型只是移动指针，bbox还是沿用minerU的bbox
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0

        # 按 bbox 排序
        mineru_data.sort(
            key=lambda x: (x['bbox'][1], x['bbox'][0]) 
            if 'bbox' in x else (float('inf'), float('inf'))
        )

        for item in mineru_data:
            item_type = item.get('type', '')
            
            if item_type == 'table':
                if rotation_angle != 0:
                    inverse_table_bbox = BBoxExtractor.rotate_box_coordinates(item['bbox'], rotation_angle, orig_image_size)  
                    inverse_item = item.copy()
                    inverse_item['bbox'] = inverse_table_bbox
                else:
                    inverse_item = item
                merged_item, paddle_pointer = self._process_table(
                    inverse_item, paddle_text_boxes, paddle_pointer
                )
                # 如果有旋转，需要将匹配到的单元格bbox逆向转换为原图坐标
                if rotation_angle != 0:
                    for cell in merged_item.get('table_cells', []):
                        cell_bbox = cell.get('bbox', [])
                        if cell_bbox:
                            original_bbox = BBoxExtractor.inverse_rotate_box_coordinates(cell_bbox, rotation_angle, orig_image_size)
                            cell['bbox'] = original_bbox
                    merged_item['bbox'] = item['bbox']  # 保持表格的原始bbox不变        
                            
                merged_data.append(merged_item)
            
            elif item_type in ['text', 'title', 'header', 'footer']:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
                merged_data.append(merged_item)
            
            else:
                merged_data.append(item.copy())
        
        return merged_data
    
    def process_dotsocr_data(self, dotsocr_data: List[Dict],
                            paddle_text_boxes: List[Dict], 
                            rotation_angle: float, 
                            orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        处理 DotsOCR 数据（简化版：转换后复用 MinerU 处理逻辑）
        
        Args:
            dotsocr_data: DotsOCR 输出数据
            paddle_text_boxes: PaddleOCR 文本框
            rotation_angle: 旋转角度
            orig_image_size: 原始图片尺寸
        
        Returns:
            统一的 MinerU 格式数据（带 table_cells bbox）
        """
        print(f"📊 处理 DotsOCR 数据: {len(dotsocr_data)} 个块")
        
        # 🎯 第一步：转换为 MinerU 格式
        mineru_format_data = []
        for item in dotsocr_data:
            try:
                converted_item = self._convert_dotsocr_to_mineru(item)
                if converted_item:
                    mineru_format_data.append(converted_item)
            except Exception as e:
                print(f"⚠️ DotsOCR 转换失败: {e}")
                continue
        
        print(f"   ✓ 转换完成: {len(mineru_format_data)} 个块")
        
        # 🎯 第二步：复用 MinerU 处理逻辑
        return self.process_mineru_data(
            mineru_data=mineru_format_data,
            paddle_text_boxes=paddle_text_boxes,
            rotation_angle=rotation_angle,
            orig_image_size=orig_image_size
        )

    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
        """
        🎯 将 DotsOCR 格式转换为 MinerU 格式
        
        DotsOCR:
        {
            "category": "Table",
            "bbox": [x1, y1, x2, y2],
            "text": "..."
        }
        
        MinerU:
        {
            "type": "table",
            "bbox": [x1, y1, x2, y2],
            "table_body": "...",
            "page_idx": 0
        }
        """
        category = dotsocr_item.get('category', '')
        
        # 🎯 Category 映射
        category_map = {
            'Page-header': 'header',
            'Page-footer': 'footer',
            'Picture': 'image',
            'Figure': 'image',
            'Section-header': 'title',
            'Table': 'table',
            'Text': 'text',
            'Title': 'title',
            'List': 'list',
            'Caption': 'title'
        }
        
        mineru_type = category_map.get(category, 'text')
        
        # 🎯 基础转换
        mineru_item = {
            'type': mineru_type,
            'bbox': dotsocr_item.get('bbox', []),
            'page_idx': 0  # DotsOCR 默认单页
        }
        
        # 🎯 处理文本内容
        text = dotsocr_item.get('text', '')
        
        if mineru_type == 'table':
            # 表格：text -> table_body
            mineru_item['table_body'] = text
        else:
            # 其他类型：保持 text
            mineru_item['text'] = text
            
            # 标题级别
            if category == 'Section-header':
                mineru_item['text_level'] = 1
        
        return mineru_item
    
    def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                  paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        处理 PaddleOCR_VL 数据，添加 bbox 信息
        
        Args:
            paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
            paddle_text_boxes: PaddleOCR 文字框列表
        
        Returns:
            🎯 MinerU 格式的合并数据（统一输出格式）
        """
        # 🎯 获取旋转角度和原始图像尺寸
        vl_rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
        vl_orig_image_size = (0,0)
        
        if vl_rotation_angle != 0:
            vl_orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
            print(f"🔄 PaddleOCR_VL 检测到旋转角度: {vl_rotation_angle}°")
            print(f"📐 原始图像尺寸: {vl_orig_image_size[0]} x {vl_orig_image_size[1]}")
        
        # 提取 parsing_res_list
        parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
        
        # 按 bbox 排序
        parsing_res_list.sort(
            key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
            if 'block_bbox' in x else (float('inf'), float('inf'))
        )
        mineru_format_data = []
    
        for item in parsing_res_list:
            # 🎯 先转换 bbox 坐标（如果需要）
            if vl_rotation_angle != 0 and orig_image_size:
                item = self._transform_vl_block_bbox(item, vl_rotation_angle, vl_orig_image_size)
            converted_item = self._convert_paddleocr_vl_to_mineru(item)
            if converted_item:
                mineru_format_data.append(converted_item)
    
        print(f"   ✓ 转换完成: {len(mineru_format_data)} 个块")
        
        # 🎯 第三步：复用 MinerU 处理逻辑
        return self.process_mineru_data(
            mineru_data=mineru_format_data,
            paddle_text_boxes=paddle_text_boxes,
            rotation_angle=rotation_angle,
            orig_image_size=orig_image_size
        )    

    
    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
        """Return the rotation angle read from PaddleOCR_VL data.

        Delegates to ``BBoxExtractor._get_rotation_angle``.
        """
        rotation_angle = BBoxExtractor._get_rotation_angle(paddleocr_vl_data)
        return rotation_angle
    
    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
        """Return the original image size read from PaddleOCR_VL data.

        Delegates to ``BBoxExtractor._get_original_image_size``.
        """
        image_size = BBoxExtractor._get_original_image_size(paddleocr_vl_data)
        return image_size
    
    def _transform_vl_block_bbox(self, item: Dict, angle: float, 
                                 orig_image_size: tuple) -> Dict:
        """
        转换 PaddleOCR_VL 的 block_bbox 坐标
        
        Args:
            item: PaddleOCR_VL 的 block 数据
            angle: 旋转角度
            orig_image_size: 原始图像尺寸
        
        Returns:
            转换后的 block 数据
        """
        transformed_item = item.copy()
        
        if 'block_bbox' not in item:
            return transformed_item
        
        block_bbox = item['block_bbox']
        if len(block_bbox) < 4:
            return transformed_item
        
        transformed_bbox = BBoxExtractor.inverse_rotate_box_coordinates(block_bbox, angle, orig_image_size)
        
        transformed_item['block_bbox'] = transformed_bbox
        
        return transformed_item
    
    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
        """
        🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
        
        基于 PP-DocLayout_plus-L 的 20 种类别
        """
        block_label = paddleocr_vl_item.get('block_label', '')
        
        # 🎯 PP-DocLayout_plus-L 类别映射（共 20 种）
        label_map = {
            # 标题类（3种）
            'paragraph_title': 'title',
            'doc_title': 'title',
            'figure_table_chart_title': 'title',
            
            # 文本类（9种）
            'text': 'text',
            'number': 'text',
            'content': 'text',
            'abstract': 'text',
            'footnote': 'text',
            'aside_text': 'text',
            'algorithm': 'text',
            'reference': 'text',
            'reference_content': 'text',
            
            # 页眉页脚（2种）
            'header': 'header',
            'footer': 'footer',
            
            # 表格（1种）
            'table': 'table',
            
            # 图片/图表（3种）
            'image': 'image',
            'chart': 'image',
            'seal': 'image',
            
            # 公式（2种）
            'formula': 'equation',
            'formula_number': 'equation'
        }
        
        mineru_type = label_map.get(block_label, 'text')
        
        mineru_item = {
            'type': mineru_type,
            'bbox': paddleocr_vl_item.get('block_bbox', []),
            'page_idx': 0
        }
        
        content = paddleocr_vl_item.get('block_content', '')
        
        if mineru_type == 'table':
            mineru_item['table_body'] = content
        else:
            mineru_item['text'] = content
            
            # 标题级别
            if block_label == 'doc_title':
                mineru_item['text_level'] = 1
            elif block_label == 'paragraph_title':
                mineru_item['text_level'] = 2
            elif block_label == 'figure_table_chart_title':
                mineru_item['text_level'] = 3
        
        return mineru_item
    
    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                  start_pointer: int) -> Tuple[Dict, int]:
        """
        处理表格类型（MinerU 格式）
        
        策略：
        - 解析 HTML 表格
        - 为每个单元格匹配 PaddleOCR 的 bbox
        - 返回处理后的表格和新指针位置
        """
        table_body = item.get('table_body', '')
        
        if not table_body:
            print(f"⚠️ 表格内容为空，跳过")
            return item, start_pointer
        
        try:
            # 🔑 传入 table_bbox 用于筛选
            table_bbox = item.get('bbox')  # MinerU 提供的表格边界
            
            # 🎯 委托给 TableCellMatcher
            enhanced_html, cells, new_pointer = \
                self.table_cell_matcher.enhance_table_html_with_bbox(
                    table_body,
                    paddle_text_boxes,
                    start_pointer,
                    table_bbox
                )
            
            # 更新 item
            item['table_body'] = enhanced_html
            item['table_cells'] = cells
            
            # 统计信息
            matched_count = len(cells)
            total_cells = len(BeautifulSoup(table_body, 'html.parser').find_all(['td', 'th']))
            
            print(f"   表格单元格: {matched_count}/{total_cells} 匹配")
            
            return item, new_pointer
            
        except Exception as e:
            print(f"⚠️ 表格处理失败: {e}")
            import traceback
            traceback.print_exc()
            return item, start_pointer
        
    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                     paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """处理文本"""
        merged_item = item.copy()
        text = item.get('text', '')
        
        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )
        
        if matched_bbox:
            matched_bbox['used'] = True
        
        return merged_item, paddle_pointer, last_matched_index
    
    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                     paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """处理列表"""
        merged_item = item.copy()
        list_items = item.get('list_items', [])
        
        for list_item in list_items:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
                    self.look_ahead_window
                )
            
            if matched_bbox:
                matched_bbox['used'] = True
        
        return merged_item, paddle_pointer, last_matched_index