"""
Data processing module.

Processes MinerU / PaddleOCR_VL / DotsOCR data and adds bbox information.
"""
from typing import List, Dict, Tuple, Optional

from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
    from .table_cell_matcher import TableCellMatcher
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor
    from table_cell_matcher import TableCellMatcher


class DataProcessor:
    """Data processor.

    1. Processes MinerU / PaddleOCR_VL / DotsOCR data and adds
       ``table_cells`` bbox information; every other block type keeps the
       bbox supplied by the VL model.
    2. Different OCR tools emit different formats, so there is one entry
       point per tool (MinerU, DotsOCR, PaddleOCR_VL). All inputs are first
       converted to MinerU format, then table-cell bboxes are added.
    3. ``TextMatcher`` is used for text matching and ``TableCellMatcher``
       for table-cell matching.
    4. The final output is always MinerU-format data.

    Coordinate systems (as described by the original author): MinerU and
    DotsOCR report coordinates in the original image, PaddleVL reports
    coordinates after rotation, and the OCR text boxes coming from
    ppstructure are in rotated coordinates. Therefore, for VL data:

    1. the VL table bbox is first converted to rotated coordinates using the
       ppstructure rotation angle and the original image size;
    2. table cells are matched with ``TableCellMatcher``;
    3. the matched cell bboxes are converted back to original-image
       coordinates before being stored in the output;
    4. all other block types keep the VL bbox unchanged.
    """

    # DotsOCR category -> MinerU type. Unknown categories fall back to 'text'.
    _DOTSOCR_CATEGORY_MAP = {
        'Page-header': 'header',
        'Page-footer': 'footer',
        'Picture': 'image',
        'Figure': 'image',
        'Section-header': 'title',
        'Table': 'table',
        'Text': 'text',
        'Title': 'title',
        'List': 'list',
        'Caption': 'title',
    }

    # PP-DocLayout_plus-L label (20 classes) -> MinerU type.
    # Unknown labels fall back to 'text'.
    _PADDLE_VL_LABEL_MAP = {
        # Titles (3)
        'paragraph_title': 'title',
        'doc_title': 'title',
        'figure_table_chart_title': 'title',
        # Text (9)
        'text': 'text',
        'number': 'text',
        'content': 'text',
        'abstract': 'text',
        'footnote': 'text',
        'aside_text': 'text',
        'algorithm': 'text',
        'reference': 'text',
        'reference_content': 'text',
        # Header / footer (2)
        'header': 'header',
        'footer': 'footer',
        # Table (1)
        'table': 'table',
        # Images / charts (3)
        'image': 'image',
        'chart': 'image',
        'seal': 'image',
        # Formulas (2)
        'formula': 'equation',
        'formula_number': 'equation',
    }

    # PaddleOCR_VL title label -> MinerU ``text_level``.
    _PADDLE_VL_TITLE_LEVELS = {
        'doc_title': 1,
        'paragraph_title': 2,
        'figure_table_chart_title': 3,
    }

    # MinerU types that are routed through ``_process_text``.
    _TEXT_LIKE_TYPES = ('text', 'title', 'header', 'footer')

    def __init__(self, text_matcher: TextMatcher,
                 look_ahead_window: int = 10,
                 x_tolerance: int = 3,
                 y_tolerance: int = 10):
        """
        Args:
            text_matcher: Text matcher shared with the table-cell matcher.
            look_ahead_window: Look-ahead window passed to
                ``TextMatcher.find_matching_bbox``.
            x_tolerance: X-axis tolerance, used to decide whether text boxes
                belong to the same column.
            y_tolerance: Y-axis tolerance, used for row grouping.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance

        # Table-cell matcher built with the same matcher and tolerances.
        self.table_cell_matcher = TableCellMatcher(
            text_matcher=text_matcher,
            x_tolerance=x_tolerance,
            y_tolerance=y_tolerance
        )

    @staticmethod
    def _reading_order_key(item: Dict, bbox_key: str = 'bbox') -> Tuple[float, float]:
        """Return a ``(y, x)`` sort key for a block.

        Blocks whose bbox is missing, ``None`` or has fewer than two values
        sort last instead of raising (the converters in this class emit
        ``[]`` when the source block has no bbox).
        """
        bbox = item.get(bbox_key)
        if bbox is not None and len(bbox) >= 2:
            return (bbox[1], bbox[0])
        return (float('inf'), float('inf'))

    def process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict],
                            rotation_angle: float,
                            orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        Process MinerU data and add bbox information.

        The input list and its dicts are not modified: blocks are visited in
        (y, x) order of their bbox and a new list of new dicts is returned.

        Args:
            mineru_data: MinerU blocks.
            paddle_text_boxes: PaddleOCR text boxes.
            rotation_angle: Rotation angle; when non-zero, table bboxes are
                rotated before matching and cell bboxes rotated back after.
            orig_image_size: Original image size, forwarded to the
                ``BBoxExtractor`` rotation helpers.

        Returns:
            Merged data. Table cells carry PaddleOCR bboxes; other block
            types only advance the pointer and keep their MinerU bbox.
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0

        for item in sorted(mineru_data or [], key=self._reading_order_key):
            item_type = item.get('type', '')

            if item_type == 'table':
                merged_item, paddle_pointer = self._process_table_with_rotation(
                    item, paddle_text_boxes, paddle_pointer,
                    rotation_angle, orig_image_size
                )
            elif item_type in self._TEXT_LIKE_TYPES:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            else:
                merged_item = item.copy()

            merged_data.append(merged_item)

        return merged_data

    def _process_table_with_rotation(self, item: Dict,
                                     paddle_text_boxes: List[Dict],
                                     start_pointer: int,
                                     rotation_angle: float,
                                     orig_image_size: Tuple[int, int]) -> Tuple[Dict, int]:
        """Match table cells, converting coordinates around the match when rotated.

        Returns:
            ``(merged_item, new_pointer)``; ``merged_item`` always carries
            the table's original bbox.
        """
        if rotation_angle == 0:
            return self._process_table(item, paddle_text_boxes, start_pointer)

        rotated_item = item.copy()
        table_bbox = item.get('bbox')
        # Only rotate a usable bbox; tables without one are still matched.
        if table_bbox is not None and len(table_bbox) >= 4:
            rotated_item['bbox'] = BBoxExtractor.rotate_box_coordinates(
                table_bbox, rotation_angle, orig_image_size
            )

        merged_item, new_pointer = self._process_table(
            rotated_item, paddle_text_boxes, start_pointer
        )

        # Matched cell bboxes are in rotated coordinates; convert them back
        # to original-image coordinates.
        for cell in merged_item.get('table_cells', []):
            cell_bbox = cell.get('bbox', [])
            if cell_bbox:
                cell['bbox'] = BBoxExtractor.inverse_rotate_box_coordinates(
                    cell_bbox, rotation_angle, orig_image_size
                )

        # Keep the table's own bbox exactly as it came in.
        if 'bbox' in item:
            merged_item['bbox'] = item['bbox']
        return merged_item, new_pointer

    def process_dotsocr_data(self, dotsocr_data: List[Dict],
                             paddle_text_boxes: List[Dict],
                             rotation_angle: float,
                             orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        Process DotsOCR data by converting it to MinerU format and reusing
        ``process_mineru_data``.

        Args:
            dotsocr_data: DotsOCR output blocks.
            paddle_text_boxes: PaddleOCR text boxes.
            rotation_angle: Rotation angle.
            orig_image_size: Original image size.

        Returns:
            MinerU-format data (with ``table_cells`` bboxes).
        """
        print(f"📊 处理 DotsOCR 数据: {len(dotsocr_data)} 个块")

        # Step 1: convert to MinerU format. Conversion is best-effort: a
        # block that fails to convert is reported and skipped.
        mineru_format_data = []
        for item in dotsocr_data:
            try:
                converted_item = self._convert_dotsocr_to_mineru(item)
                if converted_item:
                    mineru_format_data.append(converted_item)
            except Exception as e:
                print(f"⚠️ DotsOCR 转换失败: {e}")
                continue

        print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")

        # Step 2: reuse the MinerU processing logic.
        return self.process_mineru_data(
            mineru_data=mineru_format_data,
            paddle_text_boxes=paddle_text_boxes,
            rotation_angle=rotation_angle,
            orig_image_size=orig_image_size
        )

    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
        """
        Convert one DotsOCR block to MinerU format.

        DotsOCR:
            {"category": "Table", "bbox": [x1, y1, x2, y2], "text": "..."}

        MinerU:
            {"type": "table", "bbox": [x1, y1, x2, y2],
             "table_body": "...", "page_idx": 0}
        """
        category = dotsocr_item.get('category', '')
        mineru_type = self._DOTSOCR_CATEGORY_MAP.get(category, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': dotsocr_item.get('bbox', []),
            'page_idx': 0  # DotsOCR is treated as single-page
        }

        # Tables carry their content in 'table_body', everything else in 'text'.
        text = dotsocr_item.get('text', '')
        if mineru_type == 'table':
            mineru_item['table_body'] = text
        else:
            mineru_item['text'] = text

        # Heading level
        if category == 'Section-header':
            mineru_item['text_level'] = 1

        return mineru_item

    def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                  paddle_text_boxes: List[Dict],
                                  rotation_angle: float,
                                  orig_image_size: Tuple[int, int]) -> List[Dict]:
        """
        Process PaddleOCR_VL data and add bbox information.

        The ``parsing_res_list`` inside ``paddleocr_vl_data`` is not
        reordered or modified.

        Args:
            paddleocr_vl_data: PaddleOCR_VL data (JSON object).
            paddle_text_boxes: PaddleOCR text boxes.
            rotation_angle: Rotation angle forwarded to
                ``process_mineru_data``.
            orig_image_size: Original image size forwarded to
                ``process_mineru_data``.

        Returns:
            Merged data in MinerU format (the unified output format).
        """
        # Rotation angle and original image size as reported by the VL data.
        vl_rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
        vl_orig_image_size = (0, 0)

        if vl_rotation_angle != 0:
            vl_orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
            print(f"🔄 PaddleOCR_VL 检测到旋转角度: {vl_rotation_angle}°")
            print(f"📐 原始图像尺寸: {vl_orig_image_size[0]} x {vl_orig_image_size[1]}")

        # Visit blocks in (y, x) order of their block_bbox.
        parsing_res_list = paddleocr_vl_data.get('parsing_res_list') or []
        ordered_blocks = sorted(
            parsing_res_list,
            key=lambda block: self._reading_order_key(block, 'block_bbox')
        )

        # The guard checks the size that is actually passed to the transform
        # (the VL-reported one), not the ``orig_image_size`` parameter.
        needs_transform = vl_rotation_angle != 0 and bool(vl_orig_image_size)

        mineru_format_data = []
        for item in ordered_blocks:
            # Convert the bbox coordinates first (if needed).
            if needs_transform:
                item = self._transform_vl_block_bbox(
                    item, vl_rotation_angle, vl_orig_image_size
                )
            converted_item = self._convert_paddleocr_vl_to_mineru(item)
            if converted_item:
                mineru_format_data.append(converted_item)

        print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")

        # Reuse the MinerU processing logic.
        return self.process_mineru_data(
            mineru_data=mineru_format_data,
            paddle_text_boxes=paddle_text_boxes,
            rotation_angle=rotation_angle,
            orig_image_size=orig_image_size
        )

    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
        """Get the rotation angle from PaddleOCR_VL data (delegates to BBoxExtractor)."""
        return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)

    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
        """Get the original image size from PaddleOCR_VL data (delegates to BBoxExtractor)."""
        return BBoxExtractor._get_original_image_size(paddleocr_vl_data)

    def _transform_vl_block_bbox(self, item: Dict, angle: float,
                                 orig_image_size: tuple) -> Dict:
        """
        Convert the ``block_bbox`` of a PaddleOCR_VL block.

        Args:
            item: PaddleOCR_VL block.
            angle: Rotation angle.
            orig_image_size: Original image size.

        Returns:
            A shallow copy of ``item``; its ``block_bbox`` is replaced by the
            result of ``BBoxExtractor.inverse_rotate_box_coordinates`` when
            the block has a bbox with at least four values, otherwise the
            copy is returned unchanged.
        """
        transformed_item = item.copy()

        block_bbox = item.get('block_bbox')
        if block_bbox is None or len(block_bbox) < 4:
            return transformed_item

        transformed_item['block_bbox'] = BBoxExtractor.inverse_rotate_box_coordinates(
            block_bbox, angle, orig_image_size
        )
        return transformed_item

    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
        """
        Convert one PaddleOCR_VL block to MinerU format.

        Based on the 20 PP-DocLayout_plus-L classes.
        """
        block_label = paddleocr_vl_item.get('block_label', '')
        mineru_type = self._PADDLE_VL_LABEL_MAP.get(block_label, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': paddleocr_vl_item.get('block_bbox', []),
            'page_idx': 0
        }

        # Tables carry their content in 'table_body', everything else in 'text'.
        content = paddleocr_vl_item.get('block_content', '')
        if mineru_type == 'table':
            mineru_item['table_body'] = content
        else:
            mineru_item['text'] = content

        # Heading level (only set for the three title labels).
        text_level = self._PADDLE_VL_TITLE_LEVELS.get(block_label)
        if text_level is not None:
            mineru_item['text_level'] = text_level

        return mineru_item

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
        """
        Process a table block (MinerU format).

        Strategy:
        - hand the HTML table to ``TableCellMatcher``, which matches
          PaddleOCR bboxes to its cells;
        - return the processed table and the new pointer position.

        ``item`` is not modified; a shallow copy is returned. On an empty
        ``table_body`` or on any failure the copy is returned together with
        the unchanged ``start_pointer``.
        """
        merged_item = item.copy()

        table_body = merged_item.get('table_body', '')
        if not table_body:
            print("⚠️ 表格内容为空,跳过")
            return merged_item, start_pointer

        try:
            # The table bbox is passed so the matcher can filter candidates.
            table_bbox = merged_item.get('bbox')

            enhanced_html, cells, new_pointer = \
                self.table_cell_matcher.enhance_table_html_with_bbox(
                    table_body,
                    paddle_text_boxes,
                    start_pointer,
                    table_bbox
                )

            merged_item['table_body'] = enhanced_html
            merged_item['table_cells'] = cells

            # Statistics: matched cells vs. <td>/<th> cells in the source HTML.
            matched_count = len(cells)
            total_cells = len(
                BeautifulSoup(table_body, 'html.parser').find_all(['td', 'th'])
            )
            print(f" 表格单元格: {matched_count}/{total_cells} 匹配")

            return merged_item, new_pointer

        except Exception as e:
            # Deliberately best-effort: report and keep the pointer where it was.
            print(f"⚠️ 表格处理失败: {e}")
            import traceback
            traceback.print_exc()
            return merged_item, start_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a text-like block.

        The matched PaddleOCR box (if any) is flagged ``used``; the returned
        copy of ``item`` keeps its own bbox.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)``.
        """
        merged_item = item.copy()
        text = item.get('text', '')

        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )

        if matched_bbox:
            matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a list block.

        Each entry of ``list_items`` is matched in turn; matched PaddleOCR
        boxes are flagged ``used``. The returned copy of ``item`` keeps its
        own bbox.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)``.
        """
        merged_item = item.copy()

        for list_item in item.get('list_items') or []:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer,
                    last_matched_index, self.look_ahead_window
                )

            if matched_bbox:
                matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index