"""Data-processing module.

Merges layout-analysis output from MinerU / PaddleOCR_VL / DotsOCR with the
text boxes produced by PaddleOCR, attaching bbox information to table cells.
"""

from typing import Any, Callable, Dict, Iterable, List, Tuple

from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
    from .bbox_extractor import BBoxExtractor
except ImportError:
    from text_matcher import TextMatcher
    from bbox_extractor import BBoxExtractor


class DataProcessor:
    """Merge layout items with PaddleOCR text boxes.

    All three public ``process_*`` entry points emit items in MinerU format.
    Table cells receive the bbox of the PaddleOCR text box they were matched
    to; text and list items keep their own bbox and are only matched so that
    the pointer into the PaddleOCR boxes advances in reading order.
    """

    # Item types that are routed through ``_process_text`` for each source.
    _MINERU_TEXT_TYPES = ('text', 'title')
    _DOTSOCR_TEXT_TYPES = ('text', 'title', 'header', 'footer')
    _PADDLE_VL_TEXT_TYPES = ('text', 'title', 'header', 'footer', 'equation')

    # DotsOCR ``category`` -> MinerU ``type``.
    _DOTSOCR_CATEGORY_MAP = {
        'Page-header': 'header',
        'Page-footer': 'footer',
        'Picture': 'image',
        'Figure': 'image',
        'Section-header': 'title',
        'Table': 'table',
        'Text': 'text',
        'Title': 'title',
        'List': 'list',
        'Caption': 'title',
    }

    # PP-DocLayout_plus-L ``block_label`` (20 categories) -> MinerU ``type``.
    _PADDLE_VL_LABEL_MAP = {
        # Titles (3)
        'paragraph_title': 'title',
        'doc_title': 'title',
        'figure_table_chart_title': 'title',
        # Text (9)
        'text': 'text',
        'number': 'text',
        'content': 'text',
        'abstract': 'text',
        'footnote': 'text',
        'aside_text': 'text',
        'algorithm': 'text',
        'reference': 'text',
        'reference_content': 'text',
        # Header / footer (2)
        'header': 'header',
        'footer': 'footer',
        # Table (1)
        'table': 'table',
        # Image / chart (3)
        'image': 'image',
        'chart': 'image',
        'seal': 'image',
        # Formula (2)
        'formula': 'equation',
        'formula_number': 'equation',
    }

    # PaddleOCR_VL title labels -> MinerU ``text_level``.
    _PADDLE_VL_TITLE_LEVELS = {
        'doc_title': 1,
        'paragraph_title': 2,
        'figure_table_chart_title': 3,
    }

    def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
        """
        Args:
            text_matcher: Matcher used to locate the PaddleOCR box for a text.
            look_ahead_window: Look-ahead window forwarded to
                ``text_matcher.find_matching_bbox`` on every lookup.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _reading_order_key(item: Dict, bbox_key: str) -> Tuple[float, float]:
        """Return a ``(y1, x1)`` sort key for *item*.

        Items whose bbox is missing, ``None`` or shorter than two elements
        sort last instead of raising.
        """
        bbox = item.get(bbox_key)
        try:
            return (bbox[1], bbox[0])
        except (TypeError, IndexError, KeyError):
            return (float('inf'), float('inf'))

    def _merge_items(
        self,
        items: Iterable[Dict],
        paddle_text_boxes: List[Dict],
        text_types: Tuple[str, ...],
        table_handler: Callable[[Dict, List[Dict], int], Tuple[Dict, int]],
    ) -> List[Dict]:
        """Dispatch MinerU-format *items* to the table/text/list handlers.

        Args:
            items: MinerU-format items, already in reading order. May be a
                lazy iterable; it is consumed exactly once.
            paddle_text_boxes: PaddleOCR text boxes to match against.
            text_types: ``type`` values handled by ``_process_text``.
            table_handler: Handler used for items whose ``type`` is 'table'.

        Returns:
            One merged item per input item, in input order.
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0

        for item in items:
            item_type = item.get('type', '')

            if item_type == 'table':
                merged_item, paddle_pointer = table_handler(
                    item, paddle_text_boxes, paddle_pointer
                )
            elif item_type in text_types:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            else:
                # Any other type (e.g. image) is passed through unchanged.
                merged_item = item.copy()

            merged_data.append(merged_item)

        return merged_data

    # ------------------------------------------------------------------
    # MinerU
    # ------------------------------------------------------------------

    def process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Process MinerU data and add bbox information.

        Args:
            mineru_data: MinerU items. The list is not modified.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items sorted top-to-bottom, then left-to-right. Table cells
            use the PaddleOCR bbox; other types only advance the pointer and
            keep their MinerU bbox.
        """
        # Sort a copy so the caller's list keeps its original order.
        ordered = sorted(
            mineru_data,
            key=lambda entry: self._reading_order_key(entry, 'bbox'),
        )
        return self._merge_items(
            ordered, paddle_text_boxes, self._MINERU_TEXT_TYPES, self._process_table
        )

    # ------------------------------------------------------------------
    # DotsOCR
    # ------------------------------------------------------------------

    def process_dotsocr_data(self, dotsocr_data: List[Dict],
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Process DotsOCR data: convert to MinerU format and add bbox info.

        Args:
            dotsocr_data: DotsOCR items. The list is not modified.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items in MinerU format, sorted top-to-bottom, then
            left-to-right.
        """
        ordered = sorted(
            dotsocr_data,
            key=lambda entry: self._reading_order_key(entry, 'bbox'),
        )
        # Converted lazily so conversion and matching stay interleaved per item.
        converted = (self._convert_dotsocr_to_mineru(entry) for entry in ordered)
        return self._merge_items(
            converted,
            paddle_text_boxes,
            self._DOTSOCR_TEXT_TYPES,
            self._process_dotsocr_table,
        )

    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
        """Convert one DotsOCR item to MinerU format.

        DotsOCR:
        {
            "category": "Table",
            "bbox": [x1, y1, x2, y2],
            "text": "..."
        }

        MinerU:
        {
            "type": "table",
            "bbox": [x1, y1, x2, y2],
            "table_body": "...",
            "page_idx": 0
        }

        Unknown categories are mapped to 'text'.
        """
        category = dotsocr_item.get('category', '')
        mineru_type = self._DOTSOCR_CATEGORY_MAP.get(category, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': dotsocr_item.get('bbox', []),
            'page_idx': 0,  # DotsOCR output is treated as a single page.
        }

        text = dotsocr_item.get('text', '')
        if mineru_type == 'table':
            # Tables carry their HTML in ``text``; MinerU expects ``table_body``.
            mineru_item['table_body'] = text
        else:
            mineru_item['text'] = text

        if category == 'Section-header':
            mineru_item['text_level'] = 1

        return mineru_item

    def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
                               start_pointer: int) -> Tuple[Dict, int]:
        """Process a DotsOCR table that was already converted to MinerU format.

        An item with an empty ``table_body`` is returned as a plain copy and
        the pointer is left untouched; otherwise the shared table logic in
        ``_process_table`` is reused.
        """
        if not item.get('table_body', ''):
            return item.copy(), start_pointer

        return self._process_table(item, paddle_text_boxes, start_pointer)

    # ------------------------------------------------------------------
    # PaddleOCR_VL
    # ------------------------------------------------------------------

    def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                  paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Process PaddleOCR_VL data and add bbox information.

        Args:
            paddleocr_vl_data: PaddleOCR_VL result (JSON object). Its
                ``parsing_res_list`` is not modified.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items in MinerU format (the unified output format).
        """
        # Rotation angle and, when rotated, the original image size.
        rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
        orig_image_size = None

        if rotation_angle != 0:
            orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
            print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
            # The size may be unavailable; do not index into it blindly.
            if orig_image_size:
                print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")

        parsing_res_list = paddleocr_vl_data.get('parsing_res_list') or []
        ordered = sorted(
            parsing_res_list,
            key=lambda entry: self._reading_order_key(entry, 'block_bbox'),
        )

        def to_mineru(blocks: Iterable[Dict]) -> Iterable[Dict]:
            """Yield each block as a MinerU item, un-rotating its bbox first."""
            for block in blocks:
                if rotation_angle != 0 and orig_image_size:
                    block = self._transform_vl_block_bbox(
                        block, rotation_angle, orig_image_size
                    )
                yield self._convert_paddleocr_vl_to_mineru(block)

        return self._merge_items(
            to_mineru(ordered),
            paddle_text_boxes,
            self._PADDLE_VL_TEXT_TYPES,
            self._process_table,
        )

    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
        """Return the rotation angle reported for the PaddleOCR_VL data."""
        return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)

    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
        """Return the original image size reported for the PaddleOCR_VL data."""
        return BBoxExtractor._get_original_image_size(paddleocr_vl_data)

    def _transform_vl_block_bbox(self, item: Dict, angle: float,
                                 orig_image_size: tuple) -> Dict:
        """Transform the ``block_bbox`` of a PaddleOCR_VL block.

        Args:
            item: PaddleOCR_VL block.
            angle: Rotation angle.
            orig_image_size: Original image size.

        Returns:
            A shallow copy of *item* whose ``block_bbox`` is the axis-aligned
            bounding box of the transformed corners. The copy is returned
            unchanged when ``block_bbox`` is missing or has fewer than four
            values.
        """
        transformed_item = item.copy()

        if 'block_bbox' not in item:
            return transformed_item

        block_bbox = item['block_bbox']
        if len(block_bbox) < 4:
            return transformed_item

        # block_bbox is [x1, y1, x2, y2]; expand to four corners for rotation.
        poly = [
            [block_bbox[0], block_bbox[1]],  # top-left
            [block_bbox[2], block_bbox[1]],  # top-right
            [block_bbox[2], block_bbox[3]],  # bottom-right
            [block_bbox[0], block_bbox[3]],  # bottom-left
        ]

        transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
            poly, angle, orig_image_size
        )

        # Collapse the transformed corners back to an axis-aligned bbox.
        xs = [point[0] for point in transformed_poly]
        ys = [point[1] for point in transformed_poly]
        transformed_item['block_bbox'] = [min(xs), min(ys), max(xs), max(ys)]

        return transformed_item

    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
        """Convert one PaddleOCR_VL block to MinerU format.

        Based on the 20 categories of PP-DocLayout_plus-L. Unknown labels are
        mapped to 'text'.
        """
        block_label = paddleocr_vl_item.get('block_label', '')
        mineru_type = self._PADDLE_VL_LABEL_MAP.get(block_label, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': paddleocr_vl_item.get('block_bbox', []),
            'page_idx': 0,
        }

        content = paddleocr_vl_item.get('block_content', '')
        if mineru_type == 'table':
            mineru_item['table_body'] = content
        else:
            mineru_item['text'] = content

        # Title labels carry a heading level.
        text_level = self._PADDLE_VL_TITLE_LEVELS.get(block_label)
        if text_level is not None:
            mineru_item['text_level'] = text_level

        return mineru_item

    # ------------------------------------------------------------------
    # Per-type handlers
    # ------------------------------------------------------------------

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
        """Process a MinerU table.

        Returns:
            ``(merged_item, new_pointer)`` where ``merged_item`` is a shallow
            copy of *item* with ``table_body`` / ``table_body_with_bbox`` set
            to the bbox-annotated HTML, ``bbox_mapping`` set, and
            ``table_cells`` listing the matched cells.
        """
        merged_item = item.copy()
        # A missing or None body is treated as an empty table.
        table_html = item.get('table_body') or ''

        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
            table_html, paddle_text_boxes, start_pointer
        )

        merged_item['table_body'] = enhanced_html
        merged_item['table_body_with_bbox'] = enhanced_html
        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
        merged_item['table_cells'] = cells if cells else []

        return merged_item, new_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a text-like item.

        The item keeps its own bbox; the matched PaddleOCR box is only flagged
        as used and the pointers are advanced.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)``.
        """
        merged_item = item.copy()
        text = item.get('text', '')

        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )

        if matched_bbox:
            matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a list item by matching each entry of ``list_items`` in turn.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)``.
        """
        merged_item = item.copy()
        # A missing or None ``list_items`` is treated as an empty list.
        list_items = item.get('list_items') or []

        for list_item in list_items:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer, last_matched_index,
                    self.look_ahead_window
                )

            if matched_bbox:
                matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, List[Dict], int]:
        """Annotate the cells of an HTML table with bbox information.

        Every non-empty ``td``/``th`` is matched against the PaddleOCR boxes.
        Matched cells get ``data-bbox``, ``data-score`` and
        ``data-paddle-index`` attributes and are recorded in the returned cell
        list; unmatched cells are left untouched and not recorded.

        Returns:
            ``(annotated_html, cells, pointer_after_table)``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer
        last_matched_index = start_pointer
        cells = []

        for row_idx, row in enumerate(soup.find_all('tr')):
            # Empty cells are skipped but still count towards the column index.
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue

                matched_bbox, current_pointer, last_matched_index = \
                    self.text_matcher.find_matching_bbox(
                        cell_text, paddle_text_boxes, current_pointer,
                        last_matched_index, self.look_ahead_window
                    )

                if not matched_bbox:
                    continue

                bbox = matched_bbox['bbox']
                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                cell['data-score'] = f"{matched_bbox['score']:.4f}"
                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])

                # Row and column are reported 1-based.
                cells.append({
                    'type': 'table_cell',
                    'text': cell_text,
                    'bbox': bbox,
                    'row': row_idx + 1,
                    'col': col_idx + 1,
                    'score': matched_bbox['score'],
                    'paddle_bbox_index': matched_bbox['paddle_bbox_index'],
                })

                matched_bbox['used'] = True

        return str(soup), cells, current_pointer