"""Data processing module.

Processes MinerU data and attaches bbox information taken from PaddleOCR
text boxes.
"""
from typing import List, Dict, Tuple

from bs4 import BeautifulSoup

try:
    from .text_matcher import TextMatcher
except ImportError:
    from text_matcher import TextMatcher


class DataProcessor:
    """Merge MinerU layout items with PaddleOCR text boxes."""

    def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10):
        """Store the collaborators used for matching.

        Args:
            text_matcher: Matcher whose ``find_matching_bbox`` is called for
                every text fragment.
            look_ahead_window: Look-ahead window size forwarded unchanged to
                ``find_matching_bbox``.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window

    @staticmethod
    def _reading_order_key(item: Dict) -> Tuple[float, float]:
        """Return the ``(bbox[1], bbox[0])`` sort key for *item*.

        Items without a usable bbox (missing, ``None``, or fewer than two
        values) get an infinite key so they sort after everything else.
        """
        bbox = item.get('bbox')
        if not bbox or len(bbox) < 2:
            return (float('inf'), float('inf'))
        return (bbox[1], bbox[0])

    def process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Process MinerU items and merge in PaddleOCR match information.

        Items are visited in ``(bbox[1], bbox[0])`` order. ``table``,
        ``text``/``title`` and ``list`` items are routed to their dedicated
        handlers; any other type is shallow-copied through unchanged.

        Args:
            mineru_data: MinerU items. The list itself is not reordered;
                a sorted copy is iterated instead.
            paddle_text_boxes: PaddleOCR text boxes. Matched boxes are
                flagged in place with ``used = True`` by the handlers.

        Returns:
            A new list with one (shallow-copied) item per input item, in
            sorted order.
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0

        # sorted() keeps the caller's list untouched; it is stable, so the
        # resulting order is the same as the former in-place sort.
        ordered_items = sorted(mineru_data, key=self._reading_order_key)

        for item in ordered_items:
            item_type = item.get('type', '')

            if item_type == 'table':
                merged_item, paddle_pointer = self._process_table(
                    item, paddle_text_boxes, paddle_pointer
                )
                merged_data.append(merged_item)
            elif item_type in ('text', 'title'):
                merged_item, paddle_pointer, last_matched_index = \
                    self._process_text(
                        item, paddle_text_boxes,
                        paddle_pointer, last_matched_index
                    )
                merged_data.append(merged_item)
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = \
                    self._process_list(
                        item, paddle_text_boxes,
                        paddle_pointer, last_matched_index
                    )
                merged_data.append(merged_item)
            else:
                merged_data.append(item.copy())

        return merged_data

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
        """Process a table item.

        Args:
            item: MinerU table item; its HTML is read from ``table_body``.
            paddle_text_boxes: PaddleOCR text boxes to match cells against.
            start_pointer: Pointer into ``paddle_text_boxes`` to start from.

        Returns:
            ``(merged_item, new_pointer)`` where ``merged_item`` is a shallow
            copy of *item* whose ``table_body`` and ``table_body_with_bbox``
            both hold the annotated HTML, plus ``bbox_mapping`` and
            ``table_cells``.
        """
        merged_item = item.copy()
        table_html = item.get('table_body', '')

        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
            table_html, paddle_text_boxes, start_pointer
        )

        merged_item['table_body'] = enhanced_html
        merged_item['table_body_with_bbox'] = enhanced_html
        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
        # `cells` is always a list (possibly empty), so no fallback is needed.
        merged_item['table_cells'] = cells

        return merged_item, new_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a text or title item.

        Looks up a matching PaddleOCR box for ``item['text']`` and, when one
        is found, flags that box with ``used = True``.

        Args:
            item: MinerU text/title item.
            paddle_text_boxes: PaddleOCR text boxes to match against.
            paddle_pointer: Current pointer passed to the matcher.
            last_matched_index: Last matched index passed to the matcher.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)`` with the
            pointer values as returned by the matcher.
        """
        merged_item = item.copy()
        text = item.get('text', '')

        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer,
                last_matched_index, self.look_ahead_window
            )

        # NOTE(review): the matched bbox is not copied onto `merged_item`;
        # only the Paddle box is flagged. Confirm this is intentional.
        if matched_bbox:
            matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int,
                      last_matched_index: int) -> Tuple[Dict, int, int]:
        """Process a list item.

        Each entry of ``item['list_items']`` is matched in order, threading
        the pointer state from one lookup to the next; every matched
        PaddleOCR box is flagged with ``used = True``.

        Args:
            item: MinerU list item.
            paddle_text_boxes: PaddleOCR text boxes to match against.
            paddle_pointer: Current pointer passed to the matcher.
            last_matched_index: Last matched index passed to the matcher.

        Returns:
            ``(merged_item, paddle_pointer, last_matched_index)`` with the
            pointer values left by the final lookup.
        """
        merged_item = item.copy()
        # Treat an explicit None the same as a missing key.
        list_items = item.get('list_items') or []

        for list_item in list_items:
            matched_bbox, paddle_pointer, last_matched_index = \
                self.text_matcher.find_matching_bbox(
                    list_item, paddle_text_boxes, paddle_pointer,
                    last_matched_index, self.look_ahead_window
                )

            if matched_bbox:
                matched_bbox['used'] = True

        return merged_item, paddle_pointer, last_matched_index

    def _enhance_table_html_with_bbox(
            self, html: str, paddle_text_boxes: List[Dict],
            start_pointer: int) -> Tuple[str, List[Dict], int]:
        """Annotate the cells of an HTML table with bbox information.

        Every non-empty ``td``/``th`` cell is matched against the PaddleOCR
        boxes. Matched cells receive ``data-bbox``, ``data-score`` and
        ``data-paddle-index`` attributes and are recorded in the returned
        cell list; unmatched cells are left untouched and not recorded.

        Args:
            html: Table HTML; ``None`` is treated as an empty string.
            paddle_text_boxes: PaddleOCR text boxes to match against.
            start_pointer: Pointer into ``paddle_text_boxes`` to start from.

        Returns:
            ``(annotated_html, cells, pointer)`` where ``cells`` holds one
            dict per matched cell (1-based ``row``/``col``) and ``pointer``
            is the pointer value after the last lookup.
        """
        soup = BeautifulSoup(html or '', 'html.parser')
        current_pointer = start_pointer
        last_matched_index = start_pointer
        cells = []

        for row_idx, row in enumerate(soup.find_all('tr')):
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue

                matched_bbox, current_pointer, last_matched_index = \
                    self.text_matcher.find_matching_bbox(
                        cell_text, paddle_text_boxes, current_pointer,
                        last_matched_index, self.look_ahead_window
                    )

                # Cells that fail to match are intentionally left out of
                # `cells` and keep their original attributes.
                if not matched_bbox:
                    continue

                bbox = matched_bbox['bbox']
                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                cell['data-score'] = f"{matched_bbox['score']:.4f}"
                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])

                # Record the full cell information alongside the HTML.
                cells.append({
                    'type': 'table_cell',
                    'text': cell_text,
                    'bbox': bbox,
                    'row': row_idx + 1,
                    'col': col_idx + 1,
                    'score': matched_bbox['score'],
                    'paddle_bbox_index': matched_bbox['paddle_bbox_index'],
                })

                matched_bbox['used'] = True

        return str(soup), cells, current_pointer