| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495 |
- """
- 数据处理模块
- 负责处理 MinerU/PaddleOCR_VL/DotsOCR 数据,添加 bbox 信息
- """
- from typing import List, Dict, Tuple
- from bs4 import BeautifulSoup
- try:
- from .text_matcher import TextMatcher
- from .bbox_extractor import BBoxExtractor
- except ImportError:
- from text_matcher import TextMatcher
- from bbox_extractor import BBoxExtractor
class DataProcessor:
    """Merge layout-model output with PaddleOCR text boxes.

    Accepts MinerU, DotsOCR or PaddleOCR_VL layout items, normalises them to
    MinerU-style dicts and walks them in reading order while matching their
    text against a list of PaddleOCR text boxes.

    * Table cells receive the bbox of the PaddleOCR box they matched
      (written into the HTML as ``data-*`` attributes and listed in
      ``table_cells``).
    * Text, title and list items only advance the matching pointer and flag
      the matched PaddleOCR box as ``used``; they keep their own bbox.
    """

    # Item types whose text is matched against the PaddleOCR boxes,
    # one tuple per source format.
    _MINERU_TEXT_TYPES = ('text', 'title')
    _DOTSOCR_TEXT_TYPES = ('text', 'title', 'header', 'footer')
    _PADDLEOCR_VL_TEXT_TYPES = ('text', 'title', 'header', 'footer', 'equation')

    # DotsOCR ``category`` -> MinerU ``type``. Unknown categories become 'text'.
    _DOTSOCR_CATEGORY_MAP = {
        'Page-header': 'header',
        'Page-footer': 'footer',
        'Picture': 'image',
        'Figure': 'image',
        'Section-header': 'title',
        'Table': 'table',
        'Text': 'text',
        'Title': 'title',
        'List': 'list',
        'Caption': 'title',
    }

    # PaddleOCR_VL ``block_label`` -> MinerU ``type`` (the 20 labels the
    # original author lists for PP-DocLayout_plus-L). Unknown labels become
    # 'text'.
    _PADDLEOCR_VL_LABEL_MAP = {
        # Titles (3)
        'paragraph_title': 'title',
        'doc_title': 'title',
        'figure_table_chart_title': 'title',
        # Plain text (9)
        'text': 'text',
        'number': 'text',
        'content': 'text',
        'abstract': 'text',
        'footnote': 'text',
        'aside_text': 'text',
        'algorithm': 'text',
        'reference': 'text',
        'reference_content': 'text',
        # Page header / footer (2)
        'header': 'header',
        'footer': 'footer',
        # Table (1)
        'table': 'table',
        # Images / charts (3)
        'image': 'image',
        'chart': 'image',
        'seal': 'image',
        # Formulas (2)
        'formula': 'equation',
        'formula_number': 'equation',
    }

    # ``text_level`` assigned to the PaddleOCR_VL title labels.
    _PADDLEOCR_VL_TITLE_LEVELS = {
        'doc_title': 1,
        'paragraph_title': 2,
        'figure_table_chart_title': 3,
    }

    def __init__(self, text_matcher: "TextMatcher", look_ahead_window: int = 10):
        """
        Args:
            text_matcher: Matcher exposing ``find_matching_bbox``.
            look_ahead_window: Window size forwarded to every
                ``find_matching_bbox`` call.
        """
        self.text_matcher = text_matcher
        self.look_ahead_window = look_ahead_window

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------

    def process_mineru_data(self, mineru_data: List[Dict],
                            paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Merge MinerU items with PaddleOCR text boxes.

        Args:
            mineru_data: MinerU items. The list is not modified; a sorted
                copy is processed.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items ordered by (bbox top, bbox left). Table cells use
            the PaddleOCR bbox; other types keep the MinerU bbox and only
            advance the matching pointer.
        """
        ordered = sorted(mineru_data, key=self._reading_order_key)
        return self._merge_items(
            ordered, paddle_text_boxes,
            self._MINERU_TEXT_TYPES, self._process_table
        )

    def process_dotsocr_data(self, dotsocr_data: List[Dict],
                             paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Convert DotsOCR items to MinerU format and merge bbox information.

        Args:
            dotsocr_data: DotsOCR items. The list is not modified; a sorted
                copy is processed.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items in MinerU format, ordered by (bbox top, bbox left).
        """
        ordered = sorted(dotsocr_data, key=self._reading_order_key)
        converted = [self._convert_dotsocr_to_mineru(entry) for entry in ordered]
        # Page-header / Page-footer are mapped to 'header' / 'footer' and are
        # therefore matched as text; only 'image' (Picture / Figure) items
        # pass through unmatched.
        return self._merge_items(
            converted, paddle_text_boxes,
            self._DOTSOCR_TEXT_TYPES, self._process_dotsocr_table
        )

    def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
                                  paddle_text_boxes: List[Dict]) -> List[Dict]:
        """Convert PaddleOCR_VL blocks to MinerU format and merge bbox information.

        Args:
            paddleocr_vl_data: PaddleOCR_VL result object; its
                ``parsing_res_list`` entry is read but not modified.
            paddle_text_boxes: PaddleOCR text boxes.

        Returns:
            Merged items in MinerU format (the unified output format).
        """
        rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
        orig_image_size = None

        if rotation_angle != 0:
            orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
            print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
            # The size may be unavailable; the transform below is skipped in
            # that case, so the log line must not index into it either.
            if orig_image_size:
                print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")

        blocks = sorted(
            paddleocr_vl_data.get('parsing_res_list') or [],
            key=lambda block: self._reading_order_key(block, 'block_bbox')
        )

        needs_transform = rotation_angle != 0 and bool(orig_image_size)
        converted = []
        for block in blocks:
            # Blocks are sorted on their reported coordinates first and only
            # then mapped back to the original image (same order as before).
            if needs_transform:
                block = self._transform_vl_block_bbox(
                    block, rotation_angle, orig_image_size
                )
            converted.append(self._convert_paddleocr_vl_to_mineru(block))

        return self._merge_items(
            converted, paddle_text_boxes,
            self._PADDLEOCR_VL_TEXT_TYPES, self._process_table
        )

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _reading_order_key(item: Dict, bbox_key: str = 'bbox') -> Tuple[float, float]:
        """Return the (top, left) sort key of ``item``.

        Items whose bbox is missing, ``None`` or shorter than two values
        sort last instead of raising.
        """
        bbox = item.get(bbox_key)
        if bbox is None or len(bbox) < 2:
            return (float('inf'), float('inf'))
        return (bbox[1], bbox[0])

    def _merge_items(self, items: List[Dict], paddle_text_boxes: List[Dict],
                     text_types, table_handler) -> List[Dict]:
        """Dispatch each MinerU-format item to the handler for its type.

        Args:
            items: MinerU-format items, already in processing order.
            paddle_text_boxes: PaddleOCR text boxes.
            text_types: Types handled by ``_process_text``.
            table_handler: Callable ``(item, boxes, pointer)`` returning
                ``(merged_item, new_pointer)`` for 'table' items.

        Returns:
            One merged item per input item, in the same order.
        """
        merged_data = []
        paddle_pointer = 0
        last_matched_index = 0

        for item in items:
            item_type = item.get('type', '')

            if item_type == 'table':
                merged_item, paddle_pointer = table_handler(
                    item, paddle_text_boxes, paddle_pointer
                )
            elif item_type in text_types:
                merged_item, paddle_pointer, last_matched_index = self._process_text(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            elif item_type == 'list':
                merged_item, paddle_pointer, last_matched_index = self._process_list(
                    item, paddle_text_boxes, paddle_pointer, last_matched_index
                )
            else:
                # Any other type (e.g. image) is passed through unmatched.
                merged_item = item.copy()

            merged_data.append(merged_item)

        return merged_data

    def _match_and_mark(self, text, paddle_text_boxes: List[Dict],
                        paddle_pointer: int, last_matched_index: int):
        """Run the text matcher once and flag a matched box as used.

        Returns:
            The ``(matched_box, pointer, last_matched_index)`` triple
            produced by ``text_matcher.find_matching_bbox``; ``matched_box``
            is falsy when nothing matched.
        """
        matched_bbox, paddle_pointer, last_matched_index = \
            self.text_matcher.find_matching_bbox(
                text, paddle_text_boxes, paddle_pointer, last_matched_index,
                self.look_ahead_window
            )

        if matched_bbox:
            matched_bbox['used'] = True

        return matched_bbox, paddle_pointer, last_matched_index

    # ------------------------------------------------------------------
    # DotsOCR
    # ------------------------------------------------------------------

    def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
        """Convert one DotsOCR item to MinerU format.

        DotsOCR:
            {"category": "Table", "bbox": [x1, y1, x2, y2], "text": "..."}

        MinerU:
            {"type": "table", "bbox": [x1, y1, x2, y2],
             "table_body": "...", "page_idx": 0}

        The text goes to ``table_body`` for tables and to ``text`` for every
        other type. 'Section-header' items additionally get ``text_level`` 1.
        """
        category = dotsocr_item.get('category', '')
        mineru_type = self._DOTSOCR_CATEGORY_MAP.get(category, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': dotsocr_item.get('bbox', []),
            'page_idx': 0  # DotsOCR output is treated as a single page
        }

        content_key = 'table_body' if mineru_type == 'table' else 'text'
        mineru_item[content_key] = dotsocr_item.get('text', '')

        if category == 'Section-header':
            mineru_item['text_level'] = 1

        return mineru_item

    def _process_dotsocr_table(self, item: Dict, paddle_text_boxes: List[Dict],
                               start_pointer: int) -> Tuple[Dict, int]:
        """Handle a DotsOCR table that was already converted to MinerU format.

        A table without HTML is returned as a plain copy with the pointer
        unchanged; otherwise the shared table logic is reused.
        """
        if not item.get('table_body', ''):
            return item.copy(), start_pointer

        return self._process_table(item, paddle_text_boxes, start_pointer)

    # ------------------------------------------------------------------
    # PaddleOCR_VL
    # ------------------------------------------------------------------

    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
        """Read the rotation angle from PaddleOCR_VL data via BBoxExtractor."""
        return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)

    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
        """Read the original image size from PaddleOCR_VL data via BBoxExtractor."""
        return BBoxExtractor._get_original_image_size(paddleocr_vl_data)

    def _transform_vl_block_bbox(self, item: Dict, angle: float,
                                 orig_image_size: tuple) -> Dict:
        """Map a block's ``block_bbox`` through the inverse rotation.

        Args:
            item: PaddleOCR_VL block.
            angle: Rotation angle.
            orig_image_size: Original image size.

        Returns:
            A shallow copy of ``item``. ``block_bbox`` is replaced by the
            axis-aligned bounds of the transformed corners; the copy is
            returned unchanged when ``block_bbox`` is missing or has fewer
            than four values.
        """
        transformed_item = item.copy()

        if 'block_bbox' not in item:
            return transformed_item

        block_bbox = item['block_bbox']
        if len(block_bbox) < 4:
            return transformed_item

        # Expand [x1, y1, x2, y2] to its four corners so they can be rotated.
        left, top, right, bottom = block_bbox[:4]
        corners = [
            [left, top],      # top-left
            [right, top],     # top-right
            [right, bottom],  # bottom-right
            [left, bottom]    # bottom-left
        ]

        rotated = BBoxExtractor._inverse_rotate_coordinates(
            corners, angle, orig_image_size
        )

        # Collapse the rotated polygon back to an axis-aligned bbox.
        xs = [point[0] for point in rotated]
        ys = [point[1] for point in rotated]
        transformed_item['block_bbox'] = [min(xs), min(ys), max(xs), max(ys)]

        return transformed_item

    def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
        """Convert one PaddleOCR_VL block to MinerU format.

        ``block_content`` goes to ``table_body`` for tables and to ``text``
        otherwise. Title labels get a ``text_level`` (doc_title 1,
        paragraph_title 2, figure_table_chart_title 3).
        """
        block_label = paddleocr_vl_item.get('block_label', '')
        mineru_type = self._PADDLEOCR_VL_LABEL_MAP.get(block_label, 'text')

        mineru_item = {
            'type': mineru_type,
            'bbox': paddleocr_vl_item.get('block_bbox', []),
            'page_idx': 0
        }

        content_key = 'table_body' if mineru_type == 'table' else 'text'
        mineru_item[content_key] = paddleocr_vl_item.get('block_content', '')

        text_level = self._PADDLEOCR_VL_TITLE_LEVELS.get(block_label)
        if text_level is not None:
            mineru_item['text_level'] = text_level

        return mineru_item

    # ------------------------------------------------------------------
    # Per-type handlers
    # ------------------------------------------------------------------

    def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],
                       start_pointer: int) -> Tuple[Dict, int]:
        """Annotate a MinerU table's HTML with matched cell bboxes.

        Returns:
            ``(merged_item, new_pointer)``. ``merged_item`` is a copy of
            ``item`` whose ``table_body`` and ``table_body_with_bbox`` hold
            the annotated HTML and whose ``table_cells`` lists the matched
            cells.
        """
        merged_item = item.copy()
        table_html = item.get('table_body', '')

        enhanced_html, cells, new_pointer = self._enhance_table_html_with_bbox(
            table_html, paddle_text_boxes, start_pointer
        )

        merged_item['table_body'] = enhanced_html
        merged_item['table_body_with_bbox'] = enhanced_html
        merged_item['bbox_mapping'] = 'merged_from_paddle_ocr'
        merged_item['table_cells'] = cells if cells else []

        return merged_item, new_pointer

    def _process_text(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Match a text item against the PaddleOCR boxes.

        The item keeps its own bbox; a matched box is only flagged as used
        and the pointers are advanced.
        """
        merged_item = item.copy()

        _, paddle_pointer, last_matched_index = self._match_and_mark(
            item.get('text', ''), paddle_text_boxes,
            paddle_pointer, last_matched_index
        )

        return merged_item, paddle_pointer, last_matched_index

    def _process_list(self, item: Dict, paddle_text_boxes: List[Dict],
                      paddle_pointer: int, last_matched_index: int) -> Tuple[Dict, int, int]:
        """Match every entry of ``list_items`` against the PaddleOCR boxes.

        The item keeps its own bbox; matched boxes are only flagged as used
        and the pointers are advanced entry by entry.
        """
        merged_item = item.copy()

        # ``or []`` also covers a ``list_items`` key that is present but None.
        for list_item in item.get('list_items') or []:
            _, paddle_pointer, last_matched_index = self._match_and_mark(
                list_item, paddle_text_boxes, paddle_pointer, last_matched_index
            )

        return merged_item, paddle_pointer, last_matched_index

    def _enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                      start_pointer: int) -> Tuple[str, List[Dict], int]:
        """Attach PaddleOCR bbox data to the cells of an HTML table.

        Every non-empty ``td`` / ``th`` is matched in document order. A
        matched cell gets ``data-bbox``, ``data-score`` and
        ``data-paddle-index`` attributes and a record in the returned list;
        unmatched cells are left untouched and are not recorded.

        Returns:
            ``(annotated_html, cells, new_pointer)``. Empty or ``None`` HTML
            yields ``('', [], start_pointer)`` without invoking the parser.
        """
        if not html:
            return '', [], start_pointer

        soup = BeautifulSoup(html, 'html.parser')
        current_pointer = start_pointer
        last_matched_index = start_pointer
        cells = []

        for row_idx, row in enumerate(soup.find_all('tr')):
            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
                cell_text = cell.get_text(strip=True)
                if not cell_text:
                    continue

                matched_bbox, current_pointer, last_matched_index = \
                    self._match_and_mark(
                        cell_text, paddle_text_boxes,
                        current_pointer, last_matched_index
                    )
                if not matched_bbox:
                    continue

                bbox = matched_bbox['bbox']
                cell['data-bbox'] = f"[{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}]"
                cell['data-score'] = f"{matched_bbox['score']:.4f}"
                cell['data-paddle-index'] = str(matched_bbox['paddle_bbox_index'])

                # Rows and columns are reported 1-based.
                cells.append({
                    'type': 'table_cell',
                    'text': cell_text,
                    'bbox': bbox,
                    'row': row_idx + 1,
                    'col': col_idx + 1,
                    'score': matched_bbox['score'],
                    'paddle_bbox_index': matched_bbox['paddle_bbox_index']
                })

        return str(soup), cells, current_pointer
|