|
|
@@ -17,6 +17,19 @@ except ImportError:
|
|
|
|
|
|
class DataProcessor:
|
|
|
"""数据处理器"""
|
|
|
+ """_summary_
|
|
|
+ 1.负责处理 MinerU/PaddleOCR_VL/DotsOCR 数据,添加 table_cells bbox 信息, 其他类型的bbox信息依然使用vl自带的bbox
|
|
|
+ 2.由于不同OCR工具的输出格式不同,DataProcessor 需要包含多个处理方法,分别处理 MinerU、DotsOCR 和 PaddleOCR_VL 数据, 都先转换成mineru格式再添加table cells bbox信息
|
|
|
+ 3.使用 TextMatcher 进行文本匹配,TableCellMatcher 进行表单元格匹配
|
|
|
+ 4.最终输出统一的 MinerU 格式数据
|
|
|
+
|
|
|
+ 由于 VL 模型 MinerU、DotsOCR 的坐标都是原图坐标,不是旋转后的坐标;PaddleOCR_VL 使用的是旋转转换后的坐标;而 ppstructure 使用的 OCR 文本块是旋转后的坐标,
|
|
|
+ 因此在处理VL数据时,
|
|
|
+ 1.首先需要根据ppstructure的旋转角度和原图尺寸,将VL的table坐标转换为旋转后的坐标
|
|
|
+ 2.通过TableCellMatcher 进行表单元格匹配
|
|
|
+ 3.再将匹配到的单元格bbox逆向转换为原图坐标,存储在最终输出的MinerU格式数据中
|
|
|
+ 4.其他类型的bbox信息依然使用vl自带的bbox
|
|
|
+ """
|
|
|
|
|
|
def __init__(self, text_matcher: TextMatcher, look_ahead_window: int = 10, x_tolerance: int = 3, y_tolerance: int = 10):
|
|
|
"""
|
|
|
@@ -39,7 +52,7 @@ class DataProcessor:
|
|
|
)
|
|
|
|
|
|
def process_mineru_data(self, mineru_data: List[Dict],
|
|
|
- paddle_text_boxes: List[Dict]) -> List[Dict]:
|
|
|
+ paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
|
|
|
"""
|
|
|
处理 MinerU 数据,添加 bbox 信息
|
|
|
|
|
|
@@ -64,12 +77,27 @@ class DataProcessor:
|
|
|
item_type = item.get('type', '')
|
|
|
|
|
|
if item_type == 'table':
|
|
|
+ if rotation_angle != 0:
|
|
|
+ inverse_table_bbox = BBoxExtractor.rotate_box_coordinates(item['bbox'], rotation_angle, orig_image_size)
|
|
|
+ inverse_item = item.copy()
|
|
|
+ inverse_item['bbox'] = inverse_table_bbox
|
|
|
+ else:
|
|
|
+ inverse_item = item
|
|
|
merged_item, paddle_pointer = self._process_table(
|
|
|
- item, paddle_text_boxes, paddle_pointer
|
|
|
+ inverse_item, paddle_text_boxes, paddle_pointer
|
|
|
)
|
|
|
+ # 如果有旋转,需要将匹配到的单元格bbox逆向转换为原图坐标
|
|
|
+ if rotation_angle != 0:
|
|
|
+ for cell in merged_item.get('table_cells', []):
|
|
|
+ cell_bbox = cell.get('bbox', [])
|
|
|
+ if cell_bbox:
|
|
|
+ original_bbox = BBoxExtractor.inverse_rotate_box_coordinates(cell_bbox, rotation_angle, orig_image_size)
|
|
|
+ cell['bbox'] = original_bbox
|
|
|
+ merged_item['bbox'] = item['bbox'] # 保持表格的原始bbox不变
|
|
|
+
|
|
|
merged_data.append(merged_item)
|
|
|
|
|
|
- elif item_type in ['text', 'title']:
|
|
|
+ elif item_type in ['text', 'title', 'header', 'footer']:
|
|
|
merged_item, paddle_pointer, last_matched_index = self._process_text(
|
|
|
item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
)
|
|
|
@@ -87,57 +115,44 @@ class DataProcessor:
|
|
|
return merged_data
|
|
|
|
|
|
def process_dotsocr_data(self, dotsocr_data: List[Dict],
|
|
|
- paddle_text_boxes: List[Dict]) -> List[Dict]:
|
|
|
+ paddle_text_boxes: List[Dict],
|
|
|
+ rotation_angle: float,
|
|
|
+ orig_image_size: Tuple[int, int]) -> List[Dict]:
|
|
|
"""
|
|
|
- 🎯 处理 DotsOCR 数据,转换为 MinerU 格式并添加 bbox 信息
|
|
|
+ 处理 DotsOCR 数据(简化版:转换后复用 MinerU 处理逻辑)
|
|
|
|
|
|
Args:
|
|
|
- dotsocr_data: DotsOCR 数据
|
|
|
- paddle_text_boxes: PaddleOCR 文字框列表
|
|
|
+ dotsocr_data: DotsOCR 输出数据
|
|
|
+ paddle_text_boxes: PaddleOCR 文本框
|
|
|
+ rotation_angle: 旋转角度
|
|
|
+ orig_image_size: 原始图片尺寸
|
|
|
|
|
|
Returns:
|
|
|
- MinerU 格式的合并数据
|
|
|
+ 统一的 MinerU 格式数据(带 table_cells bbox)
|
|
|
"""
|
|
|
- merged_data = []
|
|
|
- paddle_pointer = 0
|
|
|
- last_matched_index = 0
|
|
|
-
|
|
|
- # 按 bbox 排序
|
|
|
- dotsocr_data.sort(
|
|
|
- key=lambda x: (x['bbox'][1], x['bbox'][0])
|
|
|
- if 'bbox' in x else (float('inf'), float('inf'))
|
|
|
- )
|
|
|
+ print(f"📊 处理 DotsOCR 数据: {len(dotsocr_data)} 个块")
|
|
|
|
|
|
+ # 🎯 第一步:转换为 MinerU 格式
|
|
|
+ mineru_format_data = []
|
|
|
for item in dotsocr_data:
|
|
|
- # 🎯 转换为 MinerU 格式
|
|
|
- mineru_item = self._convert_dotsocr_to_mineru(item)
|
|
|
- category = mineru_item.get('type', '')
|
|
|
-
|
|
|
- # 🎯 根据类型处理
|
|
|
- if category.lower() == 'table':
|
|
|
- merged_item, paddle_pointer = self._process_table(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- elif category.lower() in ['text', 'title', 'header', 'footer']:
|
|
|
- merged_item, paddle_pointer, last_matched_index = self._process_text(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- elif category.lower() == 'list':
|
|
|
- merged_item, paddle_pointer, last_matched_index = self._process_list(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- else:
|
|
|
- # Page-header, Page-footer, Picture 等
|
|
|
- merged_data.append(mineru_item)
|
|
|
-
|
|
|
- return merged_data
|
|
|
-
|
|
|
+ try:
|
|
|
+ converted_item = self._convert_dotsocr_to_mineru(item)
|
|
|
+ if converted_item:
|
|
|
+ mineru_format_data.append(converted_item)
|
|
|
+ except Exception as e:
|
|
|
+ print(f"⚠️ DotsOCR 转换失败: {e}")
|
|
|
+ continue
|
|
|
+
|
|
|
+ print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")
|
|
|
+
|
|
|
+ # 🎯 第二步:复用 MinerU 处理逻辑
|
|
|
+ return self.process_mineru_data(
|
|
|
+ mineru_data=mineru_format_data,
|
|
|
+ paddle_text_boxes=paddle_text_boxes,
|
|
|
+ rotation_angle=rotation_angle,
|
|
|
+ orig_image_size=orig_image_size
|
|
|
+ )
|
|
|
+
|
|
|
def _convert_dotsocr_to_mineru(self, dotsocr_item: Dict) -> Dict:
|
|
|
"""
|
|
|
🎯 将 DotsOCR 格式转换为 MinerU 格式
|
|
|
@@ -199,7 +214,7 @@ class DataProcessor:
|
|
|
return mineru_item
|
|
|
|
|
|
def process_paddleocr_vl_data(self, paddleocr_vl_data: Dict,
|
|
|
- paddle_text_boxes: List[Dict]) -> List[Dict]:
|
|
|
+ paddle_text_boxes: List[Dict], rotation_angle: float, orig_image_size: Tuple[int, int]) -> List[Dict]:
|
|
|
"""
|
|
|
处理 PaddleOCR_VL 数据,添加 bbox 信息
|
|
|
|
|
|
@@ -216,12 +231,12 @@ class DataProcessor:
|
|
|
|
|
|
# 🎯 获取旋转角度和原始图像尺寸
|
|
|
rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
|
|
|
- orig_image_size = None
|
|
|
+ vl_orig_image_size = None
|
|
|
|
|
|
if rotation_angle != 0:
|
|
|
- orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
|
|
|
+ vl_orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
|
|
|
print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
|
|
|
- print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
|
|
|
+ print(f"📐 原始图像尺寸: {vl_orig_image_size[0]} x {vl_orig_image_size[1]}")
|
|
|
|
|
|
# 提取 parsing_res_list
|
|
|
parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
|
|
|
@@ -231,40 +246,26 @@ class DataProcessor:
|
|
|
key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
|
|
|
if 'block_bbox' in x else (float('inf'), float('inf'))
|
|
|
)
|
|
|
-
|
|
|
+ mineru_format_data = []
|
|
|
+
|
|
|
for item in parsing_res_list:
|
|
|
# 🎯 先转换 bbox 坐标(如果需要)
|
|
|
if rotation_angle != 0 and orig_image_size:
|
|
|
item = self._transform_vl_block_bbox(item, rotation_angle, orig_image_size)
|
|
|
-
|
|
|
- # 🎯 统一转换为 MinerU 格式
|
|
|
- mineru_item = self._convert_paddleocr_vl_to_mineru(item)
|
|
|
- item_type = mineru_item.get('type', '')
|
|
|
-
|
|
|
- # 🎯 根据类型处理(复用 MinerU 的通用方法)
|
|
|
- if item_type == 'table':
|
|
|
- merged_item, paddle_pointer = self._process_table(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
|
|
|
- merged_item, paddle_pointer, last_matched_index = self._process_text(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- elif item_type == 'list':
|
|
|
- merged_item, paddle_pointer, last_matched_index = self._process_list(
|
|
|
- mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
- )
|
|
|
- merged_data.append(merged_item)
|
|
|
-
|
|
|
- else:
|
|
|
- # 其他类型(image 等)直接添加
|
|
|
- merged_data.append(mineru_item)
|
|
|
-
|
|
|
- return merged_data
|
|
|
+ converted_item = self._convert_paddleocr_vl_to_mineru(item)
|
|
|
+ if converted_item:
|
|
|
+ mineru_format_data.append(converted_item)
|
|
|
+
|
|
|
+ print(f" ✓ 转换完成: {len(mineru_format_data)} 个块")
|
|
|
+
|
|
|
+ # 🎯 第三步:复用 MinerU 处理逻辑
|
|
|
+ return self.process_mineru_data(
|
|
|
+ mineru_data=mineru_format_data,
|
|
|
+ paddle_text_boxes=paddle_text_boxes,
|
|
|
+ rotation_angle=rotation_angle,
|
|
|
+ orig_image_size=orig_image_size
|
|
|
+ )
|
|
|
+
|
|
|
|
|
|
def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
|
|
|
"""从 PaddleOCR_VL 数据中获取旋转角度"""
|
|
|
@@ -296,24 +297,7 @@ class DataProcessor:
|
|
|
if len(block_bbox) < 4:
|
|
|
return transformed_item
|
|
|
|
|
|
- # block_bbox 格式: [x1, y1, x2, y2]
|
|
|
- # 转换为 poly 格式进行旋转
|
|
|
- poly = [
|
|
|
- [block_bbox[0], block_bbox[1]], # 左上
|
|
|
- [block_bbox[2], block_bbox[1]], # 右上
|
|
|
- [block_bbox[2], block_bbox[3]], # 右下
|
|
|
- [block_bbox[0], block_bbox[3]] # 左下
|
|
|
- ]
|
|
|
-
|
|
|
- # 🎯 使用 BBoxExtractor 的坐标转换方法
|
|
|
- transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
|
|
|
- poly, angle, orig_image_size
|
|
|
- )
|
|
|
-
|
|
|
- # 转换回 bbox 格式
|
|
|
- xs = [p[0] for p in transformed_poly]
|
|
|
- ys = [p[1] for p in transformed_poly]
|
|
|
- transformed_bbox = [min(xs), min(ys), max(xs), max(ys)]
|
|
|
+ transformed_bbox = BBoxExtractor.inverse_rotate_box_coordinates(block_bbox, angle, orig_image_size)
|
|
|
|
|
|
transformed_item['block_bbox'] = transformed_bbox
|
|
|
|