Ver código fonte

feat: 添加对 PaddleOCR_VL 数据的旋转角度和原始图像尺寸处理,优化 bbox 坐标转换

zhch158_admin 1 semana atrás
pai
commit
7930c6cd71
1 arquivo alterado com 97 adições e 52 exclusões
  1. 97 52
      merger/data_processor.py

+ 97 - 52
merger/data_processor.py

@@ -7,8 +7,10 @@ from bs4 import BeautifulSoup
 
 try:
     from .text_matcher import TextMatcher
+    from .bbox_extractor import BBoxExtractor
 except ImportError:
     from text_matcher import TextMatcher
+    from bbox_extractor import BBoxExtractor
 
 
 class DataProcessor:
@@ -212,28 +214,41 @@ class DataProcessor:
                                   paddle_text_boxes: List[Dict]) -> List[Dict]:
         """
         处理 PaddleOCR_VL 数据,添加 bbox 信息
-    
+        
         Args:
             paddleocr_vl_data: PaddleOCR_VL 数据 (JSON 对象)
             paddle_text_boxes: PaddleOCR 文字框列表
-    
+        
         Returns:
-            MinerU 格式的合并数据(统一输出格式)
+            🎯 MinerU 格式的合并数据(统一输出格式)
         """
         merged_data = []
         paddle_pointer = 0
         last_matched_index = 0
-    
+        
+        # 🎯 获取旋转角度和原始图像尺寸
+        rotation_angle = self._get_rotation_angle_from_vl(paddleocr_vl_data)
+        orig_image_size = None
+        
+        if rotation_angle != 0:
+            orig_image_size = self._get_original_image_size_from_vl(paddleocr_vl_data)
+            print(f"🔄 PaddleOCR_VL 检测到旋转角度: {rotation_angle}°")
+            print(f"📐 原始图像尺寸: {orig_image_size[0]} x {orig_image_size[1]}")
+        
         # 提取 parsing_res_list
         parsing_res_list = paddleocr_vl_data.get('parsing_res_list', [])
-    
+        
         # 按 bbox 排序
         parsing_res_list.sort(
             key=lambda x: (x['block_bbox'][1], x['block_bbox'][0])
             if 'block_bbox' in x else (float('inf'), float('inf'))
         )
-    
+        
         for item in parsing_res_list:
+            # 🎯 先转换 bbox 坐标(如果需要)
+            if rotation_angle != 0 and orig_image_size:
+                item = self._transform_vl_block_bbox(item, rotation_angle, orig_image_size)
+            
             # 🎯 统一转换为 MinerU 格式
             mineru_item = self._convert_paddleocr_vl_to_mineru(item)
             item_type = mineru_item.get('type', '')
@@ -244,56 +259,94 @@ class DataProcessor:
                     mineru_item, paddle_text_boxes, paddle_pointer
                 )
                 merged_data.append(merged_item)
-        
+            
             elif item_type in ['text', 'title', 'header', 'footer', 'equation']:
                 merged_item, paddle_pointer, last_matched_index = self._process_text(
                     mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                 )
                 merged_data.append(merged_item)
-        
+            
             elif item_type == 'list':
                 merged_item, paddle_pointer, last_matched_index = self._process_list(
                     mineru_item, paddle_text_boxes, paddle_pointer, last_matched_index
                 )
                 merged_data.append(merged_item)
-        
+            
             else:
-                # 其他类型(image, equation 等)直接添加
+                # 其他类型(image 等)直接添加
                 merged_data.append(mineru_item)
-    
+        
         return merged_data
     
+    def _get_rotation_angle_from_vl(self, paddleocr_vl_data: Dict) -> float:
+        """Return the rotation angle for PaddleOCR_VL data via BBoxExtractor._get_rotation_angle."""
+        return BBoxExtractor._get_rotation_angle(paddleocr_vl_data)
+    
+    def _get_original_image_size_from_vl(self, paddleocr_vl_data: Dict) -> tuple:
+        """Return the original image size for PaddleOCR_VL data via BBoxExtractor._get_original_image_size."""
+        return BBoxExtractor._get_original_image_size(paddleocr_vl_data)
+    
+    def _transform_vl_block_bbox(self, item: Dict, angle: float, 
+                                 orig_image_size: tuple) -> Dict:
+        """
+        Return a copy of a PaddleOCR_VL block with its block_bbox coordinates transformed.
+        
+        Args:
+            item: PaddleOCR_VL block dict; copied with item.copy(), the input dict is not reassigned.
+            angle: rotation angle, forwarded to BBoxExtractor._inverse_rotate_coordinates.
+            orig_image_size: original image size, forwarded to the same helper.
+        
+        Returns:
+            The copy; unchanged when block_bbox is missing or has fewer than 4 values.
+        """
+        transformed_item = item.copy()
+        
+        if 'block_bbox' not in item:
+            return transformed_item
+        
+        block_bbox = item['block_bbox']
+        if len(block_bbox) < 4:
+            return transformed_item
+        
+        # block_bbox layout: [x1, y1, x2, y2]
+        # Expand it to a four-corner polygon so the helper can transform each corner
+        poly = [
+            [block_bbox[0], block_bbox[1]],  # top-left
+            [block_bbox[2], block_bbox[1]],  # top-right
+            [block_bbox[2], block_bbox[3]],  # bottom-right
+            [block_bbox[0], block_bbox[3]]   # bottom-left
+        ]
+        
+        # Delegate the coordinate transform to BBoxExtractor (helper semantics not visible here)
+        transformed_poly = BBoxExtractor._inverse_rotate_coordinates(
+            poly, angle, orig_image_size
+        )
+        
+        # Collapse the transformed corners back to an axis-aligned [min_x, min_y, max_x, max_y] box
+        xs = [p[0] for p in transformed_poly]
+        ys = [p[1] for p in transformed_poly]
+        transformed_bbox = [min(xs), min(ys), max(xs), max(ys)]
+        
+        transformed_item['block_bbox'] = transformed_bbox
+        
+        return transformed_item
+    
     def _convert_paddleocr_vl_to_mineru(self, paddleocr_vl_item: Dict) -> Dict:
         """
         🎯 将 PaddleOCR_VL 格式转换为 MinerU 格式
         
-        PaddleOCR_VL (PP-DocLayout_plus-L):
-        {
-            "block_label": "paragraph_title",  # 或 "doc_title", "text" 等
-            "block_bbox": [172, 151, 547, 184],
-            "block_content": "...",
-            "block_id": 0
-        }
-        
-        MinerU:
-        {
-            "type": "title",
-            "bbox": [172, 151, 547, 184],
-            "text": "...",
-            "text_level": 1,
-            "page_idx": 0
-        }
+        基于 PP-DocLayout_plus-L 的 20 种类别
         """
         block_label = paddleocr_vl_item.get('block_label', '')
         
-        # 🎯 PP-DocLayout_plus-L 类别映射
+        # 🎯 PP-DocLayout_plus-L 类别映射(共 20 种)
         label_map = {
-            # 标题类
-            'paragraph_title': 'title',      # 段落标题 → title (level 2)
-            'doc_title': 'title',            # 文档标题 → title (level 1)
-            'figure_table_chart_title': 'title',  # 图表标题 → title (level 3)
+            # 标题类(3种)
+            'paragraph_title': 'title',
+            'doc_title': 'title',
+            'figure_table_chart_title': 'title',
             
-            # 文本类
+            # 文本类(9种)
             'text': 'text',
             'number': 'text',
             'content': 'text',
@@ -301,57 +354,49 @@ class DataProcessor:
             'footnote': 'text',
             'aside_text': 'text',
             'algorithm': 'text',
-            
-            # 参考文献
             'reference': 'text',
             'reference_content': 'text',
             
-            # 页眉页脚
+            # 页眉页脚(2种)
             'header': 'header',
             'footer': 'footer',
             
-            # 表格
+            # 表格(1种)
             'table': 'table',
             
-            # 图片
+            # 图片/图表(3种)
             'image': 'image',
             'chart': 'image',
+            'seal': 'image',
             
-            # 公式
+            # 公式(2种)
             'formula': 'equation',
-            'formula_number': 'equation',
-            
-            # 印章
-            'seal': 'image'
+            'formula_number': 'equation'
         }
         
         mineru_type = label_map.get(block_label, 'text')
         
-        # 🎯 基础转换
         mineru_item = {
             'type': mineru_type,
             'bbox': paddleocr_vl_item.get('block_bbox', []),
             'page_idx': 0
         }
         
-        # 🎯 处理文本内容
         content = paddleocr_vl_item.get('block_content', '')
         
         if mineru_type == 'table':
-            # 表格:block_content -> table_body
             mineru_item['table_body'] = content
         else:
-            # 其他类型:block_content -> text
             mineru_item['text'] = content
             
-            # 🎯 处理标题级别(基于实际的类别)
+            # 标题级别
             if block_label == 'doc_title':
-                mineru_item['text_level'] = 1  # 文档标题 - 一级
+                mineru_item['text_level'] = 1
             elif block_label == 'paragraph_title':
-                mineru_item['text_level'] = 2  # 段落标题 - 二级
+                mineru_item['text_level'] = 2
             elif block_label == 'figure_table_chart_title':
-                mineru_item['text_level'] = 3  # 图表标题 - 三级
-    
+                mineru_item['text_level'] = 3
+        
         return mineru_item
     
     def _process_table(self, item: Dict, paddle_text_boxes: List[Dict],