浏览代码

feat(element_processors): 添加印章元素处理功能,支持 VLM 识别

zhch158_admin 1 周之前
父节点
当前提交
e126aaed5a

+ 51 - 0
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -655,6 +655,57 @@ class ElementProcessors:
             'content': content
         }
     
+    def process_seal_element(
+        self,
+        image: np.ndarray,
+        layout_item: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Process a seal (stamp) element by recognizing its text with the VLM.
+
+        Best-effort: if the VL recognizer is unavailable or raises, the element
+        is still returned with empty text and 0.0 content confidence.
+
+        Args:
+            image: Page image the layout item was detected on.
+            layout_item: Layout detection item; 'bbox', 'category' and
+                'confidence' are read, each with a default.
+
+        Returns:
+            Dict with 'type', 'bbox', 'confidence' (taken from layout_item)
+            and 'content' ({'text', 'confidence'}), whatever the outcome.
+        """
+        bbox = layout_item.get('bbox', [0, 0, 0, 0])
+        category = layout_item.get('category', 'seal')
+        cropped_region = CoordinateUtils.crop_region(image, bbox)
+
+        content = {'text': '', 'confidence': 0.0}
+        try:
+            # Lazily load the VL recognizer.
+            vl_recognizer = self._ensure_vl_recognizer()
+            if vl_recognizer is None:
+                logger.error("❌ VL recognizer not available for seal recognition")
+            else:
+                # element_type='seal' is meant to select the adapter's seal prompt.
+                seal_result = vl_recognizer.recognize_text(cropped_region, element_type='seal')
+                content = {
+                    'text': seal_result.get('text') or '',  # None -> ''
+                    'confidence': seal_result.get('confidence', 0.0)
+                }
+                text = content['text']
+                preview = f"{text[:50]}..." if len(text) > 50 else text
+                logger.info(f"🔖 Seal recognized: {preview}")
+        except Exception as e:
+            logger.warning(f"Seal recognition failed: {e}")
+
+        # Single exit so every outcome carries the same keys.
+        return {
+            'type': category,
+            'bbox': bbox,
+            'confidence': layout_item.get('confidence', 0.0),
+            'content': content
+        }
+    
     def process_image_element(
         self,
         image: np.ndarray,

+ 8 - 1
ocr_tools/universal_doc_parser/core/model_factory.py

@@ -39,9 +39,13 @@ class ModelFactory:
     def create_layout_detector(cls, config: Dict[str, Any]) -> BaseLayoutDetector:
         # 根据配置创建检测器
         module_name = config.get('module', 'mineru')
-        if module_name == 'paddle':
+        model_name = config.get('model_name', 'default')
+        if module_name == 'paddle' and model_name == 'RT-DETR-H_layout_17cls':
             from models.adapters import PaddleLayoutDetector
             detector = PaddleLayoutDetector(config)
+        elif module_name == 'paddle' and model_name == 'PP-DocLayoutV3':
+            from models.adapters import PPDocLayoutV3Detector
+            detector = PPDocLayoutV3Detector(config)
         elif module_name == 'docling':
             from models.adapters import DoclingLayoutDetector
             detector = DoclingLayoutDetector(config)
@@ -74,6 +78,9 @@ class ModelFactory:
         elif module_name == 'mineru':
             from models.adapters import MinerUVLRecognizer
             recognizer = MinerUVLRecognizer(config)
+        elif module_name == 'glmocr':
+            from models.adapters import GLMOCRVLRecognizer
+            recognizer = GLMOCRVLRecognizer(config)
         else:
             raise ValueError(f"Unknown VL recognizer module: {module_name}")
             

+ 18 - 0
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -88,6 +88,9 @@ class EnhancedDocPipeline:
         'interline_equation_yolo', 'interline_equation_number'
     ]
     
+    # Seal(印章)类元素 - 需要 VLM 识别
+    SEAL_CATEGORIES = ['seal']
+    
     # 丢弃类元素(水印、装饰等)
     DISCARD_CATEGORIES = ['abandon', 'discarded']
     
@@ -750,6 +753,7 @@ class EnhancedDocPipeline:
             'image_body': [],
             'image_text': [],
             'equation': [],
+            'seal': [],  # 🔧 添加 seal 类别
             'code': [],
             'discard': []
         }
@@ -769,6 +773,8 @@ class EnhancedDocPipeline:
                 classified['image_text'].append(item)
             elif category in self.EQUATION_CATEGORIES:
                 classified['equation'].append(item)
+            elif category in self.SEAL_CATEGORIES:
+                classified['seal'].append(item)
             elif category in self.CODE_CATEGORIES:
                 classified['code'].append(item)
             elif category in self.DISCARD_CATEGORIES:
@@ -784,6 +790,7 @@ class EnhancedDocPipeline:
                    f"image={len(classified['image_body'])}, "
                    f"image_text={len(classified['image_text'])}, "
                    f"equation={len(classified['equation'])}, "
+                   f"seal={len(classified['seal'])}, "
                    f"code={len(classified['code'])}, "
                    f"discard={len(classified['discard'])}")
         
@@ -952,6 +959,17 @@ class EnhancedDocPipeline:
                 logger.warning(f"⚠️ Equation processing failed: {e}")
                 processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
         
+        # 🔧 处理 Seal(印章)元素 - 使用 VLM 识别
+        for item in classified_elements['seal']:
+            try:
+                element = self.element_processors.process_seal_element(
+                    detection_image, item
+                )
+                processed_elements.append(element)
+            except Exception as e:
+                logger.warning(f"⚠️ Seal processing failed: {e}")
+                processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
+        
         # 处理图片主体
         for item in classified_elements['image_body']:
             try: