浏览代码

feat(增强文档处理管道): 在EnhancedDocPipeline类中添加印章OCR识别器的初始化与清理逻辑,更新图片相关元素类别以支持图表,优化印章元素处理流程,提升印章识别的准确性与灵活性。

zhch158_admin 1 月之前
父节点
当前提交
797bad05df
共有 1 个文件被更改,包括 17 次插入和 2 次删除
  1. 17 2
      ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

+ 17 - 2
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -81,7 +81,7 @@ class EnhancedDocPipeline:
     TABLE_TEXT_CATEGORIES = ['table_caption', 'table_footnote']
     
     # 图片相关元素
-    IMAGE_BODY_CATEGORIES = ['image', 'image_body', 'figure']
+    IMAGE_BODY_CATEGORIES = ['image', 'image_body', 'figure', 'chart']
     IMAGE_TEXT_CATEGORIES = ['image_caption', 'image_footnote']
     
     # 公式类元素
@@ -198,6 +198,18 @@ class EnhancedDocPipeline:
                 self.layout_detector.set_ocr_recognizer(self.ocr_recognizer)
                 logger.info("✅ OCR recognizer set for smart router")
 
+            # 4b. 印章 OCR 识别器(可选,基于 MinerU PytorchPaddleOCR lang=seal)
+            self.seal_ocr_recognizer = None
+            seal_recognition_config = self.config.get('seal_recognition', {})
+            if seal_recognition_config.get('enabled', False):
+                try:
+                    self.seal_ocr_recognizer = ModelFactory.create_seal_ocr_recognizer(
+                        seal_recognition_config
+                    )
+                    logger.info("✅ Seal OCR recognizer initialized")
+                except Exception as e:
+                    logger.warning(f"⚠️ Seal OCR recognizer init failed, will fallback to VLM: {e}")
+
             # 5. 表格分类器(可选)
             self.table_classifier = None
             table_cls_config = self.config.get('table_classification', {})
@@ -249,6 +261,7 @@ class EnhancedDocPipeline:
             wired_table_recognizer=getattr(self, 'wired_table_recognizer', None),
             table_classifier=getattr(self, 'table_classifier', None),
             vl_recognizer_lazy_loader=self._ensure_vl_recognizer,  # 🎯 传入懒加载回调
+            seal_ocr_recognizer=getattr(self, 'seal_ocr_recognizer', None),  # 🆕 印章 OCR 识别器
         )
     
     # ==================== 主处理流程 ====================
@@ -1084,7 +1097,7 @@ class EnhancedDocPipeline:
                 logger.warning(f"⚠️ Equation processing failed: {e}")
                 processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
         
-        # 🔧 处理 Seal(印章)元素 - 使用 VLM 识别
+        # 处理 Seal(印章)元素 - 优先 SealOCRRecognizer,回退 VLM
         for item in classified_elements['seal']:
             try:
                 element = self.element_processors.process_seal_element(
@@ -1145,6 +1158,8 @@ class EnhancedDocPipeline:
                 self.vl_recognizer.cleanup()
             if hasattr(self, 'ocr_recognizer'):
                 self.ocr_recognizer.cleanup()
+            if hasattr(self, 'seal_ocr_recognizer') and self.seal_ocr_recognizer is not None:
+                self.seal_ocr_recognizer.cleanup()
             logger.info("✅ Pipeline cleanup completed")
         except Exception as e:
             logger.warning(f"⚠️ Cleanup failed: {e}")