|
|
@@ -81,7 +81,7 @@ class EnhancedDocPipeline:
|
|
|
TABLE_TEXT_CATEGORIES = ['table_caption', 'table_footnote']
|
|
|
|
|
|
# 图片相关元素
|
|
|
- IMAGE_BODY_CATEGORIES = ['image', 'image_body', 'figure']
|
|
|
+ IMAGE_BODY_CATEGORIES = ['image', 'image_body', 'figure', 'chart']
|
|
|
IMAGE_TEXT_CATEGORIES = ['image_caption', 'image_footnote']
|
|
|
|
|
|
# 公式类元素
|
|
|
@@ -198,6 +198,18 @@ class EnhancedDocPipeline:
|
|
|
self.layout_detector.set_ocr_recognizer(self.ocr_recognizer)
|
|
|
logger.info("✅ OCR recognizer set for smart router")
|
|
|
|
|
|
+ # 4b. 印章 OCR 识别器(可选,基于 MinerU PytorchPaddleOCR lang=seal)
|
|
|
+ self.seal_ocr_recognizer = None
|
|
|
+ seal_recognition_config = self.config.get('seal_recognition', {})
|
|
|
+ if seal_recognition_config.get('enabled', False):
|
|
|
+ try:
|
|
|
+ self.seal_ocr_recognizer = ModelFactory.create_seal_ocr_recognizer(
|
|
|
+ seal_recognition_config
|
|
|
+ )
|
|
|
+ logger.info("✅ Seal OCR recognizer initialized")
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning(f"⚠️ Seal OCR recognizer init failed, will fallback to VLM: {e}")
|
|
|
+
|
|
|
# 5. 表格分类器(可选)
|
|
|
self.table_classifier = None
|
|
|
table_cls_config = self.config.get('table_classification', {})
|
|
|
@@ -249,6 +261,7 @@ class EnhancedDocPipeline:
|
|
|
wired_table_recognizer=getattr(self, 'wired_table_recognizer', None),
|
|
|
table_classifier=getattr(self, 'table_classifier', None),
|
|
|
vl_recognizer_lazy_loader=self._ensure_vl_recognizer, # 🎯 传入懒加载回调
|
|
|
+ seal_ocr_recognizer=getattr(self, 'seal_ocr_recognizer', None), # 🆕 印章 OCR 识别器
|
|
|
)
|
|
|
|
|
|
# ==================== 主处理流程 ====================
|
|
|
@@ -1084,7 +1097,7 @@ class EnhancedDocPipeline:
|
|
|
logger.warning(f"⚠️ Equation processing failed: {e}")
|
|
|
processed_elements.append(ElementProcessors.create_error_element(item, str(e)))
|
|
|
|
|
|
- # 🔧 处理 Seal(印章)元素 - 使用 VLM 识别
|
|
|
+ # 处理 Seal(印章)元素 - 优先 SealOCRRecognizer,回退 VLM
|
|
|
for item in classified_elements['seal']:
|
|
|
try:
|
|
|
element = self.element_processors.process_seal_element(
|
|
|
@@ -1145,6 +1158,8 @@ class EnhancedDocPipeline:
|
|
|
self.vl_recognizer.cleanup()
|
|
|
if hasattr(self, 'ocr_recognizer'):
|
|
|
self.ocr_recognizer.cleanup()
|
|
|
+ if hasattr(self, 'seal_ocr_recognizer') and self.seal_ocr_recognizer is not None:
|
|
|
+ self.seal_ocr_recognizer.cleanup()
|
|
|
logger.info("✅ Pipeline cleanup completed")
|
|
|
except Exception as e:
|
|
|
logger.warning(f"⚠️ Cleanup failed: {e}")
|