|
|
@@ -0,0 +1,175 @@
|
|
|
+"""印章 OCR 识别适配器,封装 MinerU 的 PytorchPaddleOCR(lang="seal")"""
|
|
|
+
|
|
|
+from typing import Dict, Any, List, Union
|
|
|
+import numpy as np
|
|
|
+import cv2
|
|
|
+from PIL import Image
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
+from .base import BaseOCRRecognizer
|
|
|
+
|
|
|
# MinerU is an optional dependency. Record whether its model registry could be
# imported so SealOCRRecognizer can refuse to construct when it is missing.
try:
    from mineru.backend.pipeline.model_init import AtomModelSingleton
    from mineru.backend.pipeline.model_list import AtomicModel
except ImportError as import_error:
    logger.warning(f"MinerU components not available for seal OCR: {import_error}")
    MINERU_AVAILABLE = False
else:
    # Both imports succeeded, so the seal OCR model can be requested later.
    MINERU_AVAILABLE = True
|
|
|
+
|
|
|
+
|
|
|
class SealOCRRecognizer(BaseOCRRecognizer):
    """Seal (stamp) OCR adapter that reuses MinerU's seal-specific OCR model.

    The model is fetched from MinerU's ``AtomModelSingleton`` with
    ``atom_model_name=AtomicModel.OCR`` and ``lang="seal"``. According to the
    original author's notes (the configuration lives on the MinerU side and is
    not visible in this file), that model uses:

    - detection model: seal_PP-OCRv4_det_server_infer.pth
    - recognition model: ch_PP-OCRv4_rec_server_infer.pth
    - polygon boxes with low thresholds (db_thresh=0.2, box_thresh=0.6)
    - no merging of detection boxes (enable_merge_det_boxes=False)
    - drop_score=0 so that low-confidence results are kept
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the config and grab MinerU's model manager.

        Args:
            config: Recognizer configuration, forwarded to the base class.

        Raises:
            ImportError: If the MinerU components could not be imported.
        """
        super().__init__(config)
        if not MINERU_AVAILABLE:
            raise ImportError("MinerU components not available")
        self.atom_model_manager = AtomModelSingleton()
        # Loaded lazily by initialize(); None means "not initialized".
        self.seal_model = None

    def initialize(self):
        """Load the seal OCR model; log and re-raise any failure."""
        try:
            self.seal_model = self.atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.OCR,
                lang="seal",
            )
            logger.info("SealOCRRecognizer initialized with lang=seal")
        except Exception as e:
            logger.error(f"Failed to initialize SealOCRRecognizer: {e}")
            raise

    def cleanup(self):
        """Drop this recognizer's reference to the model."""
        self.seal_model = None

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return a fresh "nothing recognized" result dict."""
        return {'text': '', 'texts': [], 'confidence': 0.0, 'details': []}

    @staticmethod
    def _point_count(poly: Any) -> int:
        """Return the number of points in *poly*, or 0 if it has none.

        Uses ``len`` instead of truthiness so a NumPy array polygon does not
        raise "truth value of an array is ambiguous".
        """
        if poly is None:
            return 0
        try:
            return len(poly)
        except TypeError:
            return 0

    @staticmethod
    def _to_bgr(image: Union[np.ndarray, Image.Image]) -> Union[np.ndarray, None]:
        """Convert the input image to a BGR array for the OCR model.

        PIL images in any mode are first normalized to RGB. NumPy arrays are
        treated as grayscale (2-D or single channel), RGBA (4 channels) or
        RGB (anything else, as before).

        Returns:
            The BGR array, or None when the image contains no pixels.
        """
        if isinstance(image, Image.Image):
            if 0 in image.size:
                return None
            # Palette, grayscale, RGBA, 1-bit... all become 3-channel RGB.
            if image.mode != "RGB":
                image = image.convert("RGB")
            pixels = np.array(image)
        else:
            pixels = image

        if pixels.size == 0:
            return None

        if pixels.ndim == 2 or (pixels.ndim == 3 and pixels.shape[2] == 1):
            code = cv2.COLOR_GRAY2BGR
        elif pixels.ndim == 3 and pixels.shape[2] == 4:
            code = cv2.COLOR_RGBA2BGR
        else:
            code = cv2.COLOR_RGB2BGR
        return cv2.cvtColor(pixels, code)

    def recognize(self, image: Union[np.ndarray, Image.Image]) -> Dict[str, Any]:
        """Recognize the text in a cropped seal image.

        Intended to mirror the seal OCR flow in MinerU's batch_analyze.py
        (per the original author's note):
        1. convert the image to BGR
        2. call ``seal_model.ocr(bgr_img, det=True, rec=True)``
        3. collect the recognized texts

        Args:
            image: Cropped seal image, as an RGB NumPy array or a PIL Image.
                Grayscale and RGBA inputs are also accepted.

        Returns:
            A dict with:
            - 'text': all recognized texts joined with a single space
            - 'texts': the recognized text of each box, in model order
            - 'confidence': mean confidence over the kept texts (0.0 if none)
            - 'details': one dict per kept text with 'poly', 'text' and
              'confidence'
            An empty result is returned for an empty image, when nothing is
            detected, or when the model call fails.

        Raises:
            RuntimeError: If initialize() has not been called successfully.
        """
        if self.seal_model is None:
            raise RuntimeError("Seal OCR model not initialized")

        img_bgr = self._to_bgr(image)
        if img_bgr is None:
            return self._empty_result()

        try:
            seal_ocr_res = self.seal_model.ocr(img_bgr, det=True, rec=True)
            if not seal_ocr_res or not seal_ocr_res[0]:
                return self._empty_result()

            seal_texts: List[str] = []
            details: List[Dict[str, Any]] = []
            confidences: List[float] = []

            for seal_item in seal_ocr_res[0]:
                # Each item is expected to be a (polygon, recognition) pair.
                if not seal_item or len(seal_item) != 2:
                    continue
                poly, rec_result = seal_item
                if not rec_result or len(rec_result) < 1:
                    continue
                rec_text = rec_result[0]
                rec_conf = rec_result[1] if len(rec_result) >= 2 else 0.0
                if not rec_text:
                    continue
                seal_texts.append(rec_text)
                confidences.append(rec_conf)
                details.append({
                    'poly': poly,
                    'text': rec_text,
                    'confidence': rec_conf,
                })

            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
            combined_text = " ".join(seal_texts)

            logger.debug(
                f"Seal OCR: '{combined_text[:50]}...' (avg conf: {avg_confidence:.3f})"
            )

            return {
                'text': combined_text,
                'texts': seal_texts,
                'confidence': avg_confidence,
                'details': details,
            }

        except Exception as e:
            # Deliberate best effort: a failing seal crop must not abort the
            # caller, so log and report "nothing recognized".
            logger.warning(f"Seal OCR recognition failed: {e}")
            return self._empty_result()

    def recognize_text(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
        """Implement the BaseOCRRecognizer interface on top of recognize().

        Returns:
            One dict per recognized text with 'poly', 'text' and
            'confidence'; entries without a polygon are dropped.
        """
        result = self.recognize(image)
        formatted: List[Dict[str, Any]] = []
        for detail in result.get('details', []):
            poly = detail.get('poly')
            if self._point_count(poly) == 0:
                continue
            formatted.append({
                'poly': poly,
                'text': detail.get('text', ''),
                'confidence': detail.get('confidence', 0.0),
            })
        return formatted

    def detect_text_boxes(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
        """Detect seal text boxes only, without recognizing their text.

        Args:
            image: Seal image, as an RGB NumPy array or a PIL Image.
                Grayscale and RGBA inputs are also accepted.

        Returns:
            One dict per polygon with at least four points, holding 'poly'
            and 'confidence'. The confidence is a fixed 1.0 placeholder; this
            method does not read a score from the detector output. An empty
            list is returned for an empty image or when detection fails.

        Raises:
            RuntimeError: If initialize() has not been called successfully.
        """
        if self.seal_model is None:
            raise RuntimeError("Seal OCR model not initialized")

        img_bgr = self._to_bgr(image)
        if img_bgr is None:
            return []

        try:
            ocr_results = self.seal_model.ocr(img_bgr, det=True, rec=False)
            if not ocr_results or not ocr_results[0]:
                return []
            return [
                {'poly': poly, 'confidence': 1.0}
                for poly in ocr_results[0]
                if self._point_count(poly) >= 4
            ]
        except Exception as e:
            # Best effort, same policy as recognize().
            logger.warning(f"Seal text box detection failed: {e}")
            return []