1 lună în urmă · cb83d24f8c
--- a/ocr_tools/universal_doc_parser/models/adapters/paddle_layout_detector.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/paddle_layout_detector.py
@@ -38,7 +38,7 @@ class PaddleLayoutDetector(BaseLayoutDetector):
 
				         13: 'header',            # header -> header (TEXT_CATEGORIES)
			
 
				         14: 'algorithm',         # algorithm -> algorithm (CODE_CATEGORIES)
			
 
				         15: 'footer',            # footer -> footer (TEXT_CATEGORIES)
			
 
				-        16: 'abandon'            # seal -> abandon (DISCARD_CATEGORIES)
			
 
				+        16: 'seal'               # seal -> seal (SEAL_CATEGORIES)
			
 
				     }
			
 
				     
			
 
				     ORIGINAL_CATEGORY_NAMES = {
			
--- a/ocr_tools/universal_doc_parser/models/adapters/pp_doclayout_v3_layout_adapter.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/pp_doclayout_v3_layout_adapter.py
@@ -61,23 +61,23 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
 
				 
			
 
				     CATEGORY_MAP = {
			
 
				         "abstract": "text",
			
 
				-        "algorithm": "text",
			
 
				-        "aside_text": "text",
			
 
				-        "chart": "image_body",
			
 
				+        "algorithm": "code",
			
 
				+        "aside_text": "aside_text",
			
 
				+        "chart": "chart",
			
 
				         "content": "text",
			
 
				         "formula": "interline_equation",
			
 
				         "doc_title": "title",
			
 
				         "figure_title": "image_caption",
			
 
				         "footer": "footer",
			
 
				         "footnote": "page_footnote",
			
 
				-        "formula_number": "interline_equation",
			
 
				+        "formula_number": "interline_equation_number",
			
 
				         "header": "header",
			
 
				         "image": "image_body",
			
 
				-        "number": "text",
			
 
				+        "number": "page_number",
			
 
				         "paragraph_title": "title",
			
 
				-        "reference": "text",
			
 
				-        "reference_content": "text",
			
 
				-        "seal": "seal",  # 🔧 修改：保留 seal 作为独立类别，用于 VLM 识别
			
 
				+        "reference": "ref_text",
			
 
				+        "reference_content": "ref_text",
			
 
				+        "seal": "seal",
			
 
				         "table": "table_body",
			
 
				         "text": "text",
			
 
				         "vision_footnote": "page_footnote",
			
@@ -187,6 +187,15 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
 
				         self.image_processor = None
			
 
				         self._model_path = None
			
 
				 
			
 
				+    def _numpy_to_pil_rgb(self, image: np.ndarray) -> Image.Image:
			
 
				+        """Convert a numpy image to a PIL image in RGB mode.
			
 
				+
			
 
				+        Pipeline / PyMuPDF renders are already RGB; do not apply cv2.COLOR_BGR2RGB by mistake (it causes red seals etc. to be missed).
			
 
				+        """
			
 
				+        if len(image.shape) == 3 and image.shape[2] == 3:  # NOTE(review): both branches return the same expression, no channel swap
			
 
				+            return Image.fromarray(image).convert("RGB")
			
 
				+        return Image.fromarray(image).convert("RGB")
			
 
				+
			
 
				     def _detect_raw(
			
 
				         self,
			
 
				         image: Union[np.ndarray, Image.Image],
			
@@ -200,11 +209,7 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
 
				         assert self.image_processor is not None
			
 
				 
			
 
				         if isinstance(image, np.ndarray):
			
 
				-            if len(image.shape) == 3 and image.shape[2] == 3:
			
 
				-                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
			
 
				-            else:
			
 
				-                image_rgb = image
			
 
				-            pil_image = Image.fromarray(image_rgb).convert("RGB")
			
 
				+            pil_image = self._numpy_to_pil_rgb(image)
			
 
				             orig_h, orig_w = image.shape[:2]
			
 
				         else:
			
 
				             pil_image = image.convert("RGB")
			
@@ -279,11 +284,7 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
 
				         orig_sizes = []
			
 
				         for image in images:
			
 
				             if isinstance(image, np.ndarray):
			
 
				-                if len(image.shape) == 3 and image.shape[2] == 3:
			
 
				-                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
			
 
				-                else:
			
 
				-                    image_rgb = image
			
 
				-                pil_images.append(Image.fromarray(image_rgb).convert("RGB"))
			
 
				+                pil_images.append(self._numpy_to_pil_rgb(image))
			
 
				                 orig_sizes.append((image.shape[1], image.shape[0]))
			
 
				             else:
			
 
				                 pil_images.append(image.convert("RGB"))
			
--- a/ocr_tools/universal_doc_parser/models/adapters/seal_ocr_adapter.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/seal_ocr_adapter.py
@@ -0,0 +1,175 @@
 
				+"""Seal OCR recognition adapter that wraps MinerU's PytorchPaddleOCR(lang="seal")."""
			
 
				+
			
 
				+from typing import Dict, Any, List, Union
			
 
				+import numpy as np
			
 
				+import cv2
			
 
				+from PIL import Image
			
 
				+from loguru import logger
			
 
				+
			
 
				+from .base import BaseOCRRecognizer
			
 
				+
			
 
				+try:
			
 
				+    from mineru.backend.pipeline.model_init import AtomModelSingleton
			
 
				+    from mineru.backend.pipeline.model_list import AtomicModel
			
 
				+    MINERU_AVAILABLE = True
			
 
				+except ImportError as e:
			
 
				+    logger.warning(f"MinerU components not available for seal OCR: {e}")
			
 
				+    MINERU_AVAILABLE = False
			
 
				+
			
 
				+
			
 
				+class SealOCRRecognizer(BaseOCRRecognizer):
			
 
				+    """Seal OCR recognition adapter that reuses MinerU's seal-specific OCR model.
			
 
				+
			
 
				+    Uses PytorchPaddleOCR(lang="seal"), a model specifically tuned for seal text:
			
 
				+    - detection model: seal_PP-OCRv4_det_server_infer.pth
			
 
				+    - recognition model: ch_PP-OCRv4_rec_server_infer.pth
			
 
				+    - polygon bounding boxes with low thresholds (db_thresh=0.2, box_thresh=0.6)
			
 
				+    - detection boxes are not merged (enable_merge_det_boxes=False)
			
 
				+    - drop_score=0 so that low-confidence results are kept
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, config: Dict[str, Any]):
			
 
				+        super().__init__(config)
			
 
				+        if not MINERU_AVAILABLE:
			
 
				+            raise ImportError("MinerU components not available")
			
 
				+        self.atom_model_manager = AtomModelSingleton()
			
 
				+        self.seal_model = None
			
 
				+
			
 
				+    def initialize(self):
			
 
				+        """Load the seal OCR model (AtomicModel.OCR, lang="seal") from the atom model manager; log and re-raise on failure."""
			
 
				+        try:
			
 
				+            self.seal_model = self.atom_model_manager.get_atom_model(
			
 
				+                atom_model_name=AtomicModel.OCR,
			
 
				+                lang="seal",
			
 
				+            )
			
 
				+            logger.info("SealOCRRecognizer initialized with lang=seal")
			
 
				+        except Exception as e:
			
 
				+            logger.error(f"Failed to initialize SealOCRRecognizer: {e}")
			
 
				+            raise
			
 
				+
			
 
				+    def cleanup(self):
			
 
				+        """Release resources by dropping the model reference."""
			
 
				+        self.seal_model = None
			
 
				+
			
 
				+    def recognize(self, image: Union[np.ndarray, Image.Image]) -> Dict[str, Any]:
			
 
				+        """Recognize the text in a seal image.
			
 
				+
			
 
				+        Kept consistent with the seal OCR logic in MinerU's batch_analyze.py:
			
 
				+        1. Convert the RGB image to BGR
			
 
				+        2. Call seal_ocr_model.ocr(bgr_img, det=True, rec=True)
			
 
				+        3. Extract the list of recognized texts
			
 
				+
			
 
				+        Args:
			
 
				+            image: Cropped seal image (numpy array or PIL Image); the code treats it as RGB
			
 
				+
			
 
				+        Returns:
			
 
				+            {
			
 
				+                'text': str,              # merged text (joined with spaces)
			
 
				+                'texts': List[str],       # text recognized for each text box
			
 
				+                'confidence': float,      # average confidence
			
 
				+                'details': List[Dict]     # per-box details (poly, text, confidence)
			
 
				+            }
			
 
				+        """
			
 
				+        if self.seal_model is None:
			
 
				+            raise RuntimeError("Seal OCR model not initialized")
			
 
				+
			
 
				+        # Convert to BGR format
			
 
				+        if isinstance(image, Image.Image):
			
 
				+            img_rgb = np.array(image)
			
 
				+        else:
			
 
				+            img_rgb = image
			
 
				+
			
 
				+        if img_rgb.size == 0:
			
 
				+            return {'text': '', 'texts': [], 'confidence': 0.0, 'details': []}
			
 
				+
			
 
				+        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)  # NOTE(review): outside the try block, so a conversion error propagates to the caller
			
 
				+
			
 
				+        try:
			
 
				+            seal_ocr_res = self.seal_model.ocr(img_bgr, det=True, rec=True)
			
 
				+            if not seal_ocr_res or not seal_ocr_res[0]:
			
 
				+                return {'text': '', 'texts': [], 'confidence': 0.0, 'details': []}
			
 
				+
			
 
				+            seal_texts: List[str] = []
			
 
				+            details: List[Dict[str, Any]] = []
			
 
				+            confidences: List[float] = []
			
 
				+
			
 
				+            for seal_item in seal_ocr_res[0]:
			
 
				+                if not seal_item or len(seal_item) != 2:
			
 
				+                    continue
			
 
				+                poly = seal_item[0]  # polygon coordinates
			
 
				+                rec_result = seal_item[1]
			
 
				+                if not rec_result or len(rec_result) < 1:
			
 
				+                    continue
			
 
				+                rec_text = rec_result[0]
			
 
				+                rec_conf = rec_result[1] if len(rec_result) >= 2 else 0.0
			
 
				+                if rec_text:
			
 
				+                    seal_texts.append(rec_text)
			
 
				+                    confidences.append(rec_conf)
			
 
				+                    details.append({
			
 
				+                        'poly': poly,
			
 
				+                        'text': rec_text,
			
 
				+                        'confidence': rec_conf,
			
 
				+                    })
			
 
				+
			
 
				+            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
			
 
				+            combined_text = " ".join(seal_texts)
			
 
				+
			
 
				+            logger.debug(
			
 
				+                f"Seal OCR: '{combined_text[:50]}...' (avg conf: {avg_confidence:.3f})"
			
 
				+            )
			
 
				+
			
 
				+            return {
			
 
				+                'text': combined_text,
			
 
				+                'texts': seal_texts,
			
 
				+                'confidence': avg_confidence,
			
 
				+                'details': details,
			
 
				+            }
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"Seal OCR recognition failed: {e}")
			
 
				+            return {'text': '', 'texts': [], 'confidence': 0.0, 'details': []}
			
 
				+
			
 
				+    def recognize_text(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
			
 
				+        """Implement the BaseOCRRecognizer interface: convert the recognize() result into a list of poly/text/confidence dicts."""
			
 
				+        result = self.recognize(image)
			
 
				+        formatted: List[Dict[str, Any]] = []
			
 
				+        for detail in result.get('details', []):
			
 
				+            poly = detail.get('poly')
			
 
				+            if not poly:
			
 
				+                continue
			
 
				+            formatted.append({
			
 
				+                'poly': poly,
			
 
				+                'text': detail.get('text', ''),
			
 
				+                'confidence': detail.get('confidence', 0.0),
			
 
				+            })
			
 
				+        return formatted
			
 
				+
			
 
				+    def detect_text_boxes(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
			
 
				+        """Detect seal text boxes only (no text recognition)."""
			
 
				+        if self.seal_model is None:
			
 
				+            raise RuntimeError("Seal OCR model not initialized")
			
 
				+
			
 
				+        if isinstance(image, Image.Image):
			
 
				+            img_rgb = np.array(image)
			
 
				+        else:
			
 
				+            img_rgb = image
			
 
				+
			
 
				+        if img_rgb.size == 0:
			
 
				+            return []
			
 
				+
			
 
				+        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
			
 
				+
			
 
				+        try:
			
 
				+            ocr_results = self.seal_model.ocr(img_bgr, det=True, rec=False)
			
 
				+            formatted: List[Dict[str, Any]] = []
			
 
				+            if ocr_results and ocr_results[0]:
			
 
				+                for poly in ocr_results[0]:
			
 
				+                    if poly and len(poly) >= 4:
			
 
				+                        formatted.append({
			
 
				+                            'poly': poly,
			
 
				+                            'confidence': 1.0,
			
 
				+                        })
			
 
				+            return formatted
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"Seal text box detection failed: {e}")
			
 
				+            return []