Преглед изворни кода

feat(更新布局检测器与新增印章OCR适配器): 修改PaddleLayoutDetector和PPDocLayoutV3Detector类的类别映射,优化印章和图表的处理逻辑,同时新增SealOCRRecognizer适配器以支持印章OCR识别,提升文档解析与印章识别能力。

zhch158_admin пре 1 месец
родитељ
комит
cb83d24f8c

+ 1 - 1
ocr_tools/universal_doc_parser/models/adapters/paddle_layout_detector.py

@@ -38,7 +38,7 @@ class PaddleLayoutDetector(BaseLayoutDetector):
         13: 'header',            # header -> header (TEXT_CATEGORIES)
         14: 'algorithm',         # algorithm -> algorithm (CODE_CATEGORIES)
         15: 'footer',            # footer -> footer (TEXT_CATEGORIES)
-        16: 'abandon'            # seal -> abandon (DISCARD_CATEGORIES)
+        16: 'seal'               # seal -> seal (SEAL_CATEGORIES)
     }
     
     ORIGINAL_CATEGORY_NAMES = {

+ 19 - 18
ocr_tools/universal_doc_parser/models/adapters/pp_doclayout_v3_layout_adapter.py

@@ -61,23 +61,23 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
 
     CATEGORY_MAP = {
         "abstract": "text",
-        "algorithm": "text",
-        "aside_text": "text",
-        "chart": "image_body",
+        "algorithm": "code",
+        "aside_text": "aside_text",
+        "chart": "chart",
         "content": "text",
         "formula": "interline_equation",
         "doc_title": "title",
         "figure_title": "image_caption",
         "footer": "footer",
         "footnote": "page_footnote",
-        "formula_number": "interline_equation",
+        "formula_number": "interline_equation_number",
         "header": "header",
         "image": "image_body",
-        "number": "text",
+        "number": "page_number",
         "paragraph_title": "title",
-        "reference": "text",
-        "reference_content": "text",
-        "seal": "seal",  # 🔧 修改:保留 seal 作为独立类别,用于 VLM 识别
+        "reference": "ref_text",
+        "reference_content": "ref_text",
+        "seal": "seal",
         "table": "table_body",
         "text": "text",
         "vision_footnote": "page_footnote",
@@ -187,6 +187,15 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
         self.image_processor = None
         self._model_path = None
 
+    def _numpy_to_pil_rgb(self, image: np.ndarray) -> Image.Image:
+        """Convert a numpy image to a PIL RGB image without swapping channels.
+
+        Pipeline / PyMuPDF renders are already RGB, so cv2.COLOR_BGR2RGB must
+        not be applied here (doing so causes red seals and similar to be missed).
+        """
+        # One conversion covers every array layout, so no shape check is needed.
+        return Image.fromarray(image).convert("RGB")
+
     def _detect_raw(
         self,
         image: Union[np.ndarray, Image.Image],
@@ -200,11 +209,7 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
         assert self.image_processor is not None
 
         if isinstance(image, np.ndarray):
-            if len(image.shape) == 3 and image.shape[2] == 3:
-                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            else:
-                image_rgb = image
-            pil_image = Image.fromarray(image_rgb).convert("RGB")
+            pil_image = self._numpy_to_pil_rgb(image)
             orig_h, orig_w = image.shape[:2]
         else:
             pil_image = image.convert("RGB")
@@ -279,11 +284,7 @@ class PPDocLayoutV3Detector(BaseLayoutDetector):
         orig_sizes = []
         for image in images:
             if isinstance(image, np.ndarray):
-                if len(image.shape) == 3 and image.shape[2] == 3:
-                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-                else:
-                    image_rgb = image
-                pil_images.append(Image.fromarray(image_rgb).convert("RGB"))
+                pil_images.append(self._numpy_to_pil_rgb(image))
                 orig_sizes.append((image.shape[1], image.shape[0]))
             else:
                 pil_images.append(image.convert("RGB"))

+ 175 - 0
ocr_tools/universal_doc_parser/models/adapters/seal_ocr_adapter.py

@@ -0,0 +1,175 @@
+"""印章 OCR 识别适配器,封装 MinerU 的 PytorchPaddleOCR(lang="seal")"""
+
+from typing import Dict, Any, List, Union
+import numpy as np
+import cv2
+from PIL import Image
+from loguru import logger
+
+from .base import BaseOCRRecognizer
+
+try:
+    from mineru.backend.pipeline.model_init import AtomModelSingleton
+    from mineru.backend.pipeline.model_list import AtomicModel
+    MINERU_AVAILABLE = True
+except ImportError as e:
+    logger.warning(f"MinerU components not available for seal OCR: {e}")
+    MINERU_AVAILABLE = False
+
+
+class SealOCRRecognizer(BaseOCRRecognizer):
+    """Seal (stamp) OCR adapter built on MinerU's OCR model with lang="seal".
+
+    Original author's notes on that model (not verifiable from this file):
+    - detection model: seal_PP-OCRv4_det_server_infer.pth
+    - recognition model: ch_PP-OCRv4_rec_server_infer.pth
+    - polygon bounding boxes, low thresholds (db_thresh=0.2, box_thresh=0.6)
+    - detection boxes are not merged (enable_merge_det_boxes=False)
+    - drop_score=0 so that low-confidence results are kept
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Create the model manager; raise ImportError if MinerU is missing."""
+        super().__init__(config)
+        if not MINERU_AVAILABLE:
+            raise ImportError("MinerU components not available")
+        self.atom_model_manager = AtomModelSingleton()
+        self.seal_model = None
+
+    def initialize(self):
+        """Load the seal OCR model; failures are logged and re-raised."""
+        try:
+            self.seal_model = self.atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.OCR,
+                lang="seal",
+            )
+            logger.info("SealOCRRecognizer initialized with lang=seal")
+        except Exception as e:
+            logger.error(f"Failed to initialize SealOCRRecognizer: {e}")
+            raise
+
+    def cleanup(self):
+        """Drop the reference to the seal OCR model."""
+        self.seal_model = None
+
+    def _prepare_bgr(self, image: Union[np.ndarray, Image.Image]) -> Union[np.ndarray, None]:
+        """Return the input as a BGR array, or None when it has no pixels.
+
+        PIL input is normalised to RGB and gray arrays go through GRAY2BGR, as
+        COLOR_RGB2BGR rejects 1-channel data. Raises RuntimeError if no model.
+        """
+        if self.seal_model is None:
+            raise RuntimeError("Seal OCR model not initialized")
+        if isinstance(image, Image.Image):
+            image = np.array(image.convert("RGB"))
+        if image.size == 0:
+            return None
+        if image.ndim == 2 or (image.ndim == 3 and image.shape[2] == 1):
+            return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+        return cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+    @staticmethod
+    def _build_result(details: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Assemble the recognize() result dict from per-box details."""
+        texts = [item['text'] for item in details]
+        scores = [item['confidence'] for item in details]
+        return {
+            'text': " ".join(texts),
+            'texts': texts,
+            'confidence': sum(scores) / len(scores) if scores else 0.0,
+            'details': details,
+        }
+
+    def recognize(self, image: Union[np.ndarray, Image.Image]) -> Dict[str, Any]:
+        """Recognize the text inside a cropped seal image.
+
+        Converts the image to BGR, runs ocr(det=True, rec=True) and collects
+        the strings (the flow the author attributes to MinerU batch_analyze.py).
+        A failing OCR call is logged and produces the empty result.
+
+        Args:
+            image: cropped seal image, an RGB numpy array or a PIL image.
+
+        Returns:
+            Dict with 'text' (texts joined by spaces), 'texts', 'confidence'
+            (mean score, 0.0 if none) and 'details' (poly/text/confidence).
+
+        Raises:
+            RuntimeError: if the model has not been initialized.
+        """
+        img_bgr = self._prepare_bgr(image)
+        if img_bgr is None:
+            return self._build_result([])
+
+        try:
+            seal_ocr_res = self.seal_model.ocr(img_bgr, det=True, rec=True)
+            if not seal_ocr_res or not seal_ocr_res[0]:
+                return self._build_result([])
+
+            details: List[Dict[str, Any]] = []
+            for seal_item in seal_ocr_res[0]:
+                # Each usable item is a (polygon, rec_result) pair.
+                if not seal_item or len(seal_item) != 2:
+                    continue
+                poly, rec_result = seal_item
+                if not rec_result:
+                    continue
+                rec_text = rec_result[0]
+                # A result without a score is reported with confidence 0.0.
+                rec_conf = rec_result[1] if len(rec_result) >= 2 else 0.0
+                if rec_text:
+                    details.append({'poly': poly, 'text': rec_text, 'confidence': rec_conf})
+
+            result = self._build_result(details)
+            # Append the ellipsis only when the preview is really truncated.
+            preview = result['text'][:50] + ("..." if len(result['text']) > 50 else "")
+            logger.debug(f"Seal OCR: '{preview}' (avg conf: {result['confidence']:.3f})")
+            return result
+        except Exception as e:
+            logger.warning(f"Seal OCR recognition failed: {e}")
+            return self._build_result([])
+
+    def recognize_text(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
+        """Implement BaseOCRRecognizer: recognize() details as a plain list.
+
+        Entries lacking a polygon are skipped; every returned dict carries
+        'poly', 'text' and 'confidence'.
+        """
+        formatted: List[Dict[str, Any]] = []
+        for detail in self.recognize(image).get('details', []):
+            poly = detail.get('poly')
+            # Explicit test: `not poly` is ambiguous for numpy arrays.
+            if poly is None or len(poly) == 0:
+                continue
+            formatted.append({
+                'poly': poly,
+                'text': detail.get('text', ''),
+                'confidence': detail.get('confidence', 0.0),
+            })
+        return formatted
+
+    def detect_text_boxes(self, image: Union[np.ndarray, Image.Image]) -> List[Dict[str, Any]]:
+        """Detect seal text boxes only, without recognizing their text.
+
+        Returns one dict per detected polygon of length >= 4, holding 'poly'
+        and a fixed 'confidence' of 1.0 (no score is read from the
+        detection-only result). Empty input or a failing OCR call gives [].
+
+        Raises RuntimeError if the model has not been initialized.
+        """
+        img_bgr = self._prepare_bgr(image)
+        if img_bgr is None:
+            return []
+
+        try:
+            ocr_results = self.seal_model.ocr(img_bgr, det=True, rec=False)
+            if not ocr_results or not ocr_results[0]:
+                return []
+            return [
+                {'poly': poly, 'confidence': 1.0}
+                for poly in ocr_results[0]
+                if poly is not None and len(poly) >= 4
+            ]
+        except Exception as e:
+            logger.warning(f"Seal text box detection failed: {e}")
+            return []