Răsfoiți Sursa

feat(增强印章OCR处理): 在ocr_validator_utils.py中新增对印章类别的支持,优化文本解析逻辑,添加印章相关信息的提取与处理,提升印章识别能力与数据解析的准确性。

zhch158_admin 1 lună în urmă
părinte
comite
8bf66bc119
1 fișier modificat, cu 39 de adăugiri și 7 ștergeri
  1. 39 7
      ocr_validator/ocr_validator_utils.py

+ 39 - 7
ocr_validator/ocr_validator_utils.py

@@ -249,6 +249,31 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
                         'confidence': confidence,
                         'source_tool': tool_name
                     })
+        elif category == 'seal':
+            content = item.get('content', {})
+            if isinstance(content, dict):
+                if not text:
+                    text = content.get('text', '')
+                if not confidence or confidence == config['ocr']['default_confidence']:
+                    confidence = content.get('confidence', confidence)
+                recognition_method = content.get('recognition_method', '')
+                seal_texts = content.get('texts', [])
+            else:
+                recognition_method = item.get('recognition_method', '')
+                seal_texts = item.get('texts', [])
+            if text and bbox and len(bbox) >= 4:
+                seal_entry = {
+                    'text': str(text).strip(),
+                    'bbox': bbox[:4],
+                    'category': 'seal',
+                    'confidence': confidence,
+                    'source_tool': tool_name,
+                }
+                if recognition_method:
+                    seal_entry['recognition_method'] = recognition_method
+                if seal_texts:
+                    seal_entry['seal_texts'] = seal_texts
+                parsed_data.append(seal_entry)
         else:
             # 其他类型,按文本处理,  header, table_cell, ...
             if text and bbox and len(bbox) >= 4:
@@ -272,14 +297,16 @@ def detect_mineru_structure(data: Union[List, Dict]) -> bool:
     if not isinstance(first_item, dict):
         return False
     
-    # MinerU特征:包含type字段,且值为text/table/image之一
+    # MinerU / pipeline page json:type + bbox(text 可为空,如部分 text 块)
     has_type = 'type' in first_item
     has_bbox = 'bbox' in first_item
-    has_text = 'text' in first_item
     
-    if has_type and has_bbox and has_text:
+    if has_type and has_bbox:
         item_type = first_item.get('type', '')
-        return item_type in ['text', 'table', 'image']
+        return item_type in [
+            'text', 'table', 'table_body', 'image', 'title',
+            'seal', 'list', 'header', 'footer',
+        ]
     
     return False
 
@@ -541,15 +568,20 @@ def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
             if isinstance(bbox, list) and len(bbox) == 4:
                 if text not in text_bbox_mapping:
                     text_bbox_mapping[text] = []
-                text_bbox_mapping[text].append({
+                mapping_entry = {
                     'matched_text': item.get('matched_text', ''),
                     'bbox': bbox,
                     'category': item.get('category', 'Text'),
                     'index': i,
                     'confidence': item.get('confidence', config['ocr']['default_confidence']),
                     'source_tool': item.get('source_tool', 'unknown'),
-                    'rotation_angle': item.get('rotation_angle', 0.0)  # 添加旋转角度信息
-                })
+                    'rotation_angle': item.get('rotation_angle', 0.0),
+                }
+                if item.get('recognition_method'):
+                    mapping_entry['recognition_method'] = item['recognition_method']
+                if item.get('seal_texts'):
+                    mapping_entry['seal_texts'] = item['seal_texts']
+                text_bbox_mapping[text].append(mapping_entry)
     
     return text_bbox_mapping