пре 1 месец · 8bf66bc119
--- a/ocr_validator/ocr_validator_utils.py
+++ b/ocr_validator/ocr_validator_utils.py
@@ -249,6 +249,31 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
 
															                         'confidence': confidence,
														
 
															                         'source_tool': tool_name
														
 
															                     })
														
 
															+        elif category == 'seal':
														
 
															+            content = item.get('content', {})
														
 
															+            if isinstance(content, dict):
														
 
															+                if not text:
														
 
															+                    text = content.get('text', '')
														
 
															+                if not confidence or confidence == config['ocr']['default_confidence']:
														
 
															+                    confidence = content.get('confidence', confidence)
														
 
															+                recognition_method = content.get('recognition_method', '')
														
 
															+                seal_texts = content.get('texts', [])
														
 
															+            else:
														
 
															+                recognition_method = item.get('recognition_method', '')
														
 
															+                seal_texts = item.get('texts', [])
														
 
															+            if text and bbox and len(bbox) >= 4:
														
 
															+                seal_entry = {
														
 
															+                    'text': str(text).strip(),
														
 
															+                    'bbox': bbox[:4],
														
 
															+                    'category': 'seal',
														
 
															+                    'confidence': confidence,
														
 
															+                    'source_tool': tool_name,
														
 
															+                }
														
 
															+                if recognition_method:
														
 
															+                    seal_entry['recognition_method'] = recognition_method
														
 
															+                if seal_texts:
														
 
															+                    seal_entry['seal_texts'] = seal_texts
														
 
															+                parsed_data.append(seal_entry)
														
 
															         else:
														
 
															             # 其他类型，按文本处理,  header, table_cell, ...
														
 
															             if text and bbox and len(bbox) >= 4:
														
@@ -272,14 +297,16 @@ def detect_mineru_structure(data: Union[List, Dict]) -> bool:
 
															     if not isinstance(first_item, dict):
														
 
															         return False
														
 
															-    # MinerU特征：包含type字段，且值为text/table/image之一
														
 
															+    # MinerU / pipeline page json：type + bbox（text 可为空，如部分 text 块）
														
 
															     has_type = 'type' in first_item
														
 
															     has_bbox = 'bbox' in first_item
														
 
															-    has_text = 'text' in first_item
														
 
															-    if has_type and has_bbox and has_text:
														
 
															+    if has_type and has_bbox:
														
 
															         item_type = first_item.get('type', '')
														
 
															-        return item_type in ['text', 'table', 'image']
														
 
															+        return item_type in [
														
 
															+            'text', 'table', 'table_body', 'image', 'title',
														
 
															+            'seal', 'list', 'header', 'footer',
														
 
															+        ]
														
 
															     return False
														
@@ -541,15 +568,20 @@ def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
 
															             if isinstance(bbox, list) and len(bbox) == 4:
														
 
															                 if text not in text_bbox_mapping:
														
 
															                     text_bbox_mapping[text] = []
														
 
															-                text_bbox_mapping[text].append({
														
 
															+                mapping_entry = {
														
 
															                     'matched_text': item.get('matched_text', ''),
														
 
															                     'bbox': bbox,
														
 
															                     'category': item.get('category', 'Text'),
														
 
															                     'index': i,
														
 
															                     'confidence': item.get('confidence', config['ocr']['default_confidence']),
														
 
															                     'source_tool': item.get('source_tool', 'unknown'),
														
 
															-                    'rotation_angle': item.get('rotation_angle', 0.0)  # 添加旋转角度信息
														
 
															-                })
														
 
															+                    'rotation_angle': item.get('rotation_angle', 0.0),
														
 
															+                }
														
 
															+                if item.get('recognition_method'):
														
 
															+                    mapping_entry['recognition_method'] = item['recognition_method']
														
 
															+                if item.get('seal_texts'):
														
 
															+                    mapping_entry['seal_texts'] = item['seal_texts']
														
 
															+                text_bbox_mapping[text].append(mapping_entry)
														
 
															     return text_bbox_mapping