1 lună în urmă · 8bf66bc119
--- a/ocr_validator/ocr_validator_utils.py
+++ b/ocr_validator/ocr_validator_utils.py
@@ -249,6 +249,31 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
 
				                         'confidence': confidence,
			
 
				                         'source_tool': tool_name
			
 
				                     })
			
 
				+        elif category == 'seal':
			
 
				+            content = item.get('content', {})
			
 
				+            if isinstance(content, dict):
			
 
				+                if not text:
			
 
				+                    text = content.get('text', '')
			
 
				+                if not confidence or confidence == config['ocr']['default_confidence']:
			
 
				+                    confidence = content.get('confidence', confidence)
			
 
				+                recognition_method = content.get('recognition_method', '')
			
 
				+                seal_texts = content.get('texts', [])
			
 
				+            else:
			
 
				+                recognition_method = item.get('recognition_method', '')
			
 
				+                seal_texts = item.get('texts', [])
			
 
				+            if text and bbox and len(bbox) >= 4:
			
 
				+                seal_entry = {
			
 
				+                    'text': str(text).strip(),
			
 
				+                    'bbox': bbox[:4],
			
 
				+                    'category': 'seal',
			
 
				+                    'confidence': confidence,
			
 
				+                    'source_tool': tool_name,
			
 
				+                }
			
 
				+                if recognition_method:
			
 
				+                    seal_entry['recognition_method'] = recognition_method
			
 
				+                if seal_texts:
			
 
				+                    seal_entry['seal_texts'] = seal_texts
			
 
				+                parsed_data.append(seal_entry)
			
 
				         else:
			
 
				             # 其他类型，按文本处理,  header, table_cell, ...
			
 
				             if text and bbox and len(bbox) >= 4:
			
@@ -272,14 +297,16 @@ def detect_mineru_structure(data: Union[List, Dict]) -> bool:
 
				     if not isinstance(first_item, dict):
			
 
				         return False
			
 
				     
			
 
				-    # MinerU特征：包含type字段，且值为text/table/image之一
			
 
				+    # MinerU / pipeline page json：type + bbox（text 可为空，如部分 text 块）
			
 
				     has_type = 'type' in first_item
			
 
				     has_bbox = 'bbox' in first_item
			
 
				-    has_text = 'text' in first_item
			
 
				     
			
 
				-    if has_type and has_bbox and has_text:
			
 
				+    if has_type and has_bbox:
			
 
				         item_type = first_item.get('type', '')
			
 
				-        return item_type in ['text', 'table', 'image']
			
 
				+        return item_type in [
			
 
				+            'text', 'table', 'table_body', 'image', 'title',
			
 
				+            'seal', 'list', 'header', 'footer',
			
 
				+        ]
			
 
				     
			
 
				     return False
			
 
				 
			
@@ -541,15 +568,20 @@ def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
 
				             if isinstance(bbox, list) and len(bbox) == 4:
			
 
				                 if text not in text_bbox_mapping:
			
 
				                     text_bbox_mapping[text] = []
			
 
				-                text_bbox_mapping[text].append({
			
 
				+                mapping_entry = {
			
 
				                     'matched_text': item.get('matched_text', ''),
			
 
				                     'bbox': bbox,
			
 
				                     'category': item.get('category', 'Text'),
			
 
				                     'index': i,
			
 
				                     'confidence': item.get('confidence', config['ocr']['default_confidence']),
			
 
				                     'source_tool': item.get('source_tool', 'unknown'),
			
 
				-                    'rotation_angle': item.get('rotation_angle', 0.0)  # 添加旋转角度信息
			
 
				-                })
			
 
				+                    'rotation_angle': item.get('rotation_angle', 0.0),
			
 
				+                }
			
 
				+                if item.get('recognition_method'):
			
 
				+                    mapping_entry['recognition_method'] = item['recognition_method']
			
 
				+                if item.get('seal_texts'):
			
 
				+                    mapping_entry['seal_texts'] = item['seal_texts']
			
 
				+                text_bbox_mapping[text].append(mapping_entry)
			
 
				     
			
 
				     return text_bbox_mapping