|
@@ -249,6 +249,31 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
|
|
|
'confidence': confidence,
|
|
'confidence': confidence,
|
|
|
'source_tool': tool_name
|
|
'source_tool': tool_name
|
|
|
})
|
|
})
|
|
|
|
|
+ elif category == 'seal':
|
|
|
|
|
+ content = item.get('content', {})
|
|
|
|
|
+ if isinstance(content, dict):
|
|
|
|
|
+ if not text:
|
|
|
|
|
+ text = content.get('text', '')
|
|
|
|
|
+ if not confidence or confidence == config['ocr']['default_confidence']:
|
|
|
|
|
+ confidence = content.get('confidence', confidence)
|
|
|
|
|
+ recognition_method = content.get('recognition_method', '')
|
|
|
|
|
+ seal_texts = content.get('texts', [])
|
|
|
|
|
+ else:
|
|
|
|
|
+ recognition_method = item.get('recognition_method', '')
|
|
|
|
|
+ seal_texts = item.get('texts', [])
|
|
|
|
|
+ if text and bbox and len(bbox) >= 4:
|
|
|
|
|
+ seal_entry = {
|
|
|
|
|
+ 'text': str(text).strip(),
|
|
|
|
|
+ 'bbox': bbox[:4],
|
|
|
|
|
+ 'category': 'seal',
|
|
|
|
|
+ 'confidence': confidence,
|
|
|
|
|
+ 'source_tool': tool_name,
|
|
|
|
|
+ }
|
|
|
|
|
+ if recognition_method:
|
|
|
|
|
+ seal_entry['recognition_method'] = recognition_method
|
|
|
|
|
+ if seal_texts:
|
|
|
|
|
+ seal_entry['seal_texts'] = seal_texts
|
|
|
|
|
+ parsed_data.append(seal_entry)
|
|
|
else:
|
|
else:
|
|
|
# 其他类型,按文本处理, header, table_cell, ...
|
|
# 其他类型,按文本处理, header, table_cell, ...
|
|
|
if text and bbox and len(bbox) >= 4:
|
|
if text and bbox and len(bbox) >= 4:
|
|
@@ -272,14 +297,16 @@ def detect_mineru_structure(data: Union[List, Dict]) -> bool:
|
|
|
if not isinstance(first_item, dict):
|
|
if not isinstance(first_item, dict):
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
- # MinerU特征:包含type字段,且值为text/table/image之一
|
|
|
|
|
|
|
+ # MinerU / pipeline page json:type + bbox(text 可为空,如部分 text 块)
|
|
|
has_type = 'type' in first_item
|
|
has_type = 'type' in first_item
|
|
|
has_bbox = 'bbox' in first_item
|
|
has_bbox = 'bbox' in first_item
|
|
|
- has_text = 'text' in first_item
|
|
|
|
|
|
|
|
|
|
- if has_type and has_bbox and has_text:
|
|
|
|
|
|
|
+ if has_type and has_bbox:
|
|
|
item_type = first_item.get('type', '')
|
|
item_type = first_item.get('type', '')
|
|
|
- return item_type in ['text', 'table', 'image']
|
|
|
|
|
|
|
+ return item_type in [
|
|
|
|
|
+ 'text', 'table', 'table_body', 'image', 'title',
|
|
|
|
|
+ 'seal', 'list', 'header', 'footer',
|
|
|
|
|
+ ]
|
|
|
|
|
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
@@ -541,15 +568,20 @@ def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
|
|
|
if isinstance(bbox, list) and len(bbox) == 4:
|
|
if isinstance(bbox, list) and len(bbox) == 4:
|
|
|
if text not in text_bbox_mapping:
|
|
if text not in text_bbox_mapping:
|
|
|
text_bbox_mapping[text] = []
|
|
text_bbox_mapping[text] = []
|
|
|
- text_bbox_mapping[text].append({
|
|
|
|
|
|
|
+ mapping_entry = {
|
|
|
'matched_text': item.get('matched_text', ''),
|
|
'matched_text': item.get('matched_text', ''),
|
|
|
'bbox': bbox,
|
|
'bbox': bbox,
|
|
|
'category': item.get('category', 'Text'),
|
|
'category': item.get('category', 'Text'),
|
|
|
'index': i,
|
|
'index': i,
|
|
|
'confidence': item.get('confidence', config['ocr']['default_confidence']),
|
|
'confidence': item.get('confidence', config['ocr']['default_confidence']),
|
|
|
'source_tool': item.get('source_tool', 'unknown'),
|
|
'source_tool': item.get('source_tool', 'unknown'),
|
|
|
- 'rotation_angle': item.get('rotation_angle', 0.0) # 添加旋转角度信息
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ 'rotation_angle': item.get('rotation_angle', 0.0),
|
|
|
|
|
+ }
|
|
|
|
|
+ if item.get('recognition_method'):
|
|
|
|
|
+ mapping_entry['recognition_method'] = item['recognition_method']
|
|
|
|
|
+ if item.get('seal_texts'):
|
|
|
|
|
+ mapping_entry['seal_texts'] = item['seal_texts']
|
|
|
|
|
+ text_bbox_mapping[text].append(mapping_entry)
|
|
|
|
|
|
|
|
return text_bbox_mapping
|
|
return text_bbox_mapping
|
|
|
|
|
|