|
|
@@ -363,6 +363,21 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
|
|
|
'source_tool': tool_name,
|
|
|
'img_path': img_path
|
|
|
})
|
|
|
+ elif category in ['list']:
|
|
|
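+ # Handle 'list' blocks only (titles are not matched by this branch): emit one entry per list item, each reusing the block's bbox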
|
|
|
+ list_items = item.get('list_items', [])
|
|
|
+ sub_type = item.get('sub_type', 'unordered') # 有序或无序
|
|
|
+
|
|
|
+ for list_item in list_items:
|
|
|
+ if list_item and bbox and len(bbox) >= 4:
|
|
|
+ parsed_data.append({
|
|
|
+ 'text': str(list_item).strip(),
|
|
|
+ 'bbox': bbox[:4],
|
|
|
+ 'category': category,
|
|
|
+ 'sub_type': sub_type,
|
|
|
+ 'confidence': confidence,
|
|
|
+ 'source_tool': tool_name
|
|
|
+ })
|
|
|
else:
|
|
|
# 其他类型,按文本处理, header, table_cell, ...
|
|
|
if text and bbox and len(bbox) >= 4:
|