|
@@ -337,6 +337,21 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
|
|
|
'img_path': img_path,
|
|
'img_path': img_path,
|
|
|
'table_body': table_html
|
|
'table_body': table_html
|
|
|
})
|
|
})
|
|
|
|
|
+ table_cells = item.get(tool_config.get('table_cells_field', 'table_cells'), [])
|
|
|
|
|
+ for cell in table_cells:
|
|
|
|
|
+ cell_text = cell.get('text', '')
|
|
|
|
|
+ cell_bbox = cell.get('bbox', [])
|
|
|
|
|
+ if cell_text and cell_bbox and len(cell_bbox) >= 4:
|
|
|
|
|
+ parsed_data.append({
|
|
|
|
|
+ 'text': str(cell_text).strip(),
|
|
|
|
|
+ 'bbox': cell_bbox[:4],
|
|
|
|
|
+ 'row': cell.get('row', -1),
|
|
|
|
|
+ 'col': cell.get('col', -1),
|
|
|
|
|
+ 'category': 'table_cell',
|
|
|
|
|
+ 'confidence': cell.get('score', 0.0),
|
|
|
|
|
+ 'source_tool': tool_name,
|
|
|
|
|
+ })
|
|
|
|
|
+ # 处理图片类型
|
|
|
elif category == 'image':
|
|
elif category == 'image':
|
|
|
img_path = item.get(tool_config.get('img_path_field', 'img_path'), '')
|
|
img_path = item.get(tool_config.get('img_path_field', 'img_path'), '')
|
|
|
if bbox and len(bbox) >= 4:
|
|
if bbox and len(bbox) >= 4:
|
|
@@ -348,17 +363,6 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
|
|
|
'source_tool': tool_name,
|
|
'source_tool': tool_name,
|
|
|
'img_path': img_path
|
|
'img_path': img_path
|
|
|
})
|
|
})
|
|
|
- elif category == 'table_cell':
|
|
|
|
|
- if bbox and len(bbox) >= 4:
|
|
|
|
|
- parsed_data.append({
|
|
|
|
|
- 'text': str(text).strip(),
|
|
|
|
|
- 'bbox': bbox[:4],
|
|
|
|
|
- 'row': item.get('row', -1),
|
|
|
|
|
- 'col': item.get('col', -1),
|
|
|
|
|
- 'category': 'table_cell',
|
|
|
|
|
- 'confidence': confidence,
|
|
|
|
|
- 'source_tool': tool_name,
|
|
|
|
|
- })
|
|
|
|
|
else:
|
|
else:
|
|
|
# 其他类型,按文本处理, header, table_cell, ...
|
|
# 其他类型,按文本处理, header, table_cell, ...
|
|
|
if text and bbox and len(bbox) >= 4:
|
|
if text and bbox and len(bbox) >= 4:
|