1 month ago · 470a4d677e
--- a/ocr_validator_utils.py
+++ b/ocr_validator_utils.py
@@ -337,6 +337,21 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
 
															                     'img_path': img_path,
														
 
															                     'table_body': table_html
														
 
															                 })
														
 
															+            table_cells = item.get(tool_config.get('table_cells_field', 'table_cells'), [])
														
 
															+            for cell in table_cells:
														
 
															+                cell_text = cell.get('text', '')
														
 
															+                cell_bbox = cell.get('bbox', [])
														
 
															+                if cell_text and cell_bbox and len(cell_bbox) >= 4:
														
 
															+                    parsed_data.append({
														
 
															+                        'text': str(cell_text).strip(),
														
 
															+                        'bbox': cell_bbox[:4],
														
 
															+                        'row': cell.get('row', -1),
														
 
															+                        'col': cell.get('col', -1),
														
 
															+                        'category': 'table_cell',
														
 
															+                        'confidence': cell.get('score', 0.0),
														
 
															+                        'source_tool': tool_name,
														
 
															+                    })
														
 
															+        # 处理图片类型
														
 
															         elif category == 'image':
														
 
															             img_path = item.get(tool_config.get('img_path_field', 'img_path'), '')
														
 
															             if bbox and len(bbox) >= 4:
														
@@ -348,17 +363,6 @@ def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict
 
															                     'source_tool': tool_name,
														
 
															                     'img_path': img_path
														
 
															                 })
														
 
															-        elif category == 'table_cell':
														
 
															-            if bbox and len(bbox) >= 4:
														
 
															-                parsed_data.append({
														
 
															-                    'text': str(text).strip(),
														
 
															-                    'bbox': bbox[:4],
														
 
															-                    'row': item.get('row', -1),
														
 
															-                    'col': item.get('col', -1),
														
 
															-                    'category': 'table_cell',
														
 
															-                    'confidence': confidence,
														
 
															-                    'source_tool': tool_name,
														
 
															-                })
														
 
															         else:
														
 
															             # 其他类型，按文本处理,  header, table_cell, ...
														
 
															             if text and bbox and len(bbox) >= 4: