vor 2 Tagen · 26b22366fa
--- a/table_line_generator/editor/data_processor.py
+++ b/table_line_generator/editor/data_processor.py
@@ -14,70 +14,41 @@ except ImportError:
 
				 	from ..table_line_generator import TableLineGenerator
			
 
				 
			
 
				 
			
 
				-def parse_ocr_data(raw_data: Dict, tool: str = "ppstructv3") -> Tuple[List[int], List[Dict]]:
			
 
				-    """
			
 
				-    解析 OCR 数据（根据工具类型自动选择解析方法）
			
 
				-    
			
 
				-    Args:
			
 
				-        raw_data: 原始 OCR 结果
			
 
				-        tool: 工具类型 ("ppstructv3" 或 "mineru")
			
 
				-    
			
 
				-    Returns:
			
 
				-        (table_bbox, ocr_data): 表格边界框和文本框列表
			
 
				-    """
			
 
				-    if tool.lower() == "mineru":
			
 
				-        # 使用 MinerU 专用解析方法
			
 
				-        table_bbox, structure = TableLineGenerator.parse_mineru_table_result(raw_data)
			
 
				-        
			
 
				-        # 🔑 将结构转换为 ocr_data 格式（兼容现有逻辑）
			
 
				-        ocr_data = []
			
 
				-        for row in structure['rows']:
			
 
				-            for bbox in row['bboxes']:
			
 
				-                ocr_data.append({
			
 
				-                    'bbox': bbox,
			
 
				-                    'text': ''  # MinerU 可能没有文本，或需要从 table_cells 提取
			
 
				-                })
			
 
				-        
			
 
				-        return table_bbox, ocr_data
			
 
				-    
			
 
				-    elif tool.lower() == "ppstructv3":
			
 
				-        # 🔑 PPStructure V3 格式
			
 
				-        return TableLineGenerator.parse_ppstructure_result(raw_data)
			
 
				-    
			
 
				-    else:
			
 
				-        raise ValueError(f"不支持的工具类型: {tool}，支持的类型: ppstructv3, mineru")
			
 
				-
			
 
				-
			
 
				 def get_structure_from_ocr(
			
 
				     raw_data: Dict, 
			
 
				     tool: str = "ppstructv3"
			
 
				 ) -> Tuple[List[int], Dict]:
			
 
				     """
			
 
				-    从 OCR 数据直接生成表格结构
			
 
				+    从 OCR 数据生成表格结构（统一处理流程）
			
 
				     
			
 
				     Args:
			
 
				         raw_data: 原始 OCR 结果
			
 
				-        tool: 工具类型
			
 
				+        tool: 工具类型 ("ppstructv3" / "mineru")
			
 
				     
			
 
				     Returns:
			
 
				         (table_bbox, structure): 表格边界框和结构信息
			
 
				     """
			
 
				-    if tool.lower() == "mineru":
			
 
				-        # 🔑 MinerU：直接传入完整 JSON，方法内部会提取 table
			
 
				-        return TableLineGenerator.parse_mineru_table_result(raw_data)
			
 
				+    from PIL import Image
			
 
				     
			
 
				-    elif tool.lower() == "ppstructv3" or tool.lower() == "ppstructure":
			
 
				-        # 🔑 PPStructure V3：需要先解析再分析
			
 
				-        table_bbox, ocr_data = TableLineGenerator.parse_ppstructure_result(raw_data)
			
 
				-        
			
 
				-        # 使用临时生成器分析结构
			
 
				-        from PIL import Image
			
 
				-        dummy_image = Image.new('RGB', (2000, 3000), 'white')
			
 
				-        generator = TableLineGenerator(dummy_image, ocr_data)
			
 
				-        structure = generator.analyze_table_structure()
			
 
				-        
			
 
				-        return table_bbox, structure
			
 
				+    # 🎯 第一步：解析数据（统一接口）
			
 
				+    table_bbox, ocr_data = TableLineGenerator.parse_ocr_data(raw_data, tool)
			
 
				     
			
 
				+    # 🎯 第二步：创建生成器
			
 
				+    dummy_image = Image.new('RGB', (2000, 3000), 'white')
			
 
				+    generator = TableLineGenerator(dummy_image, ocr_data)
			
 
				+    
			
 
				+    # 🎯 第三步：分析结构（根据工具选择算法）
			
 
				+    if tool.lower() == "mineru":
			
 
				+        # MinerU 使用基于索引的算法
			
 
				+        structure = generator.analyze_table_structure(method="mineru")
			
 
				     else:
			
 
				-        raise ValueError(f"不支持的工具类型: {tool}")
			
 
				+        # PPStructure 使用聚类算法
			
 
				+        structure = generator.analyze_table_structure(
			
 
				+            y_tolerance=5,
			
 
				+            x_tolerance=10,
			
 
				+            min_row_height=20,
			
 
				+            method="cluster"
			
 
				+        )
			
 
				+    
			
 
				+    return table_bbox, structure