zhengchun
/
ocr_verify


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
"""
OCR data processing.
"""
import sys
from pathlib import Path
from typing import List, Dict, Tuple

# Put the parent of this file's directory at the front of sys.path so the
# absolute import below can be resolved.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Try the absolute import first; if it raises ImportError, fall back to the
# package-relative import of the same module.
try:
	from table_line_generator import TableLineGenerator
except ImportError:
	from ..table_line_generator import TableLineGenerator


def get_structure_from_ocr(
    raw_data: Dict,
    tool: str = "ppstructv3"
) -> Tuple[List[int], Dict]:
    """
    Build the table structure from raw OCR output (single shared pipeline).

    The raw result is parsed with ``TableLineGenerator.parse_ocr_data``, a
    generator is created from the parsed data, and the structure is analysed
    with a method chosen by ``tool``.

    Args:
        raw_data: Raw OCR result.
        tool: Tool type ("ppstructv3" / "mineru"). It is compared
            case-insensitively when picking the analysis method; any value
            other than "mineru" takes the clustering path.

    Returns:
        (table_bbox, structure): the table bounding box returned by the
        parser and the structure information returned by the analysis.
    """
    from PIL import Image

    # Step 1: parse the raw data through the unified parser entry point.
    table_bbox, parsed_ocr = TableLineGenerator.parse_ocr_data(raw_data, tool)

    # Step 2: create the generator on a blank white placeholder image.
    # NOTE(review): presumably only the OCR data matters for the structure
    # analysis, so no real page image is needed here -- confirm.
    blank_page = Image.new('RGB', (2000, 3000), 'white')
    line_generator = TableLineGenerator(blank_page, parsed_ocr)

    # Step 3: choose the analysis arguments according to the tool.
    if tool.lower() == "mineru":
        # MinerU goes through the "mineru" (index-based) method.
        analysis_kwargs = {"method": "mineru"}
    else:
        # Everything else (PPStructure) goes through the clustering method.
        analysis_kwargs = {
            "y_tolerance": 5,
            "x_tolerance": 10,
            "min_row_height": 20,
            "method": "cluster",
        }

    return table_bbox, line_generator.analyze_table_structure(**analysis_kwargs)