# zhengchun/ocr_platform
import argparse
import sys
from typing import Dict
from pathlib import Path

# Add the ocr_platform root directory to the Python import path.
# resolve() makes the path absolute, which avoids the IndexError that
# parents[1] can raise when the path is relative.
_file_path = Path(__file__).resolve()
ocr_platform_root = _file_path.parents[1]  # compare_ocr_results.py -> ocr_comparator -> ocr_platform
if str(ocr_platform_root) not in sys.path:
    sys.path.insert(0, str(ocr_platform_root))

# Support both relative and absolute imports: try the package-relative form
# first and, if it raises ImportError (e.g. the file is run directly as a
# script with no parent package), fall back to the absolute module names.
try:
    from .ocr_comparator import OCRResultComparator
    from .report_generator import ReportGenerator
except ImportError:
    from ocr_comparator import OCRResultComparator
    from report_generator import ReportGenerator

def compare_ocr_results(
    file1_path: str,
    file2_path: str,
    output_file: str = "comparison_report",
    output_format: str = "markdown",
    ignore_images: bool = True,
    table_mode: str = 'standard',
    similarity_algorithm: str = 'ratio',
) -> Dict:
    """
    Compare two OCR result files and generate a comparison report.

    Args:
        file1_path: Path to the first OCR result file.
        file2_path: Path to the second OCR result file.
        output_file: Output file name (without extension), handed to the
            report generator.
        output_format: Report format ('json', 'markdown', 'both').
        ignore_images: Whether to ignore image content.
            NOTE(review): this argument is accepted but never read in this
            function -- confirm whether it should reach the comparator.
        table_mode: Table comparison mode ('standard', 'flow_list'); stored on
            the comparator's ``table_comparison_mode`` attribute.
        similarity_algorithm: Similarity algorithm name ('ratio',
            'partial_ratio', 'token_sort_ratio', 'token_set_ratio').
            NOTE(review): only echoed in the progress output here; it is not
            passed to the comparator -- confirm intent.

    Returns:
        Whatever ``OCRResultComparator.compare_files`` returns for the two files.

    Raises:
        Exception: Any error raised while comparing or generating the report
            is printed together with its traceback and then re-raised.
    """
    engine = OCRResultComparator()
    engine.table_comparison_mode = table_mode

    # Progress banner describing this run.
    banner_lines = (
        "🔍 开始对比OCR结果...",
        f"📄 文件1: {file1_path}",
        f"📄 文件2: {file2_path}",
        f"📊 表格模式: {table_mode}",
        f"🔧 相似度算法: {similarity_algorithm}",
    )
    for banner_line in banner_lines:
        print(banner_line)

    try:
        # Run the comparison.
        comparison = engine.compare_files(file1_path, file2_path)

        # Generate the report.
        print(f"\n📝 生成报告...")
        ReportGenerator.generate_report(comparison, output_file, output_format)

        print(f"\n✅ 对比完成！")
        return comparison

    except Exception as exc:
        # Report the failure with a full traceback, then let the caller see it.
        print(f"\n❌ 对比过程中出错: {str(exc)}")
        import traceback
        traceback.print_exc()
        raise


if __name__ == "__main__":
    # Command-line entry point. Both positional paths are optional so that
    # running the script with no arguments falls through to the built-in
    # flow-list sample comparison at the end of this block.
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'], 
                       default='markdown', help='输出格式')
    parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
    parser.add_argument('--table-mode', choices=['standard', 'flow_list'], 
                       default='standard', help='表格比较模式')
    parser.add_argument('--similarity-algorithm', 
                       choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
                       default='ratio', help='相似度算法')
    
    args = parser.parse_args()

    if args.file1 and args.file2:
        # Normal use: compare the two files named on the command line.
        compare_ocr_results(
            args.file1, 
            args.file2, 
            args.output, 
            args.format,
            args.ignore_images,
            args.table_mode,
            args.similarity_algorithm
        )
    elif args.file1 or args.file2:
        # Only one path was supplied. Falling through to the sample run would
        # silently discard the user's input, so report a usage error instead
        # (argparse prints the usage text and exits with status 2).
        parser.error('必须同时提供 file1 和 file2 两个文件路径')
    else:
        # No paths supplied: run the flow-list table comparison sample.
        # NOTE: the paths below are absolute locations on the original
        # developer's machine and will not exist elsewhere.
        import time
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
            file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
            output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
            output_format='both',
            ignore_images=True,
            table_mode='flow_list',  # use the flow-list table mode
            similarity_algorithm='ratio'
        )