"""Command-line entry point for comparing two OCR result files."""

import argparse
import time
import traceback
from typing import Dict

# Support both package-relative imports (when loaded as part of a package)
# and plain absolute imports (when executed directly as a script).
try:
    from .ocr_comparator import OCRResultComparator
    from .report_generator import ReportGenerator
except ImportError:
    from ocr_comparator import OCRResultComparator
    from report_generator import ReportGenerator


def compare_ocr_results(file1_path: str, file2_path: str,
                        output_file: str = "comparison_report",
                        output_format: str = "markdown",
                        ignore_images: bool = True,
                        table_mode: str = 'standard',
                        similarity_algorithm: str = 'ratio') -> Dict:
    """Compare two OCR result files and write a comparison report.

    Args:
        file1_path: Path to the first OCR result file.
        file2_path: Path to the second OCR result file.
        output_file: Output file name, without extension.
        output_format: Report format ('json', 'markdown', 'both').
        ignore_images: Whether image content should be ignored.
            NOTE(review): not referenced anywhere in this function body.
        table_mode: Table comparison mode ('standard', 'flow_list'); assigned
            to the comparator's ``table_comparison_mode`` attribute.
        similarity_algorithm: Similarity algorithm ('ratio', 'partial_ratio',
            'token_sort_ratio', 'token_set_ratio').
            NOTE(review): only echoed to stdout here; this function does not
            forward it to the comparator.

    Returns:
        The object returned by ``OCRResultComparator.compare_files``.

    Raises:
        Exception: Any error raised while comparing or generating the report
            is printed together with its traceback and then re-raised.
    """
    comparator = OCRResultComparator()
    comparator.table_comparison_mode = table_mode

    # Echo the run configuration before doing any work.
    banner_lines = (
        "🔍 开始对比OCR结果...",
        f"📄 文件1: {file1_path}",
        f"📄 文件2: {file2_path}",
        f"📊 表格模式: {table_mode}",
        f"🔧 相似度算法: {similarity_algorithm}",
    )
    for banner_line in banner_lines:
        print(banner_line)

    try:
        # Run the comparison.
        result = comparator.compare_files(file1_path, file2_path)

        # Generate the report.
        print(f"\n📝 生成报告...")
        ReportGenerator.generate_report(result, output_file, output_format)

        print(f"\n✅ 对比完成!")
    except Exception as e:
        # Report the failure to the console, then let the caller handle it.
        print(f"\n❌ 对比过程中出错: {str(e)}")
        traceback.print_exc()
        raise

    return result


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    # Both positionals are optional so the script can fall back to the
    # built-in demo run when they are omitted.
    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report',
                        help='输出文件名')
    parser.add_argument('-f', '--format',
                        choices=['json', 'markdown', 'both'],
                        default='markdown', help='输出格式')
    # store_true: the value is False unless the flag is given, which differs
    # from the True default of compare_ocr_results(ignore_images=...).
    parser.add_argument('--ignore-images', action='store_true',
                        help='忽略图片内容')
    parser.add_argument('--table-mode',
                        choices=['standard', 'flow_list'],
                        default='standard', help='表格比较模式')
    parser.add_argument('--similarity-algorithm',
                        choices=['ratio', 'partial_ratio',
                                 'token_sort_ratio', 'token_set_ratio'],
                        default='ratio', help='相似度算法')
    return parser


if __name__ == "__main__":
    args = _build_arg_parser().parse_args()

    if not (args.file1 and args.file2):
        # No input files supplied: run the flow-list table comparison test
        # against the hard-coded sample files.
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results_cell_bbox/2023年度报告母公司_page_003.md',
            file2_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/mineru_vllm_results_cell_bbox/2023年度报告母公司_page_003.md',
            output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
            output_format='both',
            ignore_images=True,
            table_mode='flow_list',  # use the flow-list table mode
            similarity_algorithm='ratio',
        )
    else:
        compare_ocr_results(
            file1_path=args.file1,
            file2_path=args.file2,
            output_file=args.output,
            output_format=args.format,
            ignore_images=args.ignore_images,
            table_mode=args.table_mode,
            similarity_algorithm=args.similarity_algorithm,
        )