| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- import argparse
- from typing import Dict
# Support both relative imports (used as a package) and absolute imports (run as a script)
- try:
- from .ocr_comparator import OCRResultComparator
- from .report_generator import ReportGenerator
- except ImportError:
- from ocr_comparator import OCRResultComparator
- from report_generator import ReportGenerator
def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
                        output_format: str = "markdown", ignore_images: bool = True,
                        table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
    """
    Compare two OCR result files and write a comparison report.

    Builds an ``OCRResultComparator``, sets its ``table_comparison_mode``,
    runs ``compare_files`` on the two paths and hands the result to
    ``ReportGenerator.generate_report``. Progress is printed to stdout.

    Args:
        file1_path: Path of the first OCR result file.
        file2_path: Path of the second OCR result file.
        output_file: Output file name (without extension), forwarded to the
            report generator.
        output_format: Output format ('json', 'markdown', 'both'), forwarded
            to the report generator.
        ignore_images: Whether to ignore image content.
            NOTE(review): this value is not read anywhere in this function —
            confirm whether it should be forwarded to the comparator.
        table_mode: Table comparison mode ('standard', 'flow_list'); assigned
            to the comparator's ``table_comparison_mode`` attribute.
        similarity_algorithm: Similarity algorithm ('ratio', 'partial_ratio',
            'token_sort_ratio', 'token_set_ratio').
            NOTE(review): only echoed in the progress output here; it is not
            passed to the comparator — confirm whether that is intended.

    Returns:
        Whatever ``OCRResultComparator.compare_files`` returned.

    Raises:
        Exception: Any error raised while comparing or generating the report
            is reported on stdout, its traceback is printed, and it is then
            re-raised unchanged.
    """
    engine = OCRResultComparator()
    engine.table_comparison_mode = table_mode

    # Echo the run configuration before doing any work.
    banner_lines = (
        "🔍 开始对比OCR结果...",
        f"📄 文件1: {file1_path}",
        f"📄 文件2: {file2_path}",
        f"📊 表格模式: {table_mode}",
        f"🔧 相似度算法: {similarity_algorithm}",
    )
    for banner in banner_lines:
        print(banner)

    try:
        # Run the comparison itself.
        comparison = engine.compare_files(file1_path, file2_path)

        # Write the report in the requested format(s).
        print("\n📝 生成报告...")
        ReportGenerator.generate_report(comparison, output_file, output_format)

        print("\n✅ 对比完成!")
        return comparison
    except Exception as exc:
        # Surface the failure to the console, then let the caller handle it.
        print(f"\n❌ 对比过程中出错: {exc!s}")
        import traceback
        traceback.print_exc()
        raise
if __name__ == "__main__":
    # Command-line entry point.
    #   * both positional paths given -> compare those two files
    #   * no positional path given    -> run the built-in flow-list demo
    #   * only one path given         -> usage error (previously the single
    #     path was silently discarded and the demo ran instead)
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
                        default='markdown', help='输出格式')
    # NOTE(review): store_true makes this default to False on the command
    # line, while compare_ocr_results() defaults ignore_images to True —
    # confirm which default is intended.
    parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
    parser.add_argument('--table-mode', choices=['standard', 'flow_list'],
                        default='standard', help='表格比较模式')
    parser.add_argument('--similarity-algorithm',
                        choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
                        default='ratio', help='相似度算法')

    args = parser.parse_args()

    if args.file1 and args.file2:
        compare_ocr_results(
            args.file1,
            args.file2,
            args.output,
            args.format,
            args.ignore_images,
            args.table_mode,
            args.similarity_algorithm
        )
    elif args.file1 or args.file2:
        # Exactly one path was supplied: refuse rather than ignore it.
        # parser.error() prints the usage message and exits with status 2.
        parser.error('必须同时提供 file1 和 file2 两个文件路径')
    else:
        # Demo run: flow-list table comparison on fixed local sample files.
        # The output name carries a timestamp so repeated runs do not
        # overwrite each other.
        import time
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results_cell_bbox/2023年度报告母公司_page_003.md',
            file2_path='/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/mineru_vllm_results_cell_bbox/2023年度报告母公司_page_003.md',
            output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
            output_format='both',
            ignore_images=True,
            table_mode='flow_list',  # use the flow-list table mode
            similarity_algorithm='ratio'
        )
|