compare_ocr_results.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. import argparse
  2. import sys
  3. from typing import Dict
  4. from pathlib import Path
  5. # 添加 ocr_platform 根目录到 Python 路径
  6. # 使用 resolve() 确保路径是绝对路径,避免相对路径导致的 IndexError
  7. _file_path = Path(__file__).resolve()
  8. ocr_platform_root = _file_path.parents[1] # compare_ocr_results.py -> ocr_comparator -> ocr_platform
  9. if str(ocr_platform_root) not in sys.path:
  10. sys.path.insert(0, str(ocr_platform_root))
  11. # ✅ 兼容相对导入和绝对导入
  12. try:
  13. from .ocr_comparator import OCRResultComparator
  14. from .report_generator import ReportGenerator
  15. except ImportError:
  16. from ocr_comparator import OCRResultComparator
  17. from report_generator import ReportGenerator
  18. def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
  19. output_format: str = "markdown", ignore_images: bool = True,
  20. table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
  21. """
  22. 比较两个OCR结果文件
  23. Args:
  24. file1_path: 第一个OCR结果文件路径
  25. file2_path: 第二个OCR结果文件路径
  26. output_file: 输出文件名(不含扩展名)
  27. output_format: 输出格式 ('json', 'markdown', 'both')
  28. ignore_images: 是否忽略图片内容
  29. table_mode: 表格比较模式 ('standard', 'flow_list')
  30. similarity_algorithm: 相似度算法 ('ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio')
  31. """
  32. comparator = OCRResultComparator()
  33. comparator.table_comparison_mode = table_mode
  34. print("🔍 开始对比OCR结果...")
  35. print(f"📄 文件1: {file1_path}")
  36. print(f"📄 文件2: {file2_path}")
  37. print(f"📊 表格模式: {table_mode}")
  38. print(f"🔧 相似度算法: {similarity_algorithm}")
  39. try:
  40. # 执行比较
  41. result = comparator.compare_files(file1_path, file2_path)
  42. # 生成报告
  43. print(f"\n📝 生成报告...")
  44. ReportGenerator.generate_report(result, output_file, output_format)
  45. print(f"\n✅ 对比完成!")
  46. return result
  47. except Exception as e:
  48. print(f"\n❌ 对比过程中出错: {str(e)}")
  49. import traceback
  50. traceback.print_exc()
  51. raise
  52. if __name__ == "__main__":
  53. parser = argparse.ArgumentParser(description='OCR结果对比工具')
  54. parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
  55. parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
  56. parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
  57. parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
  58. default='markdown', help='输出格式')
  59. parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
  60. parser.add_argument('--table-mode', choices=['standard', 'flow_list'],
  61. default='standard', help='表格比较模式')
  62. parser.add_argument('--similarity-algorithm',
  63. choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
  64. default='ratio', help='相似度算法')
  65. args = parser.parse_args()
  66. if args.file1 and args.file2:
  67. compare_ocr_results(
  68. args.file1,
  69. args.file2,
  70. args.output,
  71. args.format,
  72. args.ignore_images,
  73. args.table_mode,
  74. args.similarity_algorithm
  75. )
  76. else:
  77. # 测试流水表格对比
  78. import time
  79. result = compare_ocr_results(
  80. file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
  81. file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
  82. output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
  83. output_format='both',
  84. ignore_images=True,
  85. table_mode='flow_list', # 使用流水表格模式
  86. similarity_algorithm='ratio'
  87. )