compare_ocr_results.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import argparse
  2. from typing import Dict
  3. # ✅ 兼容相对导入和绝对导入
  4. try:
  5. from .ocr_comparator import OCRResultComparator
  6. from .report_generator import ReportGenerator
  7. except ImportError:
  8. from ocr_comparator import OCRResultComparator
  9. from report_generator import ReportGenerator
  10. def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
  11. output_format: str = "markdown", ignore_images: bool = True,
  12. table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
  13. """
  14. 比较两个OCR结果文件
  15. Args:
  16. file1_path: 第一个OCR结果文件路径
  17. file2_path: 第二个OCR结果文件路径
  18. output_file: 输出文件名(不含扩展名)
  19. output_format: 输出格式 ('json', 'markdown', 'both')
  20. ignore_images: 是否忽略图片内容
  21. table_mode: 表格比较模式 ('standard', 'flow_list')
  22. similarity_algorithm: 相似度算法 ('ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio')
  23. """
  24. comparator = OCRResultComparator()
  25. comparator.table_comparison_mode = table_mode
  26. print("🔍 开始对比OCR结果...")
  27. print(f"📄 文件1: {file1_path}")
  28. print(f"📄 文件2: {file2_path}")
  29. print(f"📊 表格模式: {table_mode}")
  30. print(f"🔧 相似度算法: {similarity_algorithm}")
  31. try:
  32. # 执行比较
  33. result = comparator.compare_files(file1_path, file2_path)
  34. # 生成报告
  35. print(f"\n📝 生成报告...")
  36. ReportGenerator.generate_report(result, output_file, output_format)
  37. print(f"\n✅ 对比完成!")
  38. return result
  39. except Exception as e:
  40. print(f"\n❌ 对比过程中出错: {str(e)}")
  41. import traceback
  42. traceback.print_exc()
  43. raise
  44. if __name__ == "__main__":
  45. parser = argparse.ArgumentParser(description='OCR结果对比工具')
  46. parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
  47. parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
  48. parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
  49. parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
  50. default='markdown', help='输出格式')
  51. parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
  52. parser.add_argument('--table-mode', choices=['standard', 'flow_list'],
  53. default='standard', help='表格比较模式')
  54. parser.add_argument('--similarity-algorithm',
  55. choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
  56. default='ratio', help='相似度算法')
  57. args = parser.parse_args()
  58. if args.file1 and args.file2:
  59. compare_ocr_results(
  60. args.file1,
  61. args.file2,
  62. args.output,
  63. args.format,
  64. args.ignore_images,
  65. args.table_mode,
  66. args.similarity_algorithm
  67. )
  68. else:
  69. # 测试流水表格对比
  70. import time
  71. result = compare_ocr_results(
  72. file1_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/dotsocr_vllm_results_cell_bbox/B用户_扫描流水_page_008.md',
  73. file2_path='/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_008.md',
  74. output_file=f'/Users/zhch158/workspace/repository.git/ocr_verify/output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
  75. output_format='both',
  76. ignore_images=True,
  77. table_mode='flow_list', # 使用流水表格模式
  78. similarity_algorithm='ratio'
  79. )