# ocr_comparator.py
import os
from collections import Counter
from datetime import datetime
from typing import Dict

# Prefer package-relative imports; fall back to plain module imports when the
# relative form raises ImportError (e.g. the file is run outside its package).
try:
    from .content_extractor import ContentExtractor
    from .table_comparator import TableComparator
    from .paragraph_comparator import ParagraphComparator
except ImportError:
    from content_extractor import ContentExtractor
    from table_comparator import TableComparator
    from paragraph_comparator import ParagraphComparator
  12. class OCRResultComparator:
  13. """OCR结果比较器主类"""
  14. def __init__(self):
  15. self.content_extractor = ContentExtractor()
  16. self.table_comparator = TableComparator()
  17. self.paragraph_comparator = ParagraphComparator()
  18. self.differences = []
  19. self.paragraph_match_threshold = 80
  20. self.content_similarity_threshold = 95
  21. self.max_paragraph_window = 6
  22. self.table_comparison_mode = 'standard'
  23. self.header_similarity_threshold = 90
  24. def compare_files(self, file1_path: str, file2_path: str) -> Dict:
  25. """比较两个OCR结果文件"""
  26. print(f"\n📖 读取文件...")
  27. # 读取文件内容
  28. with open(file1_path, 'r', encoding='utf-8') as f:
  29. content1 = f.read()
  30. with open(file2_path, 'r', encoding='utf-8') as f:
  31. content2 = f.read()
  32. print(f"✅ 文件读取完成")
  33. print(f" 文件1大小: {len(content1)} 字符")
  34. print(f" 文件2大小: {len(content2)} 字符")
  35. # 提取表格
  36. print(f"\n📊 提取表格...")
  37. tables1 = self.content_extractor.extract_table_data(content1)
  38. tables2 = self.content_extractor.extract_table_data(content2)
  39. print(f" 文件1表格数: {len(tables1)}")
  40. print(f" 文件2表格数: {len(tables2)}")
  41. # 提取段落
  42. print(f"\n📝 提取段落...")
  43. paragraphs1 = self.content_extractor.extract_paragraphs(content1)
  44. paragraphs2 = self.content_extractor.extract_paragraphs(content2)
  45. print(f" 文件1段落数: {len(paragraphs1)}")
  46. print(f" 文件2段落数: {len(paragraphs2)}")
  47. # 比较段落
  48. print(f"\n🔍 开始段落对比...")
  49. paragraph_differences = self.paragraph_comparator.compare_paragraphs(
  50. paragraphs1, paragraphs2
  51. )
  52. print(f"✅ 段落对比完成,发现 {len(paragraph_differences)} 个差异")
  53. # 初始化所有差异列表
  54. all_differences = []
  55. all_differences.extend(paragraph_differences)
  56. # ✅ 智能表格匹配与比较
  57. print(f"\n🔍 开始表格智能匹配...")
  58. if tables1 and tables2:
  59. # 找到匹配的表格对
  60. table_matches = self.table_comparator.find_matching_tables(tables1, tables2)
  61. if not table_matches:
  62. print(f" ⚠️ 未找到匹配的表格")
  63. all_differences.append({
  64. 'type': 'table_structure',
  65. 'position': '表格匹配',
  66. 'file1_value': f'{len(tables1)}个表格',
  67. 'file2_value': f'{len(tables2)}个表格',
  68. 'description': '未找到可匹配的表格',
  69. 'severity': 'high'
  70. })
  71. else:
  72. # 比较每对匹配的表格
  73. for idx1, idx2, similarity in table_matches:
  74. print(f"\n 📋 对比匹配的表格: 表格{idx1+1} vs 表格{idx2+1}")
  75. if self.table_comparison_mode == 'flow_list':
  76. table_diffs = self.table_comparator.compare_table_flow_list(
  77. tables1[idx1], tables2[idx2]
  78. )
  79. else:
  80. table_diffs = self.table_comparator.compare_tables(
  81. tables1[idx1], tables2[idx2]
  82. )
  83. # 为每个差异添加表格标识
  84. for diff in table_diffs:
  85. diff['table_pair'] = f'表格{idx1+1}↔表格{idx2+1}'
  86. diff['table_similarity'] = similarity
  87. all_differences.extend(table_diffs)
  88. print(f" 发现 {len(table_diffs)} 个差异")
  89. # 检查未匹配的表格
  90. matched_tables1 = {m[0] for m in table_matches}
  91. matched_tables2 = {m[1] for m in table_matches}
  92. for i in range(len(tables1)):
  93. if i not in matched_tables1:
  94. all_differences.append({
  95. 'type': 'table_unmatched',
  96. 'position': f'文件1表格{i+1}',
  97. 'file1_value': f'表格{i+1} (无匹配)',
  98. 'file2_value': '',
  99. 'description': f'文件1的表格{i+1}在文件2中无匹配表格',
  100. 'severity': 'medium'
  101. })
  102. for j in range(len(tables2)):
  103. if j not in matched_tables2:
  104. all_differences.append({
  105. 'type': 'table_unmatched',
  106. 'position': f'文件2表格{j+1}',
  107. 'file1_value': '',
  108. 'file2_value': f'表格{j+1} (无匹配)',
  109. 'description': f'文件2的表格{j+1}在文件1中无匹配表格',
  110. 'severity': 'medium'
  111. })
  112. elif tables1 and not tables2:
  113. all_differences.append({
  114. 'type': 'table_structure',
  115. 'position': '表格结构',
  116. 'file1_value': f'包含{len(tables1)}个表格',
  117. 'file2_value': '无表格',
  118. 'description': '文件1包含表格但文件2无表格',
  119. 'severity': 'high'
  120. })
  121. elif not tables1 and tables2:
  122. all_differences.append({
  123. 'type': 'table_structure',
  124. 'position': '表格结构',
  125. 'file1_value': '无表格',
  126. 'file2_value': f'包含{len(tables2)}个表格',
  127. 'description': '文件2包含表格但文件1无表格',
  128. 'severity': 'high'
  129. })
  130. print(f"\n✅ 对比完成")
  131. # ✅ 统计差异 - 细化分类(与原版本保持一致)
  132. stats = {
  133. 'total_differences': len(all_differences),
  134. 'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
  135. 'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
  136. 'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
  137. 'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
  138. 'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
  139. 'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
  140. 'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']),
  141. 'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
  142. 'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
  143. 'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
  144. 'high_severity': len([d for d in all_differences if d.get('severity') in ['critical', 'high']]),
  145. 'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
  146. 'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
  147. }
  148. # ✅ 构建返回结果 - 与原版本结构保持完全一致
  149. result = {
  150. 'differences': all_differences, # ✅ 原版本使用 differences 而非 paragraph_differences
  151. 'statistics': stats,
  152. 'file1_tables': len(tables1),
  153. 'file2_tables': len(tables2),
  154. 'file1_paragraphs': len(paragraphs1),
  155. 'file2_paragraphs': len(paragraphs2),
  156. 'file1_path': file1_path,
  157. 'file2_path': file2_path,
  158. 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S') # ✅ 添加时间戳
  159. }
  160. print(f"\n" + "="*60)
  161. print(f"📊 对比结果汇总")
  162. print(f"="*60)
  163. print(f"总差异数: {result['statistics']['total_differences']}")
  164. print(f" - 段落差异: {result['statistics']['paragraph_differences']}")
  165. print(f" - 表格差异: {result['statistics']['table_differences']}")
  166. print(f" - 金额: {result['statistics']['amount_differences']}")
  167. print(f" - 日期: {result['statistics']['datetime_differences']}")
  168. print(f" - 文本: {result['statistics']['text_differences']}")
  169. print(f"\n严重级别分布:")
  170. print(f" 🔴 高: {result['statistics']['high_severity']}")
  171. print(f" 🟡 中: {result['statistics']['medium_severity']}")
  172. print(f" 🟢 低: {result['statistics']['low_severity']}")
  173. print(f"="*60)
  174. return result