Browse Source

fix: 统一比较结果中的错误类型,细化统计信息,改进输出格式

zhch158_admin 1 month ago
parent
commit
b410809491
1 changed file with 48 additions and 59 deletions
  1. 48 59
      compare_ocr_results.py

+ 48 - 59
compare_ocr_results.py

@@ -455,7 +455,7 @@ class OCRResultComparator:
     
     def compare_cell_value(self, value1: str, value2: str, column_type: str, 
                           column_name: str = '') -> Dict:
-        """比较单元格值"""
+        """比较单元格值 - 统一错误类型"""
         result = {
             'match': True,
             'difference': None
@@ -477,16 +477,16 @@ class OCRResultComparator:
                 if abs(num1 - num2) > 0.01:  # 允许0.01的误差
                     result['match'] = False
                     result['difference'] = {
-                        'type': 'table_amount',
+                        'type': 'table_amount',  # ✅ 统一类型
                         'value1': value1,
                         'value2': value2,
                         'diff_amount': abs(num1 - num2),
-                        'description': f'数字不一致: {value1} vs {value2}'
+                        'description': f'金额不一致: {value1} vs {value2}'
                     }
             else:
                 result['match'] = False
                 result['difference'] = {
-                    'type': 'table_amount_format_error',
+                    'type': 'table_amount',  # ✅ 格式错误也算金额差异
                     'value1': value1,
                     'value2': value2,
                     'description': f'数字格式错误: {value1} vs {value2}'
@@ -494,14 +494,13 @@ class OCRResultComparator:
         
         elif column_type == 'datetime':
             # 日期时间比较
-            # 提取日期时间部分进行比较
             datetime1 = self.extract_datetime(v1)
             datetime2 = self.extract_datetime(v2)
             
             if datetime1 != datetime2:
                 result['match'] = False
                 result['difference'] = {
-                    'type': 'table_datetime_mismatch',
+                    'type': 'table_datetime',  # ✅ 日期时间类型
                     'value1': value1,
                     'value2': value2,
                     'description': f'日期时间不一致: {value1} vs {value2}'
@@ -513,7 +512,7 @@ class OCRResultComparator:
             if similarity < self.similarity_threshold:
                 result['match'] = False
                 result['difference'] = {
-                    'type': 'table_text_mismatch',
+                    'type': 'table_text',  # ✅ 文本差异
                     'value1': value1,
                     'value2': value2,
                     'similarity': similarity,
@@ -626,7 +625,7 @@ class OCRResultComparator:
         
         if header_row_idx1 != header_row_idx2:
             differences.append({
-                'type': 'table_header_position',  # ✅ 已经是 table_ 开头
+                'type': 'table_header_position',
                 'position': '表头位置',
                 'file1_value': f'第{header_row_idx1 + 1}行',
                 'file2_value': f'第{header_row_idx2 + 1}行',
@@ -646,9 +645,9 @@ class OCRResultComparator:
                 # 复用compare_tables方法进行比对
                 pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
                 
-                # 修改:统一类型为 table_pre_header
+                # 修改:统一类型为 table_pre_header
                 for diff in pre_header_diffs:
-                    diff['type'] = 'table_pre_header'  # 改为 table_ 开头
+                    diff['type'] = 'table_pre_header'
                     diff['position'] = f"表头前{diff['position']}"
                     diff['severity'] = 'medium'
                     print(f"   ⚠️  {diff['position']}: {diff['description']}")
@@ -670,7 +669,7 @@ class OCRResultComparator:
             for diff in header_result['differences']:
                 print(f"   - {diff['description']}")
                 differences.append({
-                    'type': 'table_header_critical',  # ✅ 已经是 table_ 开头
+                    'type': 'table_header_critical',
                     'position': '表头',
                     'file1_value': ', '.join(headers1),
                     'file2_value': ', '.join(headers2),
@@ -706,35 +705,35 @@ class OCRResultComparator:
             row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
             
             # 实际行号(加上表头行索引)
-            actual_row_num1 = header_row_idx1 + row_idx + 2
-            actual_row_num2 = header_row_idx2 + row_idx + 2
+            actual_row_num = header_row_idx1 + row_idx + 2
             
             if not row1:
                 differences.append({
-                    'type': 'table_row_missing',  # ✅ 修改:改为 table_row_missing
-                    'position': f'第{actual_row_num1}行',
+                    'type': 'table_row_missing',
+                    'position': f'第{actual_row_num}行',
                     'file1_value': '',
                     'file2_value': ', '.join(row2),
-                    'description': f'文件1缺少第{actual_row_num1}行',
-                    'severity': 'high'
+                    'description': f'文件1缺少第{actual_row_num}行',
+                    'severity': 'high',
+                    'row_index': actual_row_num
                 })
                 continue
             
             if not row2:
+                # ✅ 修改:整行缺失作为一条 table_row_missing 差异输出(不按单元格拆分)
                 differences.append({
-                    'type': 'table_row_missing',  # ✅ 修改:改为 table_row_missing
-                    'position': f'第{actual_row_num2}行',
+                    'type': 'table_row_missing',
+                    'position': f'第{actual_row_num}行',
                     'file1_value': ', '.join(row1),
                     'file2_value': '',
-                    'description': f'文件2缺少第{actual_row_num2}行',
-                    'severity': 'high'
+                    'description': f'文件2缺少第{actual_row_num}行',
+                    'severity': 'high',
+                    'row_index': actual_row_num
                 })
                 continue
             
-            # 逐列比较
+            # ✅ 修改:逐列比较,每个单元格差异独立输出
             max_cols = max(len(row1), len(row2))
-            row_has_diff = False
-            row_diffs = []
             
             for col_idx in range(max_cols):
                 cell1 = row1[col_idx] if col_idx < len(row1) else ''
@@ -750,33 +749,25 @@ class OCRResultComparator:
                 compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
                 
                 if not compare_result['match']:
-                    row_has_diff = True
+                    # ✅ 直接将单元格差异添加到differences列表
                     diff_info = compare_result['difference']
-                    row_diffs.append({
-                        'column_index': col_idx,
+                    
+                    differences.append({
+                        'type': diff_info['type'],  # 使用原始类型(table_amount, table_text等)
+                        'position': f'第{actual_row_num}行第{col_idx + 1}列',
+                        'file1_value': diff_info['value1'],
+                        'file2_value': diff_info['value2'],
+                        'description': diff_info['description'],
+                        'severity': 'medium',
+                        'row_index': actual_row_num,
+                        'col_index': col_idx,
                         'column_name': column_name,
                         'column_type': column_type,
-                        **diff_info
+                        # 保留额外信息
+                        **{k: v for k, v in diff_info.items() if k not in ['type', 'value1', 'value2', 'description']}
                     })
-            
-            if row_has_diff:
-                # 汇总该行的所有差异
-                diff_columns = [f"{d['column_name']}(列{d['column_index'] + 1})" for d in row_diffs]
-                differences.append({
-                    'type': 'table_row_data',  # ✅ 修改:改为 table_row_data
-                    'position': f'第{actual_row_num1}行',
-                    'row_index': row_idx + 1,
-                    'affected_columns': diff_columns,
-                    'column_differences': row_diffs,
-                    'file1_value': ', '.join(row1),
-                    'file2_value': ', '.join(row2),
-                    'description': f'表格第{actual_row_num1}行在以下列有差异: {", ".join(diff_columns)}',
-                    'severity': 'medium'
-                })
-                
-                print(f"   ⚠️  第{actual_row_num1}行有差异:")
-                for diff in row_diffs:
-                    print(f"      - {diff['column_name']}: {diff['description']}")
+                    
+                    print(f"   ⚠️  第{actual_row_num}行第{col_idx + 1}列({column_name}): {diff_info['description']}")
         
         print(f"\n✅ 流水表格对比完成,发现 {len(differences)} 个差异")
         
@@ -839,24 +830,23 @@ class OCRResultComparator:
         para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
         all_differences.extend(para_diffs)
         
-        # # 生成unified diff报告
-        # unified_diff_data = self.generate_unified_diff_report(
-        #     paras1, paras2, file1_path, file2_path, 
-        #     "./output/pre_validation/unified_diff_comparison"
-        # )
-
-        # 统计信息
+        # ✅ 改进统计信息 - 细化分类
         stats = {
             'total_differences': len(all_differences),
             'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
             'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
             'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
-            'high_severity': len([d for d in all_differences if d.get('severity') == 'high']),
+            'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
+            'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
+            'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
+            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
+            'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
+            'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
+            'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
             'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
             'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
         }
         
-        # 在返回结果中添加unified diff数据
         result = {
             'differences': all_differences,
             'statistics': stats,
@@ -866,7 +856,6 @@ class OCRResultComparator:
             'file2_paragraphs': len(paras2),
             'file1_path': file1_path,
             'file2_path': file2_path,
-            # 'unified_diff': unified_diff_data  # 添加unified diff数据
         }
         
         return result
@@ -908,7 +897,7 @@ class OCRResultComparator:
             f.write("## 统计信息\n\n")
             f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
             f.write(f"- 表格差异: **{stats['table_differences']}**\n")
-            f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
+            f.write(f"- 其中表格金额差异: **{stats['amount_differences']}**\n")
             f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
             f.write(f"- 高严重度: **{stats['high_severity']}**\n")  # ✅ 新增
             f.write(f"- 中严重度: **{stats['medium_severity']}**\n")  # ✅ 新增
@@ -1029,7 +1018,7 @@ def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "co
         print(f"\n📊 对比完成!")
         print(f"   总差异数: {result['statistics']['total_differences']}")
         print(f"   表格差异: {result['statistics']['table_differences']}")
-        print(f"   金额差异: {result['statistics']['amount_differences']}")
+        print(f"   其中表格金额差异: {result['statistics']['amount_differences']}")
         print(f"   段落差异: {result['statistics']['paragraph_differences']}")
         
         # 打印前几个重要差异