Bläddra i källkod

feat: 优化表头比较逻辑,新增列数和列类型一致性检查,调整差异输出信息

zhch158_admin 1 månad sedan
förälder
incheckning
b35a5f4c94
1 ändrade filer med 65 tillägg och 18 borttagningar
  1. 65 18
      compare_ocr_results.py

+ 65 - 18
compare_ocr_results.py

@@ -418,7 +418,7 @@ class OCRResultComparator:
         if len(headers1) != len(headers2):
             result['match'] = False
             result['differences'].append({
-                'type': 'header_count',
+                'type': 'table_header_critical',
                 'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
                 'severity': 'critical'
             })
@@ -440,13 +440,13 @@ class OCRResultComparator:
             if similarity < self.header_similarity_threshold:
                 result['match'] = False
                 result['differences'].append({
-                    'type': 'header_mismatch',
+                    'type': 'table_header_mismatch',
                     'column_index': i,
                     'header1': h1,
                     'header2': h2,
                     'similarity': similarity,
                     'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
-                    'severity': 'critical'
+                    'severity': 'medium' if similarity < 50 else 'high'
                 })
             else:
                 result['column_mapping'][i] = i  # 建立列映射
@@ -664,24 +664,30 @@ class OCRResultComparator:
         
         header_result = self.compare_table_headers(headers1, headers2)
         
+        # ✅ 新增:检查列数是否一致
+        column_count_match = len(headers1) == len(headers2)
         if not header_result['match']:
-            print(f"\n❌ 表头不匹配,严重错误!")
+            print(f"\n⚠️  表头文字存在差异")
             for diff in header_result['differences']:
                 print(f"   - {diff['description']}")
                 differences.append({
-                    'type': 'table_header_critical',
+                    'type': diff.get('type', 'table_header_mismatch'),  # ✅ 改为 mismatch 而非 critical
                     'position': '表头',
-                    'file1_value': ', '.join(headers1),
-                    'file2_value': ', '.join(headers2),
+                    'file1_value': diff.get('header1', ''),
+                    'file2_value': diff.get('header2', ''),
                     'description': diff['description'],
-                    'severity': 'critical'
+                    'severity': diff.get('severity', 'high'),
                 })
-            return differences
-        
-        print(f"✅ 表头匹配成功")
+                if diff.get('severity', 'high') == 'critical':
+                    return differences
+        else:
+            print(f"✅ 表头匹配成功")
         
         # 第四步:检测列类型
-        column_types = []
+        column_types1 = []
+        column_types2 = []
+        
+        # 检测文件1的列类型
         for col_idx in range(len(headers1)):
             col_values1 = [
                 row[col_idx] 
@@ -689,8 +695,40 @@ class OCRResultComparator:
                 if col_idx < len(row)
             ]
             col_type = self.detect_column_type(col_values1)
-            column_types.append(col_type)
-            print(f"   列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
+            column_types1.append(col_type)
+            print(f"   文件1列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
+        
+        # 检测文件2的列类型
+        for col_idx in range(len(headers2)):
+            col_values2 = [
+                row[col_idx] 
+                for row in table2[header_row_idx2 + 1:] 
+                if col_idx < len(row)
+            ]
+            col_type = self.detect_column_type(col_values2)
+            column_types2.append(col_type)
+            print(f"   文件2列 {col_idx + 1} ({headers2[col_idx]}): {col_type}")
+        
+        # ✅ 新增:检查列类型是否一致
+        column_types_match = column_types1 == column_types2
+        
+        if not column_types_match:
+            print(f"\n⚠️  列类型存在差异,不再比较单元格内容...")
+            for col_idx in range(min(len(column_types1), len(column_types2))):
+                if column_types1[col_idx] != column_types2[col_idx]:
+                    differences.append({
+                        'type': 'table_header_critical',
+                        'position': f'第{col_idx + 1}列',
+                        'file1_value': f'{headers1[col_idx]} ({column_types1[col_idx]})',
+                        'file2_value': f'{headers2[col_idx]} ({column_types2[col_idx]})',
+                        'description': f'列类型不一致: {column_types1[col_idx]} vs {column_types2[col_idx]}',
+                        'severity': 'critical',
+                        'column_index': col_idx
+                    })
+            return differences
+        
+        # Both column-type lists are equal here (any mismatch returned above), so file 1's list is used for both files
+        column_types = column_types1  # 默认使用文件1的列类型
         
         # 第五步:逐行比较数据
         data_rows1 = table1[header_row_idx1 + 1:]
@@ -732,7 +770,7 @@ class OCRResultComparator:
                 })
                 continue
             
-            # ✅ 修改:逐列比较,每个单元格差异独立输出
+            # 逐列比较,每个单元格差异独立输出
             max_cols = max(len(row1), len(row2))
             
             for col_idx in range(max_cols):
@@ -743,8 +781,16 @@ class OCRResultComparator:
                 if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
                     continue
                 
+                # ✅ 使用对应的列类型
                 column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
-                column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
+                
+                # ✅ 获取列名(如果表头不匹配,显示两个表头)
+                if header_result['match']:
+                    column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
+                else:
+                    col_name1 = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
+                    col_name2 = headers2[col_idx] if col_idx < len(headers2) else f'列{col_idx + 1}'
+                    column_name = f"{col_name1}/{col_name2}"
                 
                 compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
                 
@@ -839,7 +885,8 @@ class OCRResultComparator:
             'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
             'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
             'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
-            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
+            'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']),  # ✅ 新增
+            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),  # ✅ 新增
             'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
             'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
             'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
@@ -965,7 +1012,7 @@ class OCRResultComparator:
 
 def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
                        output_format: str = "markdown", ignore_images: bool = True,
-                       table_mode: str = 'standard', similarity_algorithm: str = 'ratio'):
+                       table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
     """
     比较两个OCR结果文件