1 månad sedan · b35a5f4c94
--- a/compare_ocr_results.py
+++ b/compare_ocr_results.py
@@ -418,7 +418,7 @@ class OCRResultComparator:
 
				         if len(headers1) != len(headers2):
			
 
				             result['match'] = False
			
 
				             result['differences'].append({
			
 
				-                'type': 'header_count',
			
 
				+                'type': 'table_header_critical',
			
 
				                 'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
			
 
				                 'severity': 'critical'
			
 
				             })
			
@@ -440,13 +440,13 @@ class OCRResultComparator:
 
				             if similarity < self.header_similarity_threshold:
			
 
				                 result['match'] = False
			
 
				                 result['differences'].append({
			
 
				-                    'type': 'header_mismatch',
			
 
				+                    'type': 'table_header_mismatch',
			
 
				                     'column_index': i,
			
 
				                     'header1': h1,
			
 
				                     'header2': h2,
			
 
				                     'similarity': similarity,
			
 
				                     'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
			
 
				-                    'severity': 'critical'
			
 
				+                    'severity': 'medium' if similarity < 50 else 'high'
			
 
				                 })
			
 
				             else:
			
 
				                 result['column_mapping'][i] = i  # 建立列映射
			
@@ -664,24 +664,30 @@ class OCRResultComparator:
 
				         
			
 
				         header_result = self.compare_table_headers(headers1, headers2)
			
 
				         
			
 
				+        # ✅ 新增：检查列数是否一致
			
 
				+        column_count_match = len(headers1) == len(headers2)
			
 
				         if not header_result['match']:
			
 
				-            print(f"\n❌ 表头不匹配，严重错误！")
			
 
				+            print(f"\n⚠️  表头文字存在差异")
			
 
				             for diff in header_result['differences']:
			
 
				                 print(f"   - {diff['description']}")
			
 
				                 differences.append({
			
 
				-                    'type': 'table_header_critical',
			
 
				+                    'type': diff.get('type', 'table_header_mismatch'),  # ✅ 改为 mismatch 而非 critical
			
 
				                     'position': '表头',
			
 
				-                    'file1_value': ', '.join(headers1),
			
 
				-                    'file2_value': ', '.join(headers2),
			
 
				+                    'file1_value': diff.get('header1', ''),
			
 
				+                    'file2_value': diff.get('header2', ''),
			
 
				                     'description': diff['description'],
			
 
				-                    'severity': 'critical'
			
 
				+                    'severity': diff.get('severity', 'high'),
			
 
				                 })
			
 
				-            return differences
			
 
				-        
			
 
				-        print(f"✅ 表头匹配成功")
			
 
				+                if diff.get('severity', 'high') == 'critical':
			
 
				+                    return differences
			
 
				+        else:
			
 
				+            print(f"✅ 表头匹配成功")
			
 
				         
			
 
				         # 第四步：检测列类型
			
 
				-        column_types = []
			
 
				+        column_types1 = []
			
 
				+        column_types2 = []
			
 
				+        
			
 
				+        # 检测文件1的列类型
			
 
				         for col_idx in range(len(headers1)):
			
 
				             col_values1 = [
			
 
				                 row[col_idx] 
			
@@ -689,8 +695,40 @@ class OCRResultComparator:
 
				                 if col_idx < len(row)
			
 
				             ]
			
 
				             col_type = self.detect_column_type(col_values1)
			
 
				-            column_types.append(col_type)
			
 
				-            print(f"   列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
			
 
				+            column_types1.append(col_type)
			
 
				+            print(f"   文件1列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
			
 
				+        
			
 
				+        # 检测文件2的列类型
			
 
				+        for col_idx in range(len(headers2)):
			
 
				+            col_values2 = [
			
 
				+                row[col_idx] 
			
 
				+                for row in table2[header_row_idx2 + 1:] 
			
 
				+                if col_idx < len(row)
			
 
				+            ]
			
 
				+            col_type = self.detect_column_type(col_values2)
			
 
				+            column_types2.append(col_type)
			
 
				+            print(f"   文件2列 {col_idx + 1} ({headers2[col_idx]}): {col_type}")
			
 
				+        
			
 
				+        # ✅ 新增：检查列类型是否一致
			
 
				+        column_types_match = column_types1 == column_types2
			
 
				+        
			
 
				+        if not column_types_match:
			
 
				+            print(f"\n⚠️  列类型存在差异，不再比较单元格内容...")
			
 
				+            for col_idx in range(min(len(column_types1), len(column_types2))):
			
 
				+                if column_types1[col_idx] != column_types2[col_idx]:
			
 
				+                    differences.append({
			
 
				+                        'type': 'table_header_critical',
			
 
				+                        'position': f'第{col_idx + 1}列',
			
 
				+                        'file1_value': f'{headers1[col_idx]} ({column_types1[col_idx]})',
			
 
				+                        'file2_value': f'{headers2[col_idx]} ({column_types2[col_idx]})',
			
 
				+                        'description': f'列类型不一致: {column_types1[col_idx]} vs {column_types2[col_idx]}',
			
 
				+                        'severity': 'critical',
			
 
				+                        'column_index': col_idx
			
 
				+                    })
			
 
				+            return differences
			
 
				+        
			
 
				+        # ✅ 使用两个文件中更准确的列类型（优先使用数据更多的文件）
			
 
				+        column_types = column_types1  # 默认使用文件1的列类型
			
 
				         
			
 
				         # 第五步：逐行比较数据
			
 
				         data_rows1 = table1[header_row_idx1 + 1:]
			
@@ -732,7 +770,7 @@ class OCRResultComparator:
 
				                 })
			
 
				                 continue
			
 
				             
			
 
				-            # ✅ 修改：逐列比较，每个单元格差异独立输出
			
 
				+            # 逐列比较，每个单元格差异独立输出
			
 
				             max_cols = max(len(row1), len(row2))
			
 
				             
			
 
				             for col_idx in range(max_cols):
			
@@ -743,8 +781,16 @@ class OCRResultComparator:
 
				                 if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
			
 
				                     continue
			
 
				                 
			
 
				+                # ✅ 使用对应的列类型
			
 
				                 column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
			
 
				-                column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
			
 
				+                
			
 
				+                # ✅ 获取列名（如果表头不匹配，显示两个表头）
			
 
				+                if header_result['match']:
			
 
				+                    column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
			
 
				+                else:
			
 
				+                    col_name1 = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
			
 
				+                    col_name2 = headers2[col_idx] if col_idx < len(headers2) else f'列{col_idx + 1}'
			
 
				+                    column_name = f"{col_name1}/{col_name2}"
			
 
				                 
			
 
				                 compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
			
 
				                 
			
@@ -839,7 +885,8 @@ class OCRResultComparator:
 
				             'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
			
 
				             'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
			
 
				             'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
			
 
				-            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
			
 
				+            'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']),  # ✅ 新增
			
 
				+            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),  # ✅ 新增
			
 
				             'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
			
 
				             'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
			
 
				             'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
			
@@ -965,7 +1012,7 @@ class OCRResultComparator:
 
				 
			
 
				 def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
			
 
				                        output_format: str = "markdown", ignore_images: bool = True,
			
 
				-                       table_mode: str = 'standard', similarity_algorithm: str = 'ratio'):
			
 
				+                       table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
			
 
				     """
			
 
				     比较两个OCR结果文件