1 month ago · b410809491
--- a/compare_ocr_results.py
+++ b/compare_ocr_results.py
@@ -455,7 +455,7 @@ class OCRResultComparator:
 
				     
			
 
				     def compare_cell_value(self, value1: str, value2: str, column_type: str, 
			
 
				                           column_name: str = '') -> Dict:
			
 
				-        """比较单元格值"""
			
 
				+        """比较单元格值 - 统一错误类型"""
			
 
				         result = {
			
 
				             'match': True,
			
 
				             'difference': None
			
@@ -477,16 +477,16 @@ class OCRResultComparator:
 
				                 if abs(num1 - num2) > 0.01:  # 允许0.01的误差
			
 
				                     result['match'] = False
			
 
				                     result['difference'] = {
			
 
				-                        'type': 'table_amount',
			
 
				+                        'type': 'table_amount',  # ✅ 统一类型
			
 
				                         'value1': value1,
			
 
				                         'value2': value2,
			
 
				                         'diff_amount': abs(num1 - num2),
			
 
				-                        'description': f'数字不一致: {value1} vs {value2}'
			
 
				+                        'description': f'金额不一致: {value1} vs {value2}'
			
 
				                     }
			
 
				             else:
			
 
				                 result['match'] = False
			
 
				                 result['difference'] = {
			
 
				-                    'type': 'table_amount_format_error',
			
 
				+                    'type': 'table_amount',  # ✅ 格式错误也算金额差异
			
 
				                     'value1': value1,
			
 
				                     'value2': value2,
			
 
				                     'description': f'数字格式错误: {value1} vs {value2}'
			
@@ -494,14 +494,13 @@ class OCRResultComparator:
 
				         
			
 
				         elif column_type == 'datetime':
			
 
				             # 日期时间比较
			
 
				-            # 提取日期时间部分进行比较
			
 
				             datetime1 = self.extract_datetime(v1)
			
 
				             datetime2 = self.extract_datetime(v2)
			
 
				             
			
 
				             if datetime1 != datetime2:
			
 
				                 result['match'] = False
			
 
				                 result['difference'] = {
			
 
				-                    'type': 'table_datetime_mismatch',
			
 
				+                    'type': 'table_datetime',  # ✅ 日期时间类型
			
 
				                     'value1': value1,
			
 
				                     'value2': value2,
			
 
				                     'description': f'日期时间不一致: {value1} vs {value2}'
			
@@ -513,7 +512,7 @@ class OCRResultComparator:
 
				             if similarity < self.similarity_threshold:
			
 
				                 result['match'] = False
			
 
				                 result['difference'] = {
			
 
				-                    'type': 'table_text_mismatch',
			
 
				+                    'type': 'table_text',  # ✅ 文本差异
			
 
				                     'value1': value1,
			
 
				                     'value2': value2,
			
 
				                     'similarity': similarity,
			
@@ -626,7 +625,7 @@ class OCRResultComparator:
 
				         
			
 
				         if header_row_idx1 != header_row_idx2:
			
 
				             differences.append({
			
 
				-                'type': 'table_header_position',  # ✅ 已经是 table_ 开头
			
 
				+                'type': 'table_header_position',
			
 
				                 'position': '表头位置',
			
 
				                 'file1_value': f'第{header_row_idx1 + 1}行',
			
 
				                 'file2_value': f'第{header_row_idx2 + 1}行',
			
@@ -646,9 +645,9 @@ class OCRResultComparator:
 
				                 # 复用compare_tables方法进行比对
			
 
				                 pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
			
 
				                 
			
 
				-                # ✅ 修改：统一类型为 table_pre_header
			
 
				+                # 修改：统一类型为 table_pre_header
			
 
				                 for diff in pre_header_diffs:
			
 
				-                    diff['type'] = 'table_pre_header'  # 改为 table_ 开头
			
 
				+                    diff['type'] = 'table_pre_header'
			
 
				                     diff['position'] = f"表头前{diff['position']}"
			
 
				                     diff['severity'] = 'medium'
			
 
				                     print(f"   ⚠️  {diff['position']}: {diff['description']}")
			
@@ -670,7 +669,7 @@ class OCRResultComparator:
 
				             for diff in header_result['differences']:
			
 
				                 print(f"   - {diff['description']}")
			
 
				                 differences.append({
			
 
				-                    'type': 'table_header_critical',  # ✅ 已经是 table_ 开头
			
 
				+                    'type': 'table_header_critical',
			
 
				                     'position': '表头',
			
 
				                     'file1_value': ', '.join(headers1),
			
 
				                     'file2_value': ', '.join(headers2),
			
@@ -706,35 +705,35 @@ class OCRResultComparator:
 
				             row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
			
 
				             
			
 
				             # 实际行号（加上表头行索引）
			
 
				-            actual_row_num1 = header_row_idx1 + row_idx + 2
			
 
				-            actual_row_num2 = header_row_idx2 + row_idx + 2
			
 
				+            actual_row_num = header_row_idx1 + row_idx + 2
			
 
				             
			
 
				             if not row1:
			
 
				                 differences.append({
			
 
				-                    'type': 'table_row_missing',  # ✅ 修改：改为 table_row_missing
			
 
				-                    'position': f'第{actual_row_num1}行',
			
 
				+                    'type': 'table_row_missing',
			
 
				+                    'position': f'第{actual_row_num}行',
			
 
				                     'file1_value': '',
			
 
				                     'file2_value': ', '.join(row2),
			
 
				-                    'description': f'文件1缺少第{actual_row_num1}行',
			
 
				-                    'severity': 'high'
			
 
				+                    'description': f'文件1缺少第{actual_row_num}行',
			
 
				+                    'severity': 'high',
			
 
				+                    'row_index': actual_row_num
			
 
				                 })
			
 
				                 continue
			
 
				             
			
 
				             if not row2:
			
 
				+                # ✅ 修改：整行缺失按单元格输出
			
 
				                 differences.append({
			
 
				-                    'type': 'table_row_missing',  # ✅ 修改：改为 table_row_missing
			
 
				-                    'position': f'第{actual_row_num2}行',
			
 
				+                    'type': 'table_row_missing',
			
 
				+                    'position': f'第{actual_row_num}行',
			
 
				                     'file1_value': ', '.join(row1),
			
 
				                     'file2_value': '',
			
 
				-                    'description': f'文件2缺少第{actual_row_num2}行',
			
 
				-                    'severity': 'high'
			
 
				+                    'description': f'文件2缺少第{actual_row_num}行',
			
 
				+                    'severity': 'high',
			
 
				+                    'row_index': actual_row_num
			
 
				                 })
			
 
				                 continue
			
 
				             
			
 
				-            # 逐列比较
			
 
				+            # ✅ 修改：逐列比较，每个单元格差异独立输出
			
 
				             max_cols = max(len(row1), len(row2))
			
 
				-            row_has_diff = False
			
 
				-            row_diffs = []
			
 
				             
			
 
				             for col_idx in range(max_cols):
			
 
				                 cell1 = row1[col_idx] if col_idx < len(row1) else ''
			
@@ -750,33 +749,25 @@ class OCRResultComparator:
 
				                 compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
			
 
				                 
			
 
				                 if not compare_result['match']:
			
 
				-                    row_has_diff = True
			
 
				+                    # ✅ 直接将单元格差异添加到differences列表
			
 
				                     diff_info = compare_result['difference']
			
 
				-                    row_diffs.append({
			
 
				-                        'column_index': col_idx,
			
 
				+                    
			
 
				+                    differences.append({
			
 
				+                        'type': diff_info['type'],  # 使用原始类型（table_amount, table_text等）
			
 
				+                        'position': f'第{actual_row_num}行第{col_idx + 1}列',
			
 
				+                        'file1_value': diff_info['value1'],
			
 
				+                        'file2_value': diff_info['value2'],
			
 
				+                        'description': diff_info['description'],
			
 
				+                        'severity': 'medium',
			
 
				+                        'row_index': actual_row_num,
			
 
				+                        'col_index': col_idx,
			
 
				                         'column_name': column_name,
			
 
				                         'column_type': column_type,
			
 
				-                        **diff_info
			
 
				+                        # 保留额外信息
			
 
				+                        **{k: v for k, v in diff_info.items() if k not in ['type', 'value1', 'value2', 'description']}
			
 
				                     })
			
 
				-            
			
 
				-            if row_has_diff:
			
 
				-                # 汇总该行的所有差异
			
 
				-                diff_columns = [f"{d['column_name']}(列{d['column_index'] + 1})" for d in row_diffs]
			
 
				-                differences.append({
			
 
				-                    'type': 'table_row_data',  # ✅ 修改：改为 table_row_data
			
 
				-                    'position': f'第{actual_row_num1}行',
			
 
				-                    'row_index': row_idx + 1,
			
 
				-                    'affected_columns': diff_columns,
			
 
				-                    'column_differences': row_diffs,
			
 
				-                    'file1_value': ', '.join(row1),
			
 
				-                    'file2_value': ', '.join(row2),
			
 
				-                    'description': f'表格第{actual_row_num1}行在以下列有差异: {", ".join(diff_columns)}',
			
 
				-                    'severity': 'medium'
			
 
				-                })
			
 
				-                
			
 
				-                print(f"   ⚠️  第{actual_row_num1}行有差异:")
			
 
				-                for diff in row_diffs:
			
 
				-                    print(f"      - {diff['column_name']}: {diff['description']}")
			
 
				+                    
			
 
				+                    print(f"   ⚠️  第{actual_row_num}行第{col_idx + 1}列({column_name}): {diff_info['description']}")
			
 
				         
			
 
				         print(f"\n✅ 流水表格对比完成，发现 {len(differences)} 个差异")
			
 
				         
			
@@ -839,24 +830,23 @@ class OCRResultComparator:
 
				         para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
			
 
				         all_differences.extend(para_diffs)
			
 
				         
			
 
				-        # # 生成unified diff报告
			
 
				-        # unified_diff_data = self.generate_unified_diff_report(
			
 
				-        #     paras1, paras2, file1_path, file2_path, 
			
 
				-        #     "./output/pre_validation/unified_diff_comparison"
			
 
				-        # )
			
 
				-
			
 
				-        # 统计信息
			
 
				+        # ✅ 改进统计信息 - 细化分类
			
 
				         stats = {
			
 
				             'total_differences': len(all_differences),
			
 
				             'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
			
 
				             'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
			
 
				             'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
			
 
				-            'high_severity': len([d for d in all_differences if d.get('severity') == 'high']),
			
 
				+            'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
			
 
				+            'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
			
 
				+            'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
			
 
				+            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
			
 
				+            'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
			
 
				+            'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
			
 
				+            'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
			
 
				             'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
			
 
				             'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
			
 
				         }
			
 
				         
			
 
				-        # 在返回结果中添加unified diff数据
			
 
				         result = {
			
 
				             'differences': all_differences,
			
 
				             'statistics': stats,
			
@@ -866,7 +856,6 @@ class OCRResultComparator:
 
				             'file2_paragraphs': len(paras2),
			
 
				             'file1_path': file1_path,
			
 
				             'file2_path': file2_path,
			
 
				-            # 'unified_diff': unified_diff_data  # 添加unified diff数据
			
 
				         }
			
 
				         
			
 
				         return result
			
@@ -908,7 +897,7 @@ class OCRResultComparator:
 
				             f.write("## 统计信息\n\n")
			
 
				             f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
			
 
				             f.write(f"- 表格差异: **{stats['table_differences']}**\n")
			
 
				-            f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
			
 
				+            f.write(f"- 其中表格金额差异: **{stats['amount_differences']}**\n")
			
 
				             f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
			
 
				             f.write(f"- 高严重度: **{stats['high_severity']}**\n")  # ✅ 新增
			
 
				             f.write(f"- 中严重度: **{stats['medium_severity']}**\n")  # ✅ 新增
			
@@ -1029,7 +1018,7 @@ def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "co
 
				         print(f"\n📊 对比完成！")
			
 
				         print(f"   总差异数: {result['statistics']['total_differences']}")
			
 
				         print(f"   表格差异: {result['statistics']['table_differences']}")
			
 
				-        print(f"   金额差异: {result['statistics']['amount_differences']}")
			
 
				+        print(f"   其中表格金额差异: {result['statistics']['amount_differences']}")
			
 
				         print(f"   段落差异: {result['statistics']['paragraph_differences']}")
			
 
				         
			
 
				         # 打印前几个重要差异