Răsfoiți Sursa

feat: 增强表格比较功能,支持智能匹配与未匹配表格的详细报告

zhch158_admin 1 lună în urmă
părinte
commit
e9b1502957
1 fișier modificat, cu 65 de adăugiri și 13 ștergeri
  1. 65 13
      comparator/ocr_comparator.py

+ 65 - 13
comparator/ocr_comparator.py

@@ -1,7 +1,7 @@
 import os
 from typing import Dict
 from datetime import datetime
-# ✅ 兼容相对导入和绝对导入
+
 try:
     from .content_extractor import ContentExtractor
     from .table_comparator import TableComparator
@@ -11,6 +11,7 @@ except ImportError:
     from table_comparator import TableComparator
     from paragraph_comparator import ParagraphComparator
 
+
 class OCRResultComparator:
     """OCR结果比较器主类"""
     
@@ -62,24 +63,75 @@ class OCRResultComparator:
         )
         print(f"✅ 段落对比完成,发现 {len(paragraph_differences)} 个差异")
         
-        # 初始化所有差异列表 - 用于兼容原版本返回结构
+        # 初始化所有差异列表
         all_differences = []
         all_differences.extend(paragraph_differences)
         
-        # 比较表格
-        print(f"\n🔍 开始表格对比...")
+        # ✅ 智能表格匹配与比较
+        print(f"\n🔍 开始表格智能匹配...")
         
-        # ✅ 处理表格比较 - 支持多表格
         if tables1 and tables2:
-            # 根据模式选择比较方法
-            if self.table_comparison_mode == 'flow_list':
-                table_diffs = self.table_comparator.compare_table_flow_list(tables1[0], tables2[0])
-            else:
-                table_diffs = self.table_comparator.compare_tables(tables1[0], tables2[0])
-            
-            all_differences.extend(table_diffs)
-            print(f"✅ 表格对比完成,发现 {len(table_diffs)} 个差异")
+            # 找到匹配的表格对
+            table_matches = self.table_comparator.find_matching_tables(tables1, tables2)
             
+            if not table_matches:
+                print(f"   ⚠️  未找到匹配的表格")
+                all_differences.append({
+                    'type': 'table_structure',
+                    'position': '表格匹配',
+                    'file1_value': f'{len(tables1)}个表格',
+                    'file2_value': f'{len(tables2)}个表格',
+                    'description': '未找到可匹配的表格',
+                    'severity': 'high'
+                })
+            else:
+                # 比较每对匹配的表格
+                for idx1, idx2, similarity in table_matches:
+                    print(f"\n   📋 对比匹配的表格: 表格{idx1+1} vs 表格{idx2+1}")
+                    
+                    if self.table_comparison_mode == 'flow_list':
+                        table_diffs = self.table_comparator.compare_table_flow_list(
+                            tables1[idx1], tables2[idx2]
+                        )
+                    else:
+                        table_diffs = self.table_comparator.compare_tables(
+                            tables1[idx1], tables2[idx2]
+                        )
+                    
+                    # 为每个差异添加表格标识
+                    for diff in table_diffs:
+                        diff['table_pair'] = f'表格{idx1+1}↔表格{idx2+1}'
+                        diff['table_similarity'] = similarity
+                    
+                    all_differences.extend(table_diffs)
+                    print(f"      发现 {len(table_diffs)} 个差异")
+                
+                # 检查未匹配的表格
+                matched_tables1 = {m[0] for m in table_matches}
+                matched_tables2 = {m[1] for m in table_matches}
+                
+                for i in range(len(tables1)):
+                    if i not in matched_tables1:
+                        all_differences.append({
+                            'type': 'table_unmatched',
+                            'position': f'文件1表格{i+1}',
+                            'file1_value': f'表格{i+1} (无匹配)',
+                            'file2_value': '',
+                            'description': f'文件1的表格{i+1}在文件2中无匹配表格',
+                            'severity': 'medium'
+                        })
+                
+                for j in range(len(tables2)):
+                    if j not in matched_tables2:
+                        all_differences.append({
+                            'type': 'table_unmatched',
+                            'position': f'文件2表格{j+1}',
+                            'file1_value': '',
+                            'file2_value': f'表格{j+1} (无匹配)',
+                            'description': f'文件2的表格{j+1}在文件1中无匹配表格',
+                            'severity': 'medium'
+                        })
+        
         elif tables1 and not tables2:
             all_differences.append({
                 'type': 'table_structure',