|
|
@@ -1,7 +1,7 @@
|
|
|
import os
|
|
|
from typing import Dict
|
|
|
from datetime import datetime
|
|
|
-# ✅ 兼容相对导入和绝对导入
|
|
|
+
|
|
|
try:
|
|
|
from .content_extractor import ContentExtractor
|
|
|
from .table_comparator import TableComparator
|
|
|
@@ -11,6 +11,7 @@ except ImportError:
|
|
|
from table_comparator import TableComparator
|
|
|
from paragraph_comparator import ParagraphComparator
|
|
|
|
|
|
+
|
|
|
class OCRResultComparator:
|
|
|
"""OCR结果比较器主类"""
|
|
|
|
|
|
@@ -62,24 +63,75 @@ class OCRResultComparator:
|
|
|
)
|
|
|
print(f"✅ 段落对比完成,发现 {len(paragraph_differences)} 个差异")
|
|
|
|
|
|
- # ✅ 初始化所有差异列表 - 用于兼容原版本返回结构
|
|
|
+ # 初始化所有差异列表
|
|
|
all_differences = []
|
|
|
all_differences.extend(paragraph_differences)
|
|
|
|
|
|
- # 比较表格
|
|
|
- print(f"\n🔍 开始表格对比...")
|
|
|
+ # ✅ 智能表格匹配与比较
|
|
|
+ print(f"\n🔍 开始表格智能匹配...")
|
|
|
|
|
|
- # ✅ 处理表格比较 - 支持多表格
|
|
|
if tables1 and tables2:
|
|
|
- # 根据模式选择比较方法
|
|
|
- if self.table_comparison_mode == 'flow_list':
|
|
|
- table_diffs = self.table_comparator.compare_table_flow_list(tables1[0], tables2[0])
|
|
|
- else:
|
|
|
- table_diffs = self.table_comparator.compare_tables(tables1[0], tables2[0])
|
|
|
-
|
|
|
- all_differences.extend(table_diffs)
|
|
|
- print(f"✅ 表格对比完成,发现 {len(table_diffs)} 个差异")
|
|
|
+ # 找到匹配的表格对
|
|
|
+ table_matches = self.table_comparator.find_matching_tables(tables1, tables2)
|
|
|
|
|
|
+ if not table_matches:
|
|
|
+ print(f" ⚠️ 未找到匹配的表格")
|
|
|
+ all_differences.append({
|
|
|
+ 'type': 'table_structure',
|
|
|
+ 'position': '表格匹配',
|
|
|
+ 'file1_value': f'{len(tables1)}个表格',
|
|
|
+ 'file2_value': f'{len(tables2)}个表格',
|
|
|
+ 'description': '未找到可匹配的表格',
|
|
|
+ 'severity': 'high'
|
|
|
+ })
|
|
|
+ else:
|
|
|
+ # 比较每对匹配的表格
|
|
|
+ for idx1, idx2, similarity in table_matches:
|
|
|
+ print(f"\n 📋 对比匹配的表格: 表格{idx1+1} vs 表格{idx2+1}")
|
|
|
+
|
|
|
+ if self.table_comparison_mode == 'flow_list':
|
|
|
+ table_diffs = self.table_comparator.compare_table_flow_list(
|
|
|
+ tables1[idx1], tables2[idx2]
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ table_diffs = self.table_comparator.compare_tables(
|
|
|
+ tables1[idx1], tables2[idx2]
|
|
|
+ )
|
|
|
+
|
|
|
+ # 为每个差异添加表格标识
|
|
|
+ for diff in table_diffs:
|
|
|
+ diff['table_pair'] = f'表格{idx1+1}↔表格{idx2+1}'
|
|
|
+ diff['table_similarity'] = similarity
|
|
|
+
|
|
|
+ all_differences.extend(table_diffs)
|
|
|
+ print(f" 发现 {len(table_diffs)} 个差异")
|
|
|
+
|
|
|
+ # 检查未匹配的表格
|
|
|
+ matched_tables1 = {m[0] for m in table_matches}
|
|
|
+ matched_tables2 = {m[1] for m in table_matches}
|
|
|
+
|
|
|
+ for i in range(len(tables1)):
|
|
|
+ if i not in matched_tables1:
|
|
|
+ all_differences.append({
|
|
|
+ 'type': 'table_unmatched',
|
|
|
+ 'position': f'文件1表格{i+1}',
|
|
|
+ 'file1_value': f'表格{i+1} (无匹配)',
|
|
|
+ 'file2_value': '',
|
|
|
+ 'description': f'文件1的表格{i+1}在文件2中无匹配表格',
|
|
|
+ 'severity': 'medium'
|
|
|
+ })
|
|
|
+
|
|
|
+ for j in range(len(tables2)):
|
|
|
+ if j not in matched_tables2:
|
|
|
+ all_differences.append({
|
|
|
+ 'type': 'table_unmatched',
|
|
|
+ 'position': f'文件2表格{j+1}',
|
|
|
+ 'file1_value': '',
|
|
|
+ 'file2_value': f'表格{j+1} (无匹配)',
|
|
|
+ 'description': f'文件2的表格{j+1}在文件1中无匹配表格',
|
|
|
+ 'severity': 'medium'
|
|
|
+ })
|
|
|
+
|
|
|
elif tables1 and not tables2:
|
|
|
all_differences.append({
|
|
|
'type': 'table_structure',
|