|
|
@@ -455,7 +455,7 @@ class OCRResultComparator:
|
|
|
|
|
|
def compare_cell_value(self, value1: str, value2: str, column_type: str,
|
|
|
column_name: str = '') -> Dict:
|
|
|
- """比较单元格值"""
|
|
|
+ """比较单元格值 - 统一错误类型"""
|
|
|
result = {
|
|
|
'match': True,
|
|
|
'difference': None
|
|
|
@@ -477,16 +477,16 @@ class OCRResultComparator:
|
|
|
if abs(num1 - num2) > 0.01: # 允许0.01的误差
|
|
|
result['match'] = False
|
|
|
result['difference'] = {
|
|
|
- 'type': 'table_amount',
|
|
|
+ 'type': 'table_amount', # ✅ 统一类型
|
|
|
'value1': value1,
|
|
|
'value2': value2,
|
|
|
'diff_amount': abs(num1 - num2),
|
|
|
- 'description': f'数字不一致: {value1} vs {value2}'
|
|
|
+ 'description': f'金额不一致: {value1} vs {value2}'
|
|
|
}
|
|
|
else:
|
|
|
result['match'] = False
|
|
|
result['difference'] = {
|
|
|
- 'type': 'table_amount_format_error',
|
|
|
+ 'type': 'table_amount', # ✅ 格式错误也算金额差异
|
|
|
'value1': value1,
|
|
|
'value2': value2,
|
|
|
'description': f'数字格式错误: {value1} vs {value2}'
|
|
|
@@ -494,14 +494,13 @@ class OCRResultComparator:
|
|
|
|
|
|
elif column_type == 'datetime':
|
|
|
# 日期时间比较
|
|
|
- # 提取日期时间部分进行比较
|
|
|
datetime1 = self.extract_datetime(v1)
|
|
|
datetime2 = self.extract_datetime(v2)
|
|
|
|
|
|
if datetime1 != datetime2:
|
|
|
result['match'] = False
|
|
|
result['difference'] = {
|
|
|
- 'type': 'table_datetime_mismatch',
|
|
|
+ 'type': 'table_datetime', # ✅ 日期时间类型
|
|
|
'value1': value1,
|
|
|
'value2': value2,
|
|
|
'description': f'日期时间不一致: {value1} vs {value2}'
|
|
|
@@ -513,7 +512,7 @@ class OCRResultComparator:
|
|
|
if similarity < self.similarity_threshold:
|
|
|
result['match'] = False
|
|
|
result['difference'] = {
|
|
|
- 'type': 'table_text_mismatch',
|
|
|
+ 'type': 'table_text', # ✅ 文本差异
|
|
|
'value1': value1,
|
|
|
'value2': value2,
|
|
|
'similarity': similarity,
|
|
|
@@ -626,7 +625,7 @@ class OCRResultComparator:
|
|
|
|
|
|
if header_row_idx1 != header_row_idx2:
|
|
|
differences.append({
|
|
|
- 'type': 'table_header_position', # ✅ 已经是 table_ 开头
|
|
|
+ 'type': 'table_header_position',
|
|
|
'position': '表头位置',
|
|
|
'file1_value': f'第{header_row_idx1 + 1}行',
|
|
|
'file2_value': f'第{header_row_idx2 + 1}行',
|
|
|
@@ -646,9 +645,9 @@ class OCRResultComparator:
|
|
|
# 复用compare_tables方法进行比对
|
|
|
pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
|
|
|
|
|
|
- # ✅ 修改:统一类型为 table_pre_header
|
|
|
+ # 修改:统一类型为 table_pre_header
|
|
|
for diff in pre_header_diffs:
|
|
|
- diff['type'] = 'table_pre_header' # 改为 table_ 开头
|
|
|
+ diff['type'] = 'table_pre_header'
|
|
|
diff['position'] = f"表头前{diff['position']}"
|
|
|
diff['severity'] = 'medium'
|
|
|
print(f" ⚠️ {diff['position']}: {diff['description']}")
|
|
|
@@ -670,7 +669,7 @@ class OCRResultComparator:
|
|
|
for diff in header_result['differences']:
|
|
|
print(f" - {diff['description']}")
|
|
|
differences.append({
|
|
|
- 'type': 'table_header_critical', # ✅ 已经是 table_ 开头
|
|
|
+ 'type': 'table_header_critical',
|
|
|
'position': '表头',
|
|
|
'file1_value': ', '.join(headers1),
|
|
|
'file2_value': ', '.join(headers2),
|
|
|
@@ -706,35 +705,35 @@ class OCRResultComparator:
|
|
|
row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
|
|
|
|
|
|
# 实际行号(加上表头行索引)
|
|
|
- actual_row_num1 = header_row_idx1 + row_idx + 2
|
|
|
- actual_row_num2 = header_row_idx2 + row_idx + 2
|
|
|
+ actual_row_num = header_row_idx1 + row_idx + 2
|
|
|
|
|
|
if not row1:
|
|
|
differences.append({
|
|
|
- 'type': 'table_row_missing', # ✅ 修改:改为 table_row_missing
|
|
|
- 'position': f'第{actual_row_num1}行',
|
|
|
+ 'type': 'table_row_missing',
|
|
|
+ 'position': f'第{actual_row_num}行',
|
|
|
'file1_value': '',
|
|
|
'file2_value': ', '.join(row2),
|
|
|
- 'description': f'文件1缺少第{actual_row_num1}行',
|
|
|
- 'severity': 'high'
|
|
|
+ 'description': f'文件1缺少第{actual_row_num}行',
|
|
|
+ 'severity': 'high',
|
|
|
+ 'row_index': actual_row_num
|
|
|
})
|
|
|
continue
|
|
|
|
|
|
if not row2:
|
|
|
+ # ✅ 修改:整行缺失按单元格输出
|
|
|
differences.append({
|
|
|
- 'type': 'table_row_missing', # ✅ 修改:改为 table_row_missing
|
|
|
- 'position': f'第{actual_row_num2}行',
|
|
|
+ 'type': 'table_row_missing',
|
|
|
+ 'position': f'第{actual_row_num}行',
|
|
|
'file1_value': ', '.join(row1),
|
|
|
'file2_value': '',
|
|
|
- 'description': f'文件2缺少第{actual_row_num2}行',
|
|
|
- 'severity': 'high'
|
|
|
+ 'description': f'文件2缺少第{actual_row_num}行',
|
|
|
+ 'severity': 'high',
|
|
|
+ 'row_index': actual_row_num
|
|
|
})
|
|
|
continue
|
|
|
|
|
|
- # 逐列比较
|
|
|
+ # ✅ 修改:逐列比较,每个单元格差异独立输出
|
|
|
max_cols = max(len(row1), len(row2))
|
|
|
- row_has_diff = False
|
|
|
- row_diffs = []
|
|
|
|
|
|
for col_idx in range(max_cols):
|
|
|
cell1 = row1[col_idx] if col_idx < len(row1) else ''
|
|
|
@@ -750,33 +749,25 @@ class OCRResultComparator:
|
|
|
compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
|
|
|
|
|
|
if not compare_result['match']:
|
|
|
- row_has_diff = True
|
|
|
+ # ✅ 直接将单元格差异添加到differences列表
|
|
|
diff_info = compare_result['difference']
|
|
|
- row_diffs.append({
|
|
|
- 'column_index': col_idx,
|
|
|
+
|
|
|
+ differences.append({
|
|
|
+ 'type': diff_info['type'], # 使用原始类型(table_amount, table_text等)
|
|
|
+ 'position': f'第{actual_row_num}行第{col_idx + 1}列',
|
|
|
+ 'file1_value': diff_info['value1'],
|
|
|
+ 'file2_value': diff_info['value2'],
|
|
|
+ 'description': diff_info['description'],
|
|
|
+ 'severity': 'medium',
|
|
|
+ 'row_index': actual_row_num,
|
|
|
+ 'col_index': col_idx,
|
|
|
'column_name': column_name,
|
|
|
'column_type': column_type,
|
|
|
- **diff_info
|
|
|
+ # 保留额外信息
|
|
|
+ **{k: v for k, v in diff_info.items() if k not in ['type', 'value1', 'value2', 'description']}
|
|
|
})
|
|
|
-
|
|
|
- if row_has_diff:
|
|
|
- # 汇总该行的所有差异
|
|
|
- diff_columns = [f"{d['column_name']}(列{d['column_index'] + 1})" for d in row_diffs]
|
|
|
- differences.append({
|
|
|
- 'type': 'table_row_data', # ✅ 修改:改为 table_row_data
|
|
|
- 'position': f'第{actual_row_num1}行',
|
|
|
- 'row_index': row_idx + 1,
|
|
|
- 'affected_columns': diff_columns,
|
|
|
- 'column_differences': row_diffs,
|
|
|
- 'file1_value': ', '.join(row1),
|
|
|
- 'file2_value': ', '.join(row2),
|
|
|
- 'description': f'表格第{actual_row_num1}行在以下列有差异: {", ".join(diff_columns)}',
|
|
|
- 'severity': 'medium'
|
|
|
- })
|
|
|
-
|
|
|
- print(f" ⚠️ 第{actual_row_num1}行有差异:")
|
|
|
- for diff in row_diffs:
|
|
|
- print(f" - {diff['column_name']}: {diff['description']}")
|
|
|
+
|
|
|
+ print(f" ⚠️ 第{actual_row_num}行第{col_idx + 1}列({column_name}): {diff_info['description']}")
|
|
|
|
|
|
print(f"\n✅ 流水表格对比完成,发现 {len(differences)} 个差异")
|
|
|
|
|
|
@@ -839,24 +830,23 @@ class OCRResultComparator:
|
|
|
para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
|
|
|
all_differences.extend(para_diffs)
|
|
|
|
|
|
- # # 生成unified diff报告
|
|
|
- # unified_diff_data = self.generate_unified_diff_report(
|
|
|
- # paras1, paras2, file1_path, file2_path,
|
|
|
- # "./output/pre_validation/unified_diff_comparison"
|
|
|
- # )
|
|
|
-
|
|
|
- # 统计信息
|
|
|
+ # ✅ 改进统计信息 - 细化分类
|
|
|
stats = {
|
|
|
'total_differences': len(all_differences),
|
|
|
'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
|
|
|
'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
|
|
|
'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
|
|
|
- 'high_severity': len([d for d in all_differences if d.get('severity') == 'high']),
|
|
|
+ 'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
|
|
|
+ 'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
|
|
|
+ 'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
|
|
|
+ 'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
|
|
|
+ 'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
|
|
|
+ 'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
|
|
|
+ 'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
|
|
|
'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
|
|
|
'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
|
|
|
}
|
|
|
|
|
|
- # 在返回结果中添加unified diff数据
|
|
|
result = {
|
|
|
'differences': all_differences,
|
|
|
'statistics': stats,
|
|
|
@@ -866,7 +856,6 @@ class OCRResultComparator:
|
|
|
'file2_paragraphs': len(paras2),
|
|
|
'file1_path': file1_path,
|
|
|
'file2_path': file2_path,
|
|
|
- # 'unified_diff': unified_diff_data # 添加unified diff数据
|
|
|
}
|
|
|
|
|
|
return result
|
|
|
@@ -908,7 +897,7 @@ class OCRResultComparator:
|
|
|
f.write("## 统计信息\n\n")
|
|
|
f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
|
|
|
f.write(f"- 表格差异: **{stats['table_differences']}**\n")
|
|
|
- f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
|
|
|
+ f.write(f"- 其中表格金额差异: **{stats['amount_differences']}**\n")
|
|
|
f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
|
|
|
f.write(f"- 高严重度: **{stats['high_severity']}**\n") # ✅ 新增
|
|
|
f.write(f"- 中严重度: **{stats['medium_severity']}**\n") # ✅ 新增
|
|
|
@@ -1029,7 +1018,7 @@ def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "co
|
|
|
print(f"\n📊 对比完成!")
|
|
|
print(f" 总差异数: {result['statistics']['total_differences']}")
|
|
|
print(f" 表格差异: {result['statistics']['table_differences']}")
|
|
|
- print(f" 金额差异: {result['statistics']['amount_differences']}")
|
|
|
+ print(f" 其中表格金额差异: {result['statistics']['amount_differences']}")
|
|
|
print(f" 段落差异: {result['statistics']['paragraph_differences']}")
|
|
|
|
|
|
# 打印前几个重要差异
|