|
|
@@ -418,7 +418,7 @@ class OCRResultComparator:
|
|
|
if len(headers1) != len(headers2):
|
|
|
result['match'] = False
|
|
|
result['differences'].append({
|
|
|
- 'type': 'header_count',
|
|
|
+ 'type': 'table_header_critical',
|
|
|
'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
|
|
|
'severity': 'critical'
|
|
|
})
|
|
|
@@ -440,13 +440,13 @@ class OCRResultComparator:
|
|
|
if similarity < self.header_similarity_threshold:
|
|
|
result['match'] = False
|
|
|
result['differences'].append({
|
|
|
- 'type': 'header_mismatch',
|
|
|
+ 'type': 'table_header_mismatch',
|
|
|
'column_index': i,
|
|
|
'header1': h1,
|
|
|
'header2': h2,
|
|
|
'similarity': similarity,
|
|
|
'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
|
|
|
- 'severity': 'critical'
|
|
|
+ 'severity': 'medium' if similarity < 50 else 'high'
|
|
|
})
|
|
|
else:
|
|
|
result['column_mapping'][i] = i # 建立列映射
|
|
|
@@ -664,24 +664,30 @@ class OCRResultComparator:
|
|
|
|
|
|
header_result = self.compare_table_headers(headers1, headers2)
|
|
|
|
|
|
+ # ✅ 新增:检查列数是否一致
|
|
|
+ column_count_match = len(headers1) == len(headers2)
|
|
|
if not header_result['match']:
|
|
|
- print(f"\n❌ 表头不匹配,严重错误!")
|
|
|
+ print(f"\n⚠️ 表头文字存在差异")
|
|
|
for diff in header_result['differences']:
|
|
|
print(f" - {diff['description']}")
|
|
|
differences.append({
|
|
|
- 'type': 'table_header_critical',
|
|
|
+ 'type': diff.get('type', 'table_header_mismatch'), # ✅ 改为 mismatch 而非 critical
|
|
|
'position': '表头',
|
|
|
- 'file1_value': ', '.join(headers1),
|
|
|
- 'file2_value': ', '.join(headers2),
|
|
|
+ 'file1_value': diff.get('header1', ''),
|
|
|
+ 'file2_value': diff.get('header2', ''),
|
|
|
'description': diff['description'],
|
|
|
- 'severity': 'critical'
|
|
|
+ 'severity': diff.get('severity', 'high'),
|
|
|
})
|
|
|
- return differences
|
|
|
-
|
|
|
- print(f"✅ 表头匹配成功")
|
|
|
+ if diff.get('severity', 'high') == 'critical':
|
|
|
+ return differences
|
|
|
+ else:
|
|
|
+ print(f"✅ 表头匹配成功")
|
|
|
|
|
|
# 第四步:检测列类型
|
|
|
- column_types = []
|
|
|
+ column_types1 = []
|
|
|
+ column_types2 = []
|
|
|
+
|
|
|
+ # 检测文件1的列类型
|
|
|
for col_idx in range(len(headers1)):
|
|
|
col_values1 = [
|
|
|
row[col_idx]
|
|
|
@@ -689,8 +695,40 @@ class OCRResultComparator:
|
|
|
if col_idx < len(row)
|
|
|
]
|
|
|
col_type = self.detect_column_type(col_values1)
|
|
|
- column_types.append(col_type)
|
|
|
- print(f" 列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
|
|
|
+ column_types1.append(col_type)
|
|
|
+ print(f" 文件1列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
|
|
|
+
|
|
|
+ # 检测文件2的列类型
|
|
|
+ for col_idx in range(len(headers2)):
|
|
|
+ col_values2 = [
|
|
|
+ row[col_idx]
|
|
|
+ for row in table2[header_row_idx2 + 1:]
|
|
|
+ if col_idx < len(row)
|
|
|
+ ]
|
|
|
+ col_type = self.detect_column_type(col_values2)
|
|
|
+ column_types2.append(col_type)
|
|
|
+ print(f" 文件2列 {col_idx + 1} ({headers2[col_idx]}): {col_type}")
|
|
|
+
|
|
|
+ # ✅ 新增:检查列类型是否一致
|
|
|
+ column_types_match = column_types1 == column_types2
|
|
|
+
|
|
|
+ if not column_types_match:
|
|
|
+ print(f"\n⚠️ 列类型存在差异,不再比较单元格内容...")
|
|
|
+ for col_idx in range(min(len(column_types1), len(column_types2))):
|
|
|
+ if column_types1[col_idx] != column_types2[col_idx]:
|
|
|
+ differences.append({
|
|
|
+ 'type': 'table_header_critical',
|
|
|
+ 'position': f'第{col_idx + 1}列',
|
|
|
+ 'file1_value': f'{headers1[col_idx]} ({column_types1[col_idx]})',
|
|
|
+ 'file2_value': f'{headers2[col_idx]} ({column_types2[col_idx]})',
|
|
|
+ 'description': f'列类型不一致: {column_types1[col_idx]} vs {column_types2[col_idx]}',
|
|
|
+ 'severity': 'critical',
|
|
|
+ 'column_index': col_idx
|
|
|
+ })
|
|
|
+ return differences
|
|
|
+
|
|
|
+ # Column types of both files were checked for equality above, so file 1's list is used as-is
|
|
|
+ column_types = column_types1 # 默认使用文件1的列类型
|
|
|
|
|
|
# 第五步:逐行比较数据
|
|
|
data_rows1 = table1[header_row_idx1 + 1:]
|
|
|
@@ -732,7 +770,7 @@ class OCRResultComparator:
|
|
|
})
|
|
|
continue
|
|
|
|
|
|
- # ✅ 修改:逐列比较,每个单元格差异独立输出
|
|
|
+ # 逐列比较,每个单元格差异独立输出
|
|
|
max_cols = max(len(row1), len(row2))
|
|
|
|
|
|
for col_idx in range(max_cols):
|
|
|
@@ -743,8 +781,16 @@ class OCRResultComparator:
|
|
|
if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
|
|
|
continue
|
|
|
|
|
|
+ # ✅ 使用对应的列类型
|
|
|
column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
|
|
|
- column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
|
|
|
+
|
|
|
+ # ✅ 获取列名(如果表头不匹配,显示两个表头)
|
|
|
+ if header_result['match']:
|
|
|
+ column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
|
|
|
+ else:
|
|
|
+ col_name1 = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
|
|
|
+ col_name2 = headers2[col_idx] if col_idx < len(headers2) else f'列{col_idx + 1}'
|
|
|
+ column_name = f"{col_name1}/{col_name2}"
|
|
|
|
|
|
compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
|
|
|
|
|
|
@@ -839,7 +885,8 @@ class OCRResultComparator:
|
|
|
'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
|
|
|
'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
|
|
|
'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
|
|
|
- 'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
|
|
|
+ 'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']), # ✅ 新增
|
|
|
+ 'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']), # ✅ 新增
|
|
|
'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
|
|
|
'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
|
|
|
'high_severity': len([d for d in all_differences if d.get('severity') == 'critical' or d.get('severity') == 'high']),
|
|
|
@@ -965,7 +1012,7 @@ class OCRResultComparator:
|
|
|
|
|
|
def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
|
|
|
output_format: str = "markdown", ignore_images: bool = True,
|
|
|
- table_mode: str = 'standard', similarity_algorithm: str = 'ratio'):
|
|
|
+ table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
|
|
|
"""
|
|
|
比较两个OCR结果文件
|
|
|
|