"""Compare two OCR result files (Markdown with embedded HTML tables).

The tool extracts tables and plain paragraphs from both files, ignores
image references, and reports cell-level and paragraph-level differences
as a JSON and/or Markdown report.
"""

import argparse
import datetime
import difflib
import json
import math
import re
import sys
import time
from typing import Dict, List, Optional, Tuple  # noqa: F401  (Tuple kept for compatibility)

try:
    # Not used by this module; kept importable for backward compatibility
    # but no longer a hard requirement.
    import markdown  # noqa: F401
except ImportError:  # pragma: no cover - depends on the environment
    markdown = None

try:
    from bs4 import BeautifulSoup
except ImportError:  # pragma: no cover - depends on the environment
    # Only table extraction needs BeautifulSoup; the text helpers do not.
    BeautifulSoup = None


# Marker stored in place of table cells that describe/contain an image.
_IMAGE_PLACEHOLDER = "[图片内容-忽略]"

# Substrings that mark a piece of text as an image reference/description.
_IMAGE_KEYWORDS = (
    '图', '图片', '图像', 'image', 'figure', 'fig',
    '照片', '截图', '示意图', '流程图', '结构图',
)

# Maximum number of characters shown for a value in previews.
_PREVIEW_LIMIT = 50

# Human-readable section titles for each difference type.
_TYPE_TITLES = {
    'table_amount': '💰 表格金额差异',
    'table_text': '📝 表格文本差异',
    'paragraph': '📄 段落差异',
    'table_structure': '🏗️ 表格结构差异',
}

_WHITESPACE_RE = re.compile(r'\s+')
# ASCII and full-width punctuation; full-width forms are written as escapes
# (U+FF0C , U+3002 . U+FF1A : U+FF1B ; U+FF01 ! U+FF1F ? U+3001 enumeration comma).
_PUNCT_SPACING_RE = re.compile(
    r'\s*([,:;!?\uFF0C\u3002\uFF1A\uFF1B\uFF01\uFF1F\u3001])\s*'
)
# Thousands separators: ASCII comma and full-width comma (U+FF0C).
_THOUSANDS_SEP_RE = re.compile(r'[,\uFF0C]')
_MD_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
_HTML_IMG_RE = re.compile(r'<img[^>]*>', re.IGNORECASE)
_HTML_TABLE_RE = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def _parse_float(text: str) -> Optional[float]:
    """Return *text* as a finite float, or ``None`` if it is not a number.

    Thousands separators (ASCII or full-width comma) are removed first.
    A minus sign is only accepted where ``float()`` accepts it, so strings
    such as ``"2022-12-31"`` or ``"1-2"`` are not treated as numbers.
    """
    if not text:
        return None
    try:
        value = float(_THOUSANDS_SEP_RE.sub('', text))
    except ValueError:
        return None
    # NaN/inf would make every later ``abs(a - b) > eps`` test false.
    return value if math.isfinite(value) else None


def _preview(text: str, limit: int = _PREVIEW_LIMIT) -> str:
    """Return *text* cut to *limit* characters, with '...' appended if cut."""
    return f"{text[:limit]}{'...' if len(text) > limit else ''}"


class OCRResultComparator:
    """Extract tables/paragraphs from OCR output and diff two files."""

    def __init__(self):
        self.differences = []

    def normalize_text(self, text: str) -> str:
        """Collapse whitespace runs and drop spaces around punctuation.

        Returns an empty string for empty/``None`` input.
        """
        if not text:
            return ""
        collapsed = _WHITESPACE_RE.sub(' ', text.strip())
        return _PUNCT_SPACING_RE.sub(r'\1', collapsed)

    def is_image_reference(self, text: str) -> bool:
        """Return True if *text* looks like an image reference or caption.

        A text matches when it contains one of the image keywords
        (case-insensitive), Markdown image syntax ``![alt](url)``, or an
        HTML ``<img ...>`` tag.
        """
        lowered = text.lower()
        if any(keyword in lowered for keyword in _IMAGE_KEYWORDS):
            return True
        if _MD_IMAGE_RE.search(text):
            return True
        return bool(_HTML_IMG_RE.search(text))

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Extract every HTML ``<table>`` in *md_content*.

        Returns a list of tables; each table is a list of rows and each row
        a list of normalized cell texts. Cells recognised as image
        references are replaced by a placeholder so they can be skipped
        during comparison. Rows without cells and tables without rows are
        dropped.

        Raises:
            ImportError: if BeautifulSoup (``bs4``) is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError(
                "beautifulsoup4 is required for table extraction "
                "(pip install beautifulsoup4)"
            )

        tables = []
        soup = BeautifulSoup(md_content, 'html.parser')
        for table in soup.find_all('table'):
            table_data = []
            for row in table.find_all('tr'):
                row_data = []
                for cell in row.find_all(['td', 'th']):
                    cell_text = self.normalize_text(cell.get_text())
                    if self.is_image_reference(cell_text):
                        row_data.append(_IMAGE_PLACEHOLDER)
                    else:
                        row_data.append(cell_text)
                if row_data:
                    table_data.append(row_data)
            if table_data:
                tables.append(table_data)
        return tables

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Return the normalized non-table text lines of *md_content*.

        Tables, HTML comments and remaining HTML tags are removed; heading
        lines (starting with ``#``), empty lines and image references are
        skipped.
        """
        content = _HTML_TABLE_RE.sub('', md_content)
        # Comments go before generic tags: the tag pattern stops at the
        # first '>' and would only strip part of a comment containing '>'.
        content = _HTML_COMMENT_RE.sub('', content)
        content = _HTML_TAG_RE.sub('', content)

        paragraphs = []
        for line in content.split('\n'):
            normalized = self.normalize_text(line)
            if not normalized or normalized.startswith('#'):
                continue
            if not self.is_image_reference(normalized):
                paragraphs.append(normalized)
        return paragraphs

    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Compare two tables cell by cell.

        Missing rows/cells are treated as empty strings. Cells holding the
        image placeholder are skipped. When both cells are numeric they are
        compared by value with a 0.001 tolerance (type ``table_amount``);
        otherwise any textual mismatch is reported as ``table_text``.
        """
        differences = []
        for i in range(max(len(table1), len(table2))):
            row1 = table1[i] if i < len(table1) else []
            row2 = table2[i] if i < len(table2) else []

            for j in range(max(len(row1), len(row2))):
                cell1 = row1[j] if j < len(row1) else ""
                cell2 = row2[j] if j < len(row2) else ""

                if _IMAGE_PLACEHOLDER in cell1 or _IMAGE_PLACEHOLDER in cell2:
                    continue
                if cell1 == cell2:
                    continue

                if self.is_numeric(cell1) and self.is_numeric(cell2):
                    # Same value in a different notation (e.g. "1,000" vs
                    # "1000.0") is not a difference.
                    if abs(self.parse_number(cell1) - self.parse_number(cell2)) <= 0.001:
                        continue
                    diff_type, label = 'table_amount', '金额不一致'
                else:
                    diff_type, label = 'table_text', '文本不一致'

                differences.append({
                    'type': diff_type,
                    'position': f'行{i+1}列{j+1}',
                    'file1_value': cell1,
                    'file2_value': cell2,
                    'description': f'{label}: {cell1} vs {cell2}',
                    'row_index': i,
                    'col_index': j,
                })
        return differences

    def is_numeric(self, text: str) -> bool:
        """Return True if *text* parses as a finite number.

        Uses the same parsing rules as :meth:`parse_number`, so a text is
        numeric exactly when ``parse_number`` can recover its value.
        """
        return _parse_float(text) is not None

    def parse_number(self, text: str) -> float:
        """Return the numeric value of *text*, or 0.0 if it is not a number."""
        value = _parse_float(text)
        return 0.0 if value is None else value

    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Diff two paragraph lists with ``difflib.SequenceMatcher``.

        Replaced paragraphs are paired up positionally; deletions and
        insertions are reported with an empty value on the missing side.
        """
        differences = []
        matcher = difflib.SequenceMatcher(None, paras1, paras2)

        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'replace':
                for k in range(max(i2 - i1, j2 - j1)):
                    para1 = paras1[i1 + k] if i1 + k < i2 else ""
                    para2 = paras2[j1 + k] if j1 + k < j2 else ""
                    if para1 != para2:
                        differences.append({
                            'type': 'paragraph',
                            'position': f'段落{i1+k+1}',
                            'file1_value': para1,
                            'file2_value': para2,
                            'description': '段落文本不一致',
                            'paragraph_index': i1 + k,
                        })
            elif tag == 'delete':
                for k in range(i1, i2):
                    differences.append({
                        'type': 'paragraph',
                        'position': f'段落{k+1}',
                        'file1_value': paras1[k],
                        'file2_value': "",
                        'description': '文件1中存在但文件2中缺失的段落',
                        'paragraph_index': k,
                    })
            elif tag == 'insert':
                for k in range(j1, j2):
                    differences.append({
                        'type': 'paragraph',
                        'position': f'段落{k+1}',
                        'file1_value': "",
                        'file2_value': paras2[k],
                        'description': '文件2中存在但文件1中缺失的段落',
                        'paragraph_index': k,
                    })
        return differences

    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
        """Compare two UTF-8 files and return differences plus statistics.

        Tables are compared pairwise in document order. Every table
        difference carries a ``table_index`` key; when either file has more
        than one table the position is prefixed with the table number.
        A mismatch in the number of tables is reported as
        ``table_structure``.

        Raises:
            OSError: if either file cannot be read.
            ImportError: if BeautifulSoup is not installed.
        """
        with open(file1_path, 'r', encoding='utf-8') as f:
            content1 = f.read()
        with open(file2_path, 'r', encoding='utf-8') as f:
            content2 = f.read()

        tables1 = self.extract_table_data(content1)
        tables2 = self.extract_table_data(content2)
        paras1 = self.extract_paragraphs(content1)
        paras2 = self.extract_paragraphs(content2)

        all_differences = []

        multiple_tables = max(len(tables1), len(tables2)) > 1
        for index, (table1, table2) in enumerate(zip(tables1, tables2)):
            for diff in self.compare_tables(table1, table2):
                diff['table_index'] = index
                if multiple_tables:
                    diff['position'] = f"表格{index+1} {diff['position']}"
                all_differences.append(diff)

        if len(tables1) != len(tables2):
            if not tables2:
                file1_value = f'包含{len(tables1)}个表格'
                file2_value = '无表格'
                description = '文件1包含表格但文件2无表格'
            elif not tables1:
                file1_value = '无表格'
                file2_value = f'包含{len(tables2)}个表格'
                description = '文件2包含表格但文件1无表格'
            else:
                file1_value = f'包含{len(tables1)}个表格'
                file2_value = f'包含{len(tables2)}个表格'
                description = '两个文件的表格数量不一致'
            all_differences.append({
                'type': 'table_structure',
                'position': '表格结构',
                'file1_value': file1_value,
                'file2_value': file2_value,
                'description': description,
            })

        all_differences.extend(self.compare_paragraphs(paras1, paras2))

        stats = {
            'total_differences': len(all_differences),
            'table_differences': sum(
                1 for d in all_differences if d['type'].startswith('table')),
            'paragraph_differences': sum(
                1 for d in all_differences if d['type'] == 'paragraph'),
            'amount_differences': sum(
                1 for d in all_differences if d['type'] == 'table_amount'),
        }

        return {
            'differences': all_differences,
            'statistics': stats,
            'file1_tables': len(tables1),
            'file2_tables': len(tables2),
            'file1_paragraphs': len(paras1),
            'file2_paragraphs': len(paras2),
            'file1_path': file1_path,
            'file2_path': file2_path,
        }

    def generate_json_report(self, comparison_result: Dict, output_file: str):
        """Write *comparison_result* to *output_file* as UTF-8 JSON.

        The optional ``timestamp`` entry is sanitised: every character that
        is not a word character, ``-``, ``_`` or ``.`` becomes ``_``.
        """
        report_data = {
            'comparison_summary': {
                'timestamp': re.sub(
                    r'[^\w\-_\.]', '_',
                    str(comparison_result.get('timestamp', ''))),
                'file1': comparison_result['file1_path'],
                'file2': comparison_result['file2_path'],
                'statistics': comparison_result['statistics'],
                'file_info': {
                    'file1_tables': comparison_result['file1_tables'],
                    'file2_tables': comparison_result['file2_tables'],
                    'file1_paragraphs': comparison_result['file1_paragraphs'],
                    'file2_paragraphs': comparison_result['file2_paragraphs'],
                },
            },
            'differences': comparison_result['differences'],
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, ensure_ascii=False, indent=2)

    def generate_markdown_report(self, comparison_result: Dict, output_file: str):
        """Write *comparison_result* to *output_file* as a Markdown report.

        The report contains basic info, statistics, differences grouped by
        type, and a table listing every difference with truncated values.
        """
        stats = comparison_result['statistics']
        differences = comparison_result['differences']

        lines = [
            "# OCR结果对比报告\n\n",
            "## 基本信息\n\n",
            f"- **文件1**: `{comparison_result['file1_path']}`\n",
            f"- **文件2**: `{comparison_result['file2_path']}`\n",
            f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n",
            "## 统计信息\n\n",
            f"- 总差异数量: **{stats['total_differences']}**\n",
            f"- 表格差异: **{stats['table_differences']}**\n",
            f"- 金额差异: **{stats['amount_differences']}**\n",
            f"- 段落差异: **{stats['paragraph_differences']}**\n",
            f"- 文件1表格数: {comparison_result['file1_tables']}\n",
            f"- 文件2表格数: {comparison_result['file2_tables']}\n",
            f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n",
            f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n",
        ]

        if stats['total_differences'] == 0:
            lines.append("## 结论\n\n")
            lines.append("🎉 **完美匹配!没有发现任何差异。**\n\n")
        else:
            lines.append("## 差异摘要\n\n")

            # Group by type, keeping first-seen order of the types.
            diff_by_type = {}
            for diff in differences:
                diff_by_type.setdefault(diff['type'], []).append(diff)

            for diff_type, diffs in diff_by_type.items():
                type_name = _TYPE_TITLES.get(diff_type, f'❓ {diff_type}')
                lines.append(f"### {type_name} ({len(diffs)}个)\n\n")
                for i, diff in enumerate(diffs, 1):
                    lines.append(f"**{i}. {diff['position']}**\n")
                    lines.append(f"- 文件1: `{diff['file1_value']}`\n")
                    lines.append(f"- 文件2: `{diff['file2_value']}`\n")
                    lines.append(f"- 说明: {diff['description']}\n\n")

        if differences:
            lines.append("## 详细差异列表\n\n")
            lines.append("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 |\n")
            lines.append("| --- | --- | --- | --- | --- | --- |\n")
            for i, diff in enumerate(differences, 1):
                # A raw '|' would split the table cell, so escape it.
                value1 = _preview(diff['file1_value']).replace('|', '\\|')
                value2 = _preview(diff['file2_value']).replace('|', '\\|')
                description = diff['description'].replace('|', '\\|')
                lines.append(
                    f"| {i} | {diff['type']} | {diff['position']} | "
                    f"`{value1}` | `{value2}` | {description} |\n"
                )

        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(lines)


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        0 on success, 1 if the comparison raised an exception.
    """
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    parser.add_argument('file1', help='第一个OCR结果文件路径')
    parser.add_argument('file2', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report',
                        help='输出文件名(不含扩展名)')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
                        default='markdown', help='输出格式: json, markdown, 或 both')
    # Accepted for CLI compatibility only: images are always ignored.
    parser.add_argument('--ignore-images', action='store_true',
                        help='忽略图片内容(默认已启用)')

    args = parser.parse_args(argv)

    comparator = OCRResultComparator()

    print("🔍 开始对比OCR结果...")
    print(f"📄 文件1: {args.file1}")
    print(f"📄 文件2: {args.file2}")
    print(f"📁 输出格式: {args.format}")
    # Image content is skipped unconditionally, so report that regardless
    # of whether --ignore-images was passed.
    print("🖼️ 图片处理: 忽略")

    try:
        result = comparator.compare_files(args.file1, args.file2)
        result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if args.format in ['json', 'both']:
            json_file = f"{args.output}.json"
            comparator.generate_json_report(result, json_file)
            print(f"📄 JSON报告已保存至: {json_file}")

        if args.format in ['markdown', 'both']:
            md_file = f"{args.output}.md"
            comparator.generate_markdown_report(result, md_file)
            print(f"📝 Markdown报告已保存至: {md_file}")

        stats = result['statistics']
        print("\n📊 对比完成!")
        print(f" 总差异数: {stats['total_differences']}")
        print(f" 表格差异: {stats['table_differences']}")
        print(f" 金额差异: {stats['amount_differences']}")
        print(f" 段落差异: {stats['paragraph_differences']}")

        if result['differences']:
            print("\n🔍 前3个重要差异:")
            for i, diff in enumerate(result['differences'][:3], 1):
                print(f" {i}. {diff['position']}: {diff['description']}")
                print(f" 文件1: '{_preview(diff['file1_value'])}'")
                print(f" 文件2: '{_preview(diff['file2_value'])}'")
        else:
            print("\n🎉 恭喜!两个文件内容完全一致!")

    except Exception as e:  # top-level CLI boundary: report and signal failure
        print(f"❌ 对比过程中出现错误: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # When run without arguments, fall back to a built-in sample invocation
    # (intended for local testing).
    if len(sys.argv) == 1:
        sys.argv.extend([
            # Alternative sample pair:
            # './output/至远彩色印刷工业有限公司-2022年母公司_2.md', './sample_data/demo_54fa7ad0_page_1_nohf.md',
            './output/至远彩色印刷工业有限公司-2022年母公司_2.md',
            './output/至远彩色印刷工业有限公司-2022年母公司_2-GLM4.5V.md',
            '-o', f'./output/comparison_result_{time.strftime("%Y%m%d_%H%M%S")}',
            '-f', 'both',
            '--ignore-images'])

    sys.exit(main())