# compare_ocr_results.py (21 KB)

import argparse
import datetime
import difflib
import json
import math
import re
import sys
import time
import traceback
from typing import Dict, List, Tuple

import markdown
from bs4 import BeautifulSoup
  10. class OCRResultComparator:
  11. def __init__(self):
  12. self.differences = []
  13. def normalize_text(self, text: str) -> str:
  14. """标准化文本:去除多余空格、回车等无效字符"""
  15. if not text:
  16. return ""
  17. # 去除多余的空白字符
  18. text = re.sub(r'\s+', ' ', text.strip())
  19. # 去除标点符号周围的空格
  20. text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
  21. return text
  22. def is_image_reference(self, text: str) -> bool:
  23. """判断是否为图片引用或描述"""
  24. image_keywords = [
  25. '图', '图片', '图像', 'image', 'figure', 'fig',
  26. '照片', '截图', '示意图', '流程图', '结构图'
  27. ]
  28. # 检查是否包含图片相关关键词
  29. for keyword in image_keywords:
  30. if keyword in text.lower():
  31. return True
  32. # 检查是否为Markdown图片语法
  33. if re.search(r'!\[.*?\]\(.*?\)', text):
  34. return True
  35. # 检查是否为HTML图片标签
  36. if re.search(r'<img[^>]*>', text, re.IGNORECASE):
  37. return True
  38. return False
  39. def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
  40. """从Markdown中提取表格数据"""
  41. tables = []
  42. # 使用BeautifulSoup解析HTML表格
  43. soup = BeautifulSoup(md_content, 'html.parser')
  44. html_tables = soup.find_all('table')
  45. for table in html_tables:
  46. table_data = []
  47. rows = table.find_all('tr')
  48. for row in rows:
  49. cells = row.find_all(['td', 'th'])
  50. row_data = []
  51. for cell in cells:
  52. cell_text = self.normalize_text(cell.get_text())
  53. # 跳过图片内容
  54. if not self.is_image_reference(cell_text):
  55. row_data.append(cell_text)
  56. else:
  57. row_data.append("[图片内容-忽略]")
  58. if row_data: # 只添加非空行
  59. table_data.append(row_data)
  60. if table_data:
  61. tables.append(table_data)
  62. return tables
  63. def extract_paragraphs(self, md_content: str) -> List[str]:
  64. """提取段落文本"""
  65. # 移除表格
  66. content = re.sub(r'<table>.*?</table>', '', md_content, flags=re.DOTALL)
  67. # 移除HTML标签
  68. content = re.sub(r'<[^>]+>', '', content)
  69. # 移除Markdown注释
  70. content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
  71. # 分割段落
  72. paragraphs = []
  73. lines = content.split('\n')
  74. for line in lines:
  75. normalized = self.normalize_text(line)
  76. if normalized and not normalized.startswith('#'):
  77. # 跳过图片内容
  78. if not self.is_image_reference(normalized):
  79. paragraphs.append(normalized)
  80. return paragraphs
  81. def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
  82. """比较表格数据"""
  83. differences = []
  84. # 确定最大行数
  85. max_rows = max(len(table1), len(table2))
  86. for i in range(max_rows):
  87. row1 = table1[i] if i < len(table1) else []
  88. row2 = table2[i] if i < len(table2) else []
  89. # 确定最大列数
  90. max_cols = max(len(row1), len(row2))
  91. for j in range(max_cols):
  92. cell1 = row1[j] if j < len(row1) else ""
  93. cell2 = row2[j] if j < len(row2) else ""
  94. # 跳过图片内容比较
  95. if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
  96. continue
  97. if cell1 != cell2:
  98. # 特别处理数字金额
  99. if self.is_numeric(cell1) and self.is_numeric(cell2):
  100. num1 = self.parse_number(cell1)
  101. num2 = self.parse_number(cell2)
  102. if abs(num1 - num2) > 0.001: # 允许小数精度误差
  103. differences.append({
  104. 'type': 'table_amount',
  105. 'position': f'行{i+1}列{j+1}',
  106. 'file1_value': cell1,
  107. 'file2_value': cell2,
  108. 'description': f'金额不一致: {cell1} vs {cell2}',
  109. 'row_index': i,
  110. 'col_index': j
  111. })
  112. else:
  113. differences.append({
  114. 'type': 'table_text',
  115. 'position': f'行{i+1}列{j+1}',
  116. 'file1_value': cell1,
  117. 'file2_value': cell2,
  118. 'description': f'文本不一致: {cell1} vs {cell2}',
  119. 'row_index': i,
  120. 'col_index': j
  121. })
  122. return differences
  123. def is_numeric(self, text: str) -> bool:
  124. """判断文本是否为数字"""
  125. if not text:
  126. return False
  127. # 移除千分位分隔符和负号
  128. clean_text = re.sub(r'[,,-]', '', text)
  129. try:
  130. float(clean_text)
  131. return True
  132. except ValueError:
  133. return False
  134. def parse_number(self, text: str) -> float:
  135. """解析数字"""
  136. if not text:
  137. return 0.0
  138. clean_text = re.sub(r'[,,]', '', text)
  139. try:
  140. return float(clean_text)
  141. except ValueError:
  142. return 0.0
  143. def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
  144. """比较段落文本"""
  145. differences = []
  146. # 使用difflib进行文本比较
  147. matcher = difflib.SequenceMatcher(None, paras1, paras2)
  148. for tag, i1, i2, j1, j2 in matcher.get_opcodes():
  149. if tag == 'replace':
  150. for k in range(max(i2-i1, j2-j1)):
  151. para1 = paras1[i1+k] if i1+k < i2 else ""
  152. para2 = paras2[j1+k] if j1+k < j2 else ""
  153. if para1 != para2:
  154. differences.append({
  155. 'type': 'paragraph',
  156. 'position': f'段落{i1+k+1}',
  157. 'file1_value': para1,
  158. 'file2_value': para2,
  159. 'description': f'段落文本不一致',
  160. 'paragraph_index': i1+k
  161. })
  162. elif tag == 'delete':
  163. for k in range(i1, i2):
  164. differences.append({
  165. 'type': 'paragraph',
  166. 'position': f'段落{k+1}',
  167. 'file1_value': paras1[k],
  168. 'file2_value': "",
  169. 'description': f'文件1中存在但文件2中缺失的段落',
  170. 'paragraph_index': k
  171. })
  172. elif tag == 'insert':
  173. for k in range(j1, j2):
  174. differences.append({
  175. 'type': 'paragraph',
  176. 'position': f'段落{k+1}',
  177. 'file1_value': "",
  178. 'file2_value': paras2[k],
  179. 'description': f'文件2中存在但文件1中缺失的段落',
  180. 'paragraph_index': k
  181. })
  182. return differences
  183. def compare_files(self, file1_path: str, file2_path: str) -> Dict:
  184. """比较两个文件"""
  185. # 读取文件
  186. with open(file1_path, 'r', encoding='utf-8') as f:
  187. content1 = f.read()
  188. with open(file2_path, 'r', encoding='utf-8') as f:
  189. content2 = f.read()
  190. # 提取表格和段落
  191. tables1 = self.extract_table_data(content1)
  192. tables2 = self.extract_table_data(content2)
  193. paras1 = self.extract_paragraphs(content1)
  194. paras2 = self.extract_paragraphs(content2)
  195. # 比较结果
  196. all_differences = []
  197. # 比较表格
  198. if tables1 and tables2:
  199. table_diffs = self.compare_tables(tables1[0], tables2[0])
  200. all_differences.extend(table_diffs)
  201. elif tables1 and not tables2:
  202. all_differences.append({
  203. 'type': 'table_structure',
  204. 'position': '表格结构',
  205. 'file1_value': f'包含{len(tables1)}个表格',
  206. 'file2_value': '无表格',
  207. 'description': '文件1包含表格但文件2无表格'
  208. })
  209. elif not tables1 and tables2:
  210. all_differences.append({
  211. 'type': 'table_structure',
  212. 'position': '表格结构',
  213. 'file1_value': '无表格',
  214. 'file2_value': f'包含{len(tables2)}个表格',
  215. 'description': '文件2包含表格但文件1无表格'
  216. })
  217. # 比较段落
  218. para_diffs = self.compare_paragraphs(paras1, paras2)
  219. all_differences.extend(para_diffs)
  220. # 统计信息
  221. stats = {
  222. 'total_differences': len(all_differences),
  223. 'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
  224. 'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
  225. 'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount'])
  226. }
  227. return {
  228. 'differences': all_differences,
  229. 'statistics': stats,
  230. 'file1_tables': len(tables1),
  231. 'file2_tables': len(tables2),
  232. 'file1_paragraphs': len(paras1),
  233. 'file2_paragraphs': len(paras2),
  234. 'file1_path': file1_path,
  235. 'file2_path': file2_path
  236. }
  237. def generate_json_report(self, comparison_result: Dict, output_file: str):
  238. """生成JSON格式的比较报告"""
  239. # report_data = {
  240. # 'comparison_summary': {
  241. # 'timestamp': re.sub(r'[^\w\-_\.]', '_', str(comparison_result.get('timestamp', ''))),
  242. # 'file1': comparison_result['file1_path'],
  243. # 'file2': comparison_result['file2_path'],
  244. # 'statistics': comparison_result['statistics'],
  245. # 'file_info': {
  246. # 'file1_tables': comparison_result['file1_tables'],
  247. # 'file2_tables': comparison_result['file2_tables'],
  248. # 'file1_paragraphs': comparison_result['file1_paragraphs'],
  249. # 'file2_paragraphs': comparison_result['file2_paragraphs']
  250. # }
  251. # },
  252. # 'differences': comparison_result['differences']
  253. # }
  254. with open(output_file, 'w', encoding='utf-8') as f:
  255. json.dump(comparison_result, f, ensure_ascii=False, indent=2)
  256. def generate_markdown_report(self, comparison_result: Dict, output_file: str):
  257. """生成Markdown格式的比较报告"""
  258. with open(output_file, 'w', encoding='utf-8') as f:
  259. f.write("# OCR结果对比报告\n\n")
  260. # 基本信息
  261. f.write("## 基本信息\n\n")
  262. f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
  263. f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
  264. f.write(f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n")
  265. # 统计信息
  266. stats = comparison_result['statistics']
  267. f.write("## 统计信息\n\n")
  268. f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
  269. f.write(f"- 表格差异: **{stats['table_differences']}**\n")
  270. f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
  271. f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
  272. f.write(f"- 文件1表格数: {comparison_result['file1_tables']}\n")
  273. f.write(f"- 文件2表格数: {comparison_result['file2_tables']}\n")
  274. f.write(f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n")
  275. f.write(f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n")
  276. # 差异摘要
  277. if stats['total_differences'] == 0:
  278. f.write("## 结论\n\n")
  279. f.write("🎉 **完美匹配!没有发现任何差异。**\n\n")
  280. else:
  281. f.write("## 差异摘要\n\n")
  282. # 按类型分组显示差异
  283. diff_by_type = {}
  284. for diff in comparison_result['differences']:
  285. diff_type = diff['type']
  286. if diff_type not in diff_by_type:
  287. diff_by_type[diff_type] = []
  288. diff_by_type[diff_type].append(diff)
  289. for diff_type, diffs in diff_by_type.items():
  290. type_name = {
  291. 'table_amount': '💰 表格金额差异',
  292. 'table_text': '📝 表格文本差异',
  293. 'paragraph': '📄 段落差异',
  294. 'table_structure': '🏗️ 表格结构差异'
  295. }.get(diff_type, f'❓ {diff_type}')
  296. f.write(f"### {type_name} ({len(diffs)}个)\n\n")
  297. for i, diff in enumerate(diffs, 1):
  298. f.write(f"**{i}. {diff['position']}**\n")
  299. f.write(f"- 文件1: `{diff['file1_value']}`\n")
  300. f.write(f"- 文件2: `{diff['file2_value']}`\n")
  301. f.write(f"- 说明: {diff['description']}\n\n")
  302. # 详细差异列表
  303. if comparison_result['differences']:
  304. f.write("## 详细差异列表\n\n")
  305. f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 |\n")
  306. f.write("| --- | --- | --- | --- | --- | --- |\n")
  307. for i, diff in enumerate(comparison_result['differences'], 1):
  308. f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
  309. f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
  310. f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
  311. f.write(f"{diff['description']} |\n")
  312. def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
  313. output_format: str = "markdown", ignore_images: bool = True):
  314. """
  315. 比较两个OCR结果文件
  316. Args:
  317. file1_path: 第一个OCR结果文件路径
  318. file2_path: 第二个OCR结果文件路径
  319. output_file: 输出文件名(不含扩展名),默认为"comparison_report"
  320. output_format: 输出格式,选项: 'json', 'markdown', 'both',默认为'markdown'
  321. ignore_images: 是否忽略图片内容,默认为True
  322. Returns:
  323. Dict: 比较结果字典
  324. """
  325. comparator = OCRResultComparator()
  326. print("🔍 开始对比OCR结果...")
  327. print(f"📄 文件1: {file1_path}")
  328. print(f"📄 文件2: {file2_path}")
  329. print(f"📁 输出格式: {output_format}")
  330. print(f"🖼️ 图片处理: {'忽略' if ignore_images else '对比'}")
  331. try:
  332. # 执行比较
  333. result = comparator.compare_files(file1_path, file2_path)
  334. # 添加时间戳
  335. import datetime
  336. result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  337. # 生成报告
  338. if output_format in ['json', 'both']:
  339. json_file = f"{output_file}.json"
  340. comparator.generate_json_report(result, json_file)
  341. print(f"📄 JSON报告已保存至: {json_file}")
  342. if output_format in ['markdown', 'both']:
  343. md_file = f"{output_file}.md"
  344. comparator.generate_markdown_report(result, md_file)
  345. print(f"📝 Markdown报告已保存至: {md_file}")
  346. # 打印简要结果
  347. print(f"\n📊 对比完成!")
  348. print(f" 总差异数: {result['statistics']['total_differences']}")
  349. print(f" 表格差异: {result['statistics']['table_differences']}")
  350. print(f" 金额差异: {result['statistics']['amount_differences']}")
  351. print(f" 段落差异: {result['statistics']['paragraph_differences']}")
  352. # 打印前几个重要差异
  353. if result['differences']:
  354. print(f"\n🔍 前3个重要差异:")
  355. for i, diff in enumerate(result['differences'][:3], 1):
  356. print(f" {i}. {diff['position']}: {diff['description']}")
  357. print(f" 文件1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
  358. print(f" 文件2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
  359. else:
  360. print(f"\n🎉 恭喜!两个文件内容完全一致!")
  361. # 添加处理统计信息(模仿 ocr_by_vlm.py 的风格)
  362. print("\n📊 对比处理统计")
  363. print(f" 文件1路径: {result['file1_path']}")
  364. print(f" 文件2路径: {result['file2_path']}")
  365. print(f" 输出文件: {output_file}")
  366. print(f" 输出格式: {output_format}")
  367. print(f" 忽略图片: {ignore_images}")
  368. print(f" 处理时间: {result['timestamp']}")
  369. print(f" 文件1表格数: {result['file1_tables']}")
  370. print(f" 文件2表格数: {result['file2_tables']}")
  371. print(f" 文件1段落数: {result['file1_paragraphs']}")
  372. print(f" 文件2段落数: {result['file2_paragraphs']}")
  373. return result
  374. except Exception as e:
  375. import traceback
  376. traceback.print_exc()
  377. raise Exception(f"OCR对比任务失败: {e}")
  378. def main():
  379. """主函数 - 保持向后兼容"""
  380. parser = argparse.ArgumentParser(description='OCR结果对比工具')
  381. parser.add_argument('file1', help='第一个OCR结果文件路径')
  382. parser.add_argument('file2', help='第二个OCR结果文件路径')
  383. parser.add_argument('-o', '--output', default='comparison_report',
  384. help='输出文件名(不含扩展名)')
  385. parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
  386. default='markdown', help='输出格式: json, markdown, 或 both')
  387. parser.add_argument('--ignore-images', action='store_true',
  388. help='忽略图片内容(默认已启用)')
  389. args = parser.parse_args()
  390. try:
  391. result = compare_ocr_results(
  392. file1_path=args.file1,
  393. file2_path=args.file2,
  394. output_file=args.output,
  395. output_format=args.format,
  396. ignore_images=args.ignore_images
  397. )
  398. print("\n🎉 OCR对比完成!")
  399. return 0
  400. except Exception as e:
  401. print(f"❌ OCR对比失败: {e}")
  402. return 1
  403. if __name__ == "__main__":
  404. parser = argparse.ArgumentParser(description='OCR结果对比工具')
  405. parser.add_argument('file1', nargs= '?', help='第一个OCR结果文件路径')
  406. parser.add_argument('file2', nargs= '?', help='第二个OCR结果文件路径')
  407. parser.add_argument('-o', '--output', default='comparison_report',
  408. help='输出文件名(不含扩展名)')
  409. parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
  410. default='markdown', help='输出格式: json, markdown, 或 both')
  411. parser.add_argument('--ignore-images', action='store_true',
  412. help='忽略图片内容(默认已启用)')
  413. args = parser.parse_args()
  414. if args.file1 and args.file2:
  415. result = compare_ocr_results(
  416. file1_path=args.file1,
  417. file2_path=args.file2,
  418. output_file=args.output,
  419. output_format=args.format,
  420. ignore_images=args.ignore_images
  421. )
  422. else:
  423. # 如果sys.argv没有被传入参数,则提供默认参数用于测试
  424. result = compare_ocr_results(
  425. file1_path='./output/dots.ocr/至远彩色印刷工业有限公司-2022年母公司_2.md',
  426. file2_path='./output/Qwen2.5-VL-72B-Instruct-AWQ/至远彩色印刷工业有限公司-2022年母公司_2.md',
  427. output_file=f'./output/comparison_result_{time.strftime("%Y%m%d_%H%M%S")}',
  428. output_format='both',
  429. ignore_images=True
  430. )
  431. print("\n🎉 OCR对比完成!")