# compare_ocr_results.py
import argparse
import datetime
import difflib
import json
import math
import re
from typing import Dict, List, Tuple

import markdown
from bs4 import BeautifulSoup
  8. class OCRResultComparator:
  9. def __init__(self):
  10. self.differences = []
  11. def normalize_text(self, text: str) -> str:
  12. """标准化文本:去除多余空格、回车等无效字符"""
  13. if not text:
  14. return ""
  15. # 去除多余的空白字符
  16. text = re.sub(r'\s+', ' ', text.strip())
  17. # 去除标点符号周围的空格
  18. text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
  19. return text
  20. def is_image_reference(self, text: str) -> bool:
  21. """判断是否为图片引用或描述"""
  22. image_keywords = [
  23. '图', '图片', '图像', 'image', 'figure', 'fig',
  24. '照片', '截图', '示意图', '流程图', '结构图'
  25. ]
  26. # 检查是否包含图片相关关键词
  27. for keyword in image_keywords:
  28. if keyword in text.lower():
  29. return True
  30. # 检查是否为Markdown图片语法
  31. if re.search(r'!\[.*?\]\(.*?\)', text):
  32. return True
  33. # 检查是否为HTML图片标签
  34. if re.search(r'<img[^>]*>', text, re.IGNORECASE):
  35. return True
  36. return False
  37. def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
  38. """从Markdown中提取表格数据"""
  39. tables = []
  40. # 使用BeautifulSoup解析HTML表格
  41. soup = BeautifulSoup(md_content, 'html.parser')
  42. html_tables = soup.find_all('table')
  43. for table in html_tables:
  44. table_data = []
  45. rows = table.find_all('tr')
  46. for row in rows:
  47. cells = row.find_all(['td', 'th'])
  48. row_data = []
  49. for cell in cells:
  50. cell_text = self.normalize_text(cell.get_text())
  51. # 跳过图片内容
  52. if not self.is_image_reference(cell_text):
  53. row_data.append(cell_text)
  54. else:
  55. row_data.append("[图片内容-忽略]")
  56. if row_data: # 只添加非空行
  57. table_data.append(row_data)
  58. if table_data:
  59. tables.append(table_data)
  60. return tables
  61. def extract_paragraphs(self, md_content: str) -> List[str]:
  62. """提取段落文本"""
  63. # 移除表格
  64. content = re.sub(r'<table>.*?</table>', '', md_content, flags=re.DOTALL)
  65. # 移除HTML标签
  66. content = re.sub(r'<[^>]+>', '', content)
  67. # 移除Markdown注释
  68. content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
  69. # 分割段落
  70. paragraphs = []
  71. lines = content.split('\n')
  72. for line in lines:
  73. normalized = self.normalize_text(line)
  74. if normalized and not normalized.startswith('#'):
  75. # 跳过图片内容
  76. if not self.is_image_reference(normalized):
  77. paragraphs.append(normalized)
  78. return paragraphs
  79. def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
  80. """比较表格数据"""
  81. differences = []
  82. # 确定最大行数
  83. max_rows = max(len(table1), len(table2))
  84. for i in range(max_rows):
  85. row1 = table1[i] if i < len(table1) else []
  86. row2 = table2[i] if i < len(table2) else []
  87. # 确定最大列数
  88. max_cols = max(len(row1), len(row2))
  89. for j in range(max_cols):
  90. cell1 = row1[j] if j < len(row1) else ""
  91. cell2 = row2[j] if j < len(row2) else ""
  92. # 跳过图片内容比较
  93. if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
  94. continue
  95. if cell1 != cell2:
  96. # 特别处理数字金额
  97. if self.is_numeric(cell1) and self.is_numeric(cell2):
  98. num1 = self.parse_number(cell1)
  99. num2 = self.parse_number(cell2)
  100. if abs(num1 - num2) > 0.001: # 允许小数精度误差
  101. differences.append({
  102. 'type': 'table_amount',
  103. 'position': f'行{i+1}列{j+1}',
  104. 'file1_value': cell1,
  105. 'file2_value': cell2,
  106. 'description': f'金额不一致: {cell1} vs {cell2}',
  107. 'row_index': i,
  108. 'col_index': j
  109. })
  110. else:
  111. differences.append({
  112. 'type': 'table_text',
  113. 'position': f'行{i+1}列{j+1}',
  114. 'file1_value': cell1,
  115. 'file2_value': cell2,
  116. 'description': f'文本不一致: {cell1} vs {cell2}',
  117. 'row_index': i,
  118. 'col_index': j
  119. })
  120. return differences
  121. def is_numeric(self, text: str) -> bool:
  122. """判断文本是否为数字"""
  123. if not text:
  124. return False
  125. # 移除千分位分隔符和负号
  126. clean_text = re.sub(r'[,,-]', '', text)
  127. try:
  128. float(clean_text)
  129. return True
  130. except ValueError:
  131. return False
  132. def parse_number(self, text: str) -> float:
  133. """解析数字"""
  134. if not text:
  135. return 0.0
  136. clean_text = re.sub(r'[,,]', '', text)
  137. try:
  138. return float(clean_text)
  139. except ValueError:
  140. return 0.0
  141. def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
  142. """比较段落文本"""
  143. differences = []
  144. # 使用difflib进行文本比较
  145. matcher = difflib.SequenceMatcher(None, paras1, paras2)
  146. for tag, i1, i2, j1, j2 in matcher.get_opcodes():
  147. if tag == 'replace':
  148. for k in range(max(i2-i1, j2-j1)):
  149. para1 = paras1[i1+k] if i1+k < i2 else ""
  150. para2 = paras2[j1+k] if j1+k < j2 else ""
  151. if para1 != para2:
  152. differences.append({
  153. 'type': 'paragraph',
  154. 'position': f'段落{i1+k+1}',
  155. 'file1_value': para1,
  156. 'file2_value': para2,
  157. 'description': f'段落文本不一致',
  158. 'paragraph_index': i1+k
  159. })
  160. elif tag == 'delete':
  161. for k in range(i1, i2):
  162. differences.append({
  163. 'type': 'paragraph',
  164. 'position': f'段落{k+1}',
  165. 'file1_value': paras1[k],
  166. 'file2_value': "",
  167. 'description': f'文件1中存在但文件2中缺失的段落',
  168. 'paragraph_index': k
  169. })
  170. elif tag == 'insert':
  171. for k in range(j1, j2):
  172. differences.append({
  173. 'type': 'paragraph',
  174. 'position': f'段落{k+1}',
  175. 'file1_value': "",
  176. 'file2_value': paras2[k],
  177. 'description': f'文件2中存在但文件1中缺失的段落',
  178. 'paragraph_index': k
  179. })
  180. return differences
  181. def compare_files(self, file1_path: str, file2_path: str) -> Dict:
  182. """比较两个文件"""
  183. # 读取文件
  184. with open(file1_path, 'r', encoding='utf-8') as f:
  185. content1 = f.read()
  186. with open(file2_path, 'r', encoding='utf-8') as f:
  187. content2 = f.read()
  188. # 提取表格和段落
  189. tables1 = self.extract_table_data(content1)
  190. tables2 = self.extract_table_data(content2)
  191. paras1 = self.extract_paragraphs(content1)
  192. paras2 = self.extract_paragraphs(content2)
  193. # 比较结果
  194. all_differences = []
  195. # 比较表格
  196. if tables1 and tables2:
  197. table_diffs = self.compare_tables(tables1[0], tables2[0])
  198. all_differences.extend(table_diffs)
  199. elif tables1 and not tables2:
  200. all_differences.append({
  201. 'type': 'table_structure',
  202. 'position': '表格结构',
  203. 'file1_value': f'包含{len(tables1)}个表格',
  204. 'file2_value': '无表格',
  205. 'description': '文件1包含表格但文件2无表格'
  206. })
  207. elif not tables1 and tables2:
  208. all_differences.append({
  209. 'type': 'table_structure',
  210. 'position': '表格结构',
  211. 'file1_value': '无表格',
  212. 'file2_value': f'包含{len(tables2)}个表格',
  213. 'description': '文件2包含表格但文件1无表格'
  214. })
  215. # 比较段落
  216. para_diffs = self.compare_paragraphs(paras1, paras2)
  217. all_differences.extend(para_diffs)
  218. # 统计信息
  219. stats = {
  220. 'total_differences': len(all_differences),
  221. 'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
  222. 'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
  223. 'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount'])
  224. }
  225. return {
  226. 'differences': all_differences,
  227. 'statistics': stats,
  228. 'file1_tables': len(tables1),
  229. 'file2_tables': len(tables2),
  230. 'file1_paragraphs': len(paras1),
  231. 'file2_paragraphs': len(paras2),
  232. 'file1_path': file1_path,
  233. 'file2_path': file2_path
  234. }
  235. def generate_json_report(self, comparison_result: Dict, output_file: str):
  236. """生成JSON格式的比较报告"""
  237. report_data = {
  238. 'comparison_summary': {
  239. 'timestamp': re.sub(r'[^\w\-_\.]', '_', str(comparison_result.get('timestamp', ''))),
  240. 'file1': comparison_result['file1_path'],
  241. 'file2': comparison_result['file2_path'],
  242. 'statistics': comparison_result['statistics'],
  243. 'file_info': {
  244. 'file1_tables': comparison_result['file1_tables'],
  245. 'file2_tables': comparison_result['file2_tables'],
  246. 'file1_paragraphs': comparison_result['file1_paragraphs'],
  247. 'file2_paragraphs': comparison_result['file2_paragraphs']
  248. }
  249. },
  250. 'differences': comparison_result['differences']
  251. }
  252. with open(output_file, 'w', encoding='utf-8') as f:
  253. json.dump(report_data, f, ensure_ascii=False, indent=2)
  254. def generate_markdown_report(self, comparison_result: Dict, output_file: str):
  255. """生成Markdown格式的比较报告"""
  256. with open(output_file, 'w', encoding='utf-8') as f:
  257. f.write("# OCR结果对比报告\n\n")
  258. # 基本信息
  259. f.write("## 基本信息\n\n")
  260. f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
  261. f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
  262. f.write(f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n")
  263. # 统计信息
  264. stats = comparison_result['statistics']
  265. f.write("## 统计信息\n\n")
  266. f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
  267. f.write(f"- 表格差异: **{stats['table_differences']}**\n")
  268. f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
  269. f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
  270. f.write(f"- 文件1表格数: {comparison_result['file1_tables']}\n")
  271. f.write(f"- 文件2表格数: {comparison_result['file2_tables']}\n")
  272. f.write(f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n")
  273. f.write(f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n")
  274. # 差异摘要
  275. if stats['total_differences'] == 0:
  276. f.write("## 结论\n\n")
  277. f.write("🎉 **完美匹配!没有发现任何差异。**\n\n")
  278. else:
  279. f.write("## 差异摘要\n\n")
  280. # 按类型分组显示差异
  281. diff_by_type = {}
  282. for diff in comparison_result['differences']:
  283. diff_type = diff['type']
  284. if diff_type not in diff_by_type:
  285. diff_by_type[diff_type] = []
  286. diff_by_type[diff_type].append(diff)
  287. for diff_type, diffs in diff_by_type.items():
  288. type_name = {
  289. 'table_amount': '💰 表格金额差异',
  290. 'table_text': '📝 表格文本差异',
  291. 'paragraph': '📄 段落差异',
  292. 'table_structure': '🏗️ 表格结构差异'
  293. }.get(diff_type, f'❓ {diff_type}')
  294. f.write(f"### {type_name} ({len(diffs)}个)\n\n")
  295. for i, diff in enumerate(diffs, 1):
  296. f.write(f"**{i}. {diff['position']}**\n")
  297. f.write(f"- 文件1: `{diff['file1_value']}`\n")
  298. f.write(f"- 文件2: `{diff['file2_value']}`\n")
  299. f.write(f"- 说明: {diff['description']}\n\n")
  300. # 详细差异列表
  301. if comparison_result['differences']:
  302. f.write("## 详细差异列表\n\n")
  303. f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 |\n")
  304. f.write("| --- | --- | --- | --- | --- | --- |\n")
  305. for i, diff in enumerate(comparison_result['differences'], 1):
  306. f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
  307. f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
  308. f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
  309. f.write(f"{diff['description']} |\n")
  310. def main():
  311. """主函数"""
  312. parser = argparse.ArgumentParser(description='OCR结果对比工具')
  313. parser.add_argument('file1', help='第一个OCR结果文件路径')
  314. parser.add_argument('file2', help='第二个OCR结果文件路径')
  315. parser.add_argument('-o', '--output', default='comparison_report',
  316. help='输出文件名(不含扩展名)')
  317. parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
  318. default='markdown', help='输出格式: json, markdown, 或 both')
  319. parser.add_argument('--ignore-images', action='store_true',
  320. help='忽略图片内容(默认已启用)')
  321. args = parser.parse_args()
  322. comparator = OCRResultComparator()
  323. print("🔍 开始对比OCR结果...")
  324. print(f"📄 文件1: {args.file1}")
  325. print(f"📄 文件2: {args.file2}")
  326. print(f"📁 输出格式: {args.format}")
  327. print(f"🖼️ 图片处理: {'忽略' if args.ignore_images else '对比'}")
  328. try:
  329. # 执行比较
  330. result = comparator.compare_files(args.file1, args.file2)
  331. # 添加时间戳
  332. import datetime
  333. result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  334. # 生成报告
  335. if args.format in ['json', 'both']:
  336. json_file = f"{args.output}.json"
  337. comparator.generate_json_report(result, json_file)
  338. print(f"📄 JSON报告已保存至: {json_file}")
  339. if args.format in ['markdown', 'both']:
  340. md_file = f"{args.output}.md"
  341. comparator.generate_markdown_report(result, md_file)
  342. print(f"📝 Markdown报告已保存至: {md_file}")
  343. # 打印简要结果
  344. print(f"\n📊 对比完成!")
  345. print(f" 总差异数: {result['statistics']['total_differences']}")
  346. print(f" 表格差异: {result['statistics']['table_differences']}")
  347. print(f" 金额差异: {result['statistics']['amount_differences']}")
  348. print(f" 段落差异: {result['statistics']['paragraph_differences']}")
  349. # 打印前几个重要差异
  350. if result['differences']:
  351. print(f"\n🔍 前3个重要差异:")
  352. for i, diff in enumerate(result['differences'][:3], 1):
  353. print(f" {i}. {diff['position']}: {diff['description']}")
  354. print(f" 文件1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
  355. print(f" 文件2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
  356. else:
  357. print(f"\n🎉 恭喜!两个文件内容完全一致!")
  358. except Exception as e:
  359. print(f"❌ 对比过程中出现错误: {e}")
  360. return 1
  361. return 0
  362. if __name__ == "__main__":
  363. # 如果sys.argv没有被传入参数,则提供默认参数用于测试
  364. import sys
  365. if len(sys.argv) == 1:
  366. sys.argv.extend([
  367. './output/至远彩色印刷工业有限公司-2022年母公司_2.md', './sample_data/demo_54fa7ad0_page_1_nohf.md',
  368. '-o', './output/comparison_result',
  369. '-f', 'both',
  370. '--ignore-images'])
  371. exit(main())