compare_ocr_results.py

import time
import re
import difflib
import json
import argparse
import datetime
from typing import Dict, List

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

class OCRResultComparator:
    def __init__(self):
        self.differences = []
        self.similarity_threshold = 85  # similarity threshold: 85% or higher counts as a match
        self.max_paragraph_window = 6   # maximum number of paragraphs merged into one window

    def normalize_text(self, text: str) -> str:
        """Normalize text: strip redundant whitespace, line breaks, and similar noise."""
        if not text:
            return ""
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text.strip())
        # Drop spaces around full-width punctuation
        text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
        return text
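
    # Illustrative behavior (doctest-style, hypothetical input):
    #   >>> OCRResultComparator().normalize_text("合计 :  1,000 元\n")
    #   '合计:1,000 元'
    # Whitespace runs collapse to one space first, then the spaces around
    # full-width punctuation are removed entirely.
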
    def is_image_reference(self, text: str) -> bool:
        """Check whether the text is an image reference or image description."""
        # Keywords that typically mark image content; the Chinese terms are
        # functional detection data and must stay as-is
        image_keywords = [
            '图', '图片', '图像', 'image', 'figure', 'fig',
            '照片', '截图', '示意图', '流程图', '结构图'
        ]
        # Check for image-related keywords
        for keyword in image_keywords:
            if keyword in text.lower():
                return True
        # Check for Markdown image syntax
        if re.search(r'!\[.*?\]\(.*?\)', text):
            return True
        # Check for HTML <img> tags
        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
            return True
        return False

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Extract table data from Markdown content."""
        tables = []
        # Parse embedded HTML tables with BeautifulSoup
        soup = BeautifulSoup(md_content, 'html.parser')
        html_tables = soup.find_all('table')
        for table in html_tables:
            table_data = []
            rows = table.find_all('tr')
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_data = []
                for cell in cells:
                    cell_text = self.normalize_text(cell.get_text())
                    # Replace image content with a placeholder
                    if not self.is_image_reference(cell_text):
                        row_data.append(cell_text)
                    else:
                        row_data.append("[image content - ignored]")
                if row_data:  # only keep non-empty rows
                    table_data.append(row_data)
            if table_data:
                tables.append(table_data)
        return tables

    def merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge consecutive non-empty lines into paragraphs, filtering out image content."""
        merged_lines = []
        current_paragraph = ""
        for line in lines:
            # A blank line ends the current paragraph
            if not line:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                continue
            # Skip image content
            if self.is_image_reference(line):
                continue
            # Headings start with a Chinese ordinal, an Arabic numeral, or '#'
            is_title = (
                line.startswith(('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、')) or
                line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')) or
                line.startswith('#')
            )
            # A heading terminates the current paragraph
            if is_title:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                merged_lines.append(line)
            else:
                # Merge with the previous line when the current paragraph is
                # non-empty and does not already end in whitespace
                if current_paragraph and not current_paragraph.endswith((' ', '\t')):
                    current_paragraph += line
                else:
                    # flush before starting a new paragraph so nothing is lost
                    if current_paragraph:
                        merged_lines.append(current_paragraph)
                    current_paragraph = line
        # Flush the final paragraph
        if current_paragraph:
            merged_lines.append(current_paragraph)
        return merged_lines
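
    # Example (hypothetical OCR lines):
    #   ["1. Overview", "", "Total assets increa", "sed by 10%."]
    # merges to
    #   ["1. Overview", "Total assets increased by 10%."]
    # The heading is kept as its own entry and the split sentence is stitched
    # back together.
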
    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Extract paragraph text."""
        # Remove tables
        content = re.sub(r'<table>.*?</table>', '', md_content, flags=re.DOTALL)
        # Remove HTML tags
        content = re.sub(r'<[^>]+>', '', content)
        # Remove HTML comments (largely redundant after the tag pattern above)
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
        # Split into paragraphs
        paragraphs = []
        lines = content.split('\n')
        merged_lines = self.merge_split_paragraphs(lines)
        for line in merged_lines:
            normalized = self.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)
            else:
                print(f"Skipping empty or invalid paragraph: {line[0:30]}...")
        return paragraphs

    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Compare two tables cell by cell."""
        differences = []
        # Walk every row up to the longer table's length
        max_rows = max(len(table1), len(table2))
        for i in range(max_rows):
            row1 = table1[i] if i < len(table1) else []
            row2 = table2[i] if i < len(table2) else []
            # Walk every column up to the longer row's length
            max_cols = max(len(row1), len(row2))
            for j in range(max_cols):
                cell1 = row1[j] if j < len(row1) else ""
                cell2 = row2[j] if j < len(row2) else ""
                # Skip image placeholders
                if "[image content - ignored]" in cell1 or "[image content - ignored]" in cell2:
                    continue
                if cell1 != cell2:
                    # Treat numeric amounts specially
                    if self.is_numeric(cell1) and self.is_numeric(cell2):
                        num1 = self.parse_number(cell1)
                        num2 = self.parse_number(cell2)
                        if abs(num1 - num2) > 0.001:  # tolerate floating-point rounding
                            differences.append({
                                'type': 'table_amount',
                                'position': f'row {i+1}, col {j+1}',
                                'file1_value': cell1,
                                'file2_value': cell2,
                                'description': f'Amount mismatch: {cell1} vs {cell2}',
                                'row_index': i,
                                'col_index': j
                            })
                    else:
                        differences.append({
                            'type': 'table_text',
                            'position': f'row {i+1}, col {j+1}',
                            'file1_value': cell1,
                            'file2_value': cell2,
                            'description': f'Text mismatch: {cell1} vs {cell2}',
                            'row_index': i,
                            'col_index': j
                        })
        return differences
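
    # Example record emitted for a mismatched amount cell (values illustrative):
    #   {'type': 'table_amount', 'position': 'row 2, col 3',
    #    'file1_value': '1,000.00', 'file2_value': '1,001.00',
    #    'description': 'Amount mismatch: 1,000.00 vs 1,001.00',
    #    'row_index': 1, 'col_index': 2}
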
    def is_numeric(self, text: str) -> bool:
        """Check whether the text parses as a number."""
        if not text:
            return False
        # Drop thousands separators (full- and half-width commas) and minus signs
        clean_text = re.sub(r'[,,-]', '', text)
        try:
            float(clean_text)
            return True
        except ValueError:
            return False

    def parse_number(self, text: str) -> float:
        """Parse a number, ignoring thousands separators."""
        if not text:
            return 0.0
        clean_text = re.sub(r'[,,]', '', text)
        try:
            return float(clean_text)
        except ValueError:
            return 0.0

    def normalize_text_for_comparison(self, text: str) -> str:
        """Stronger text normalization used for semantic comparison."""
        if not text:
            return ""
        # Strip Markdown formatting
        text = re.sub(r'#{1,6}\s*', '', text)          # heading markers
        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)   # bold
        text = re.sub(r'\*(.+?)\*', r'\1', text)       # italic
        text = re.sub(r'`(.+?)`', r'\1', text)         # inline code
        text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)  # comments
        # Map full-width punctuation to ASCII equivalents
        punctuation_map = {
            ',': ',', '。': '.', ':': ':', ';': ';',
            '!': '!', '?': '?', '(': '(', ')': ')',
            '【': '[', '】': ']', '《': '<', '》': '>',
            '\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'",
            '、': ',', '…': '...'
        }
        for chinese_punct, english_punct in punctuation_map.items():
            text = text.replace(chinese_punct, english_punct)
        # Collapse redundant whitespace
        text = re.sub(r'\s+', ' ', text.strip())
        # Drop spaces around punctuation
        text = re.sub(r'\s*([,.():;!?])\s*', r'\1', text)
        return text

    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """Similarity score that combines several fuzzy-matching strategies."""
        if not text1 and not text2:
            return 100.0
        if not text1 or not text2:
            return 0.0
        # Identical after normalization: 100%
        if text1 == text2:
            return 100.0
        # Try several similarity algorithms
        similarity_scores = [
            fuzz.ratio(text1, text2),
            fuzz.partial_ratio(text1, text2),
            fuzz.token_sort_ratio(text1, text2),
            fuzz.token_set_ratio(text1, text2)
        ]
        # Containment earns a bonus on top of the best raw score
        if text1 in text2 or text2 in text1:
            max_score = max(similarity_scores)
            return min(100.0, max_score + 10)
        return max(similarity_scores)
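
    # Illustrative scorer behavior (assuming fuzzywuzzy's default scorers):
    #   fuzz.ratio("abc", "abc")            -> 100  (exact match)
    #   fuzz.partial_ratio("abcdef", "cde") -> 100  (substring match)
    #   fuzz.token_sort_ratio("a b", "b a") -> 100  (word order ignored)
    # Taking the max of the four scorers makes the comparison tolerant of
    # reordering and partial overlap between the two OCR outputs.
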
    def compare_paragraphs_with_flexible_matching(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Improved paragraph matching that tolerates paragraph re-segmentation."""
        differences = []
        # Pre-process both sides with normalize_text_for_comparison
        meaningful_paras1 = [self.normalize_text_for_comparison(p) for p in paras1]
        meaningful_paras2 = [self.normalize_text_for_comparison(p) for p in paras2]
        # Track which paragraphs have already been matched
        used_paras1 = set()
        used_paras2 = set()
        best_match = {'similarity': 0.0}
        # Both files are scanned forward together; after a match, file 2's
        # search window starts just past the matched paragraphs
        paras2_idx = 0
        for window_size1 in range(1, min(self.max_paragraph_window, len(meaningful_paras1)) + 1):  # up to 6 paragraphs
            for i in range(len(meaningful_paras1) - window_size1 + 1):
                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
                    continue
                # Concatenate window_size1 consecutive paragraphs from file 1
                combined_para1 = "".join(meaningful_paras1[i:i+window_size1])
                # Find the best match inside file 2's current window
                best_match = self._find_best_match_in_paras2_improved(
                    combined_para1,
                    meaningful_paras2[paras2_idx: min(paras2_idx + self.max_paragraph_window, len(meaningful_paras2))],
                    paras2_idx
                )
                if best_match and best_match['similarity'] >= self.similarity_threshold:
                    paras2_idx = best_match['indices'][-1] + 1  # advance file 2's window
                    # Mark both sides as matched
                    for idx in range(i, i + window_size1):
                        used_paras1.add(idx)
                    for idx in best_match['indices']:
                        used_paras2.add(idx)
                    # Only record a difference when the match is clearly imperfect
                    if best_match['similarity'] < 95.0:  # stricter 95% threshold
                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                        differences.append({
                            'type': 'paragraph',
                            'position': f'paragraph {i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_para1,
                            'file2_value': best_match['text'],
                            'description': f'Paragraph formatting difference (similarity: {best_match["similarity"]:.1f}%)',
                            'similarity': best_match['similarity'],
                            'severity': severity
                        })
            if paras2_idx >= len(meaningful_paras2):
                break  # all of file 2 has been matched
        # Report paragraphs that never matched
        for i, para in enumerate(meaningful_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {i+1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': 'Paragraph unique to file 1',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        for j, para in enumerate(meaningful_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {j+1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': 'Paragraph unique to file 2',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        return differences
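
    # Matching sketch: both files are scanned left to right. Windows of 1..6
    # consecutive file-1 paragraphs are concatenated and compared against
    # windows drawn from file 2 starting at paras2_idx; once a window pair
    # clears similarity_threshold, its paragraphs are marked used and file 2's
    # search window advances past the match. Anything never marked used is
    # reported as unique to its file.
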
    def _find_best_match_in_paras2_improved(self, target_text: str, paras2: List[str],
                                            paras2_idx: int) -> Dict:
        """Find the best-matching window of consecutive file-2 paragraphs."""
        best_match = None
        for window_size in range(1, len(paras2) + 1):
            for j in range(len(paras2) - window_size + 1):
                combined_para2 = "".join(paras2[j:j+window_size])
                similarity = self.calculate_text_similarity(target_text, combined_para2)
                if best_match and best_match['similarity'] == 100.0:
                    break  # perfect match already found, stop early
                if not best_match or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        # indices are absolute positions in the full paras2 list
                        'indices': list(range(j + paras2_idx, j + paras2_idx + window_size))
                    }
            if best_match and best_match['similarity'] == 100.0:
                break  # perfect match already found, stop early
        # Return an empty result when nothing matched at all
        if best_match is None:
            return {
                'text': '',
                'similarity': 0.0,
                'indices': []
            }
        return best_match

    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
        """Compare two OCR result files."""
        # Read both files
        with open(file1_path, 'r', encoding='utf-8') as f:
            content1 = f.read()
        with open(file2_path, 'r', encoding='utf-8') as f:
            content2 = f.read()
        # Extract tables and paragraphs
        tables1 = self.extract_table_data(content1)
        tables2 = self.extract_table_data(content2)
        paras1 = self.extract_paragraphs(content1)
        paras2 = self.extract_paragraphs(content2)
        # Collect all differences
        all_differences = []
        # Compare tables (original logic: only the first table of each file)
        if tables1 and tables2:
            table_diffs = self.compare_tables(tables1[0], tables2[0])
            all_differences.extend(table_diffs)
        elif tables1 and not tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': 'table structure',
                'file1_value': f'{len(tables1)} table(s)',
                'file2_value': 'no tables',
                'description': 'File 1 contains tables but file 2 has none',
                'severity': 'high'
            })
        elif not tables1 and tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': 'table structure',
                'file1_value': 'no tables',
                'file2_value': f'{len(tables2)} table(s)',
                'description': 'File 2 contains tables but file 1 has none',
                'severity': 'high'
            })
        # Enhanced paragraph comparison
        para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
        all_differences.extend(para_diffs)
        # # Generate a unified diff report
        # unified_diff_data = self.generate_unified_diff_report(
        #     paras1, paras2, file1_path, file2_path,
        #     "./output/pre_validation/unified_diff_comparison"
        # )
        # Summary statistics
        stats = {
            'total_differences': len(all_differences),
            'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
            'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
            'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
            'high_severity': len([d for d in all_differences if d.get('severity') == 'high']),
            'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
            'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
        }
        result = {
            'differences': all_differences,
            'statistics': stats,
            'file1_tables': len(tables1),
            'file2_tables': len(tables2),
            'file1_paragraphs': len(paras1),
            'file2_paragraphs': len(paras2),
            'file1_path': file1_path,
            'file2_path': file2_path,
            # 'unified_diff': unified_diff_data  # optional unified diff payload
        }
        return result

    def generate_unified_diff(self, paras1: List[str], paras2: List[str], file1_path: str, file2_path: str) -> Dict:
        """
        Produce a git-style unified diff of the two paragraph lists and
        return it as structured data.
        """
        # Pre-process both sides with normalize_text_for_comparison
        file1_lines = [self.normalize_text_for_comparison(p) for p in paras1]
        file2_lines = [self.normalize_text_for_comparison(p) for p in paras2]
        # Generate the diff
        diff = difflib.unified_diff(
            file1_lines,
            file2_lines,
            fromfile=file1_path,
            tofile=file2_path,
            lineterm=''  # do not append an extra newline to each line
        )
        # Materialize the generator
        diff_output = list(diff)
        # Parse the diff output into structured data
        structured_diff = self._parse_unified_diff(diff_output, file1_lines, file2_lines, file1_path, file2_path)
        return structured_diff

    def _parse_unified_diff(self, diff_lines: List[str], file1_lines: List[str], file2_lines: List[str],
                            file1_path: str, file2_path: str) -> Dict:
        """Parse unified-diff output into structured data."""
        differences = []
        current_hunk = None
        file1_line_num = 0
        file2_line_num = 0
        for line in diff_lines:
            if line.startswith('---') or line.startswith('+++'):
                continue
            elif line.startswith('@@'):
                # Parse a hunk header such as: @@ -1,5 +1,4 @@
                match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line)
                if match:
                    file1_start = int(match.group(1))
                    file1_count = int(match.group(2)) if match.group(2) else 1
                    file2_start = int(match.group(3))
                    file2_count = int(match.group(4)) if match.group(4) else 1
                    current_hunk = {
                        'file1_start': file1_start,
                        'file1_count': file1_count,
                        'file2_start': file2_start,
                        'file2_count': file2_count
                    }
                    file1_line_num = file1_start - 1  # convert to 0-based index
                    file2_line_num = file2_start - 1
            elif line.startswith(' '):
                # Unchanged line
                file1_line_num += 1
                file2_line_num += 1
            elif line.startswith('-'):
                # Line removed from file 1
                content = line[1:]  # drop the '-' prefix
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {file1_line_num + 1}',
                    'file1_value': content,
                    'file2_value': "",
                    'description': 'Paragraph unique to file 1',
                    'similarity': 0.0,
                    'severity': 'medium',
                    'line_number': file1_line_num + 1,
                    'change_type': 'deletion'
                })
                file1_line_num += 1
            elif line.startswith('+'):
                # Line added in file 2
                content = line[1:]  # drop the '+' prefix
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {file2_line_num + 1}',
                    'file1_value': "",
                    'file2_value': content,
                    'description': 'Paragraph unique to file 2',
                    'similarity': 0.0,
                    'severity': 'medium',
                    'line_number': file2_line_num + 1,
                    'change_type': 'addition'
                })
                file2_line_num += 1
        # Summary statistics
        stats = {
            'total_differences': len(differences),
            'table_differences': 0,  # the diff view carries no table differences
            'paragraph_differences': len(differences),
            'amount_differences': 0,
            'high_severity': len([d for d in differences if d.get('severity') == 'high']),
            'medium_severity': len([d for d in differences if d.get('severity') == 'medium']),
            'low_severity': len([d for d in differences if d.get('severity') == 'low']),
            'deletions': len([d for d in differences if d.get('change_type') == 'deletion']),
            'additions': len([d for d in differences if d.get('change_type') == 'addition'])
        }
        return {
            'differences': differences,
            'statistics': stats,
            'file1_tables': 0,
            'file2_tables': 0,
            'file1_paragraphs': len(file1_lines),
            'file2_paragraphs': len(file2_lines),
            'file1_path': file1_path,
            'file2_path': file2_path,
            'diff_type': 'unified_diff'
        }
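
    # Example hunk-header parse (standard unified-diff format):
    #   "@@ -1,5 +1,4 @@" -> file1_start=1, file1_count=5,
    #                        file2_start=1, file2_count=4
    # Counts default to 1 when the ",N" part is omitted, e.g. "@@ -3 +4 @@".
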
    def generate_unified_diff_report(self, paras1: List[str], paras2: List[str], file1_path: str, file2_path: str, output_file: str):
        """Write JSON and Markdown reports for the unified diff."""
        # Build the structured diff data
        diff_data = self.generate_unified_diff(paras1, paras2, file1_path, file2_path)
        # Add a timestamp
        diff_data['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # JSON report
        json_file = f"{output_file}_unified_diff.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(diff_data, f, ensure_ascii=False, indent=2)
        # Markdown report
        md_file = f"{output_file}_unified_diff.md"
        self._generate_unified_diff_markdown(diff_data, md_file)
        print(f"📄 Unified diff JSON report: {json_file}")
        print(f"📝 Unified diff Markdown report: {md_file}")
        return diff_data

    def _generate_unified_diff_markdown(self, diff_data: Dict, output_file: str):
        """Write the unified diff as a Markdown report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR Result Unified Diff Report\n\n")
            # Basic information
            f.write("## Basic Information\n\n")
            f.write(f"- **File 1**: `{diff_data['file1_path']}`\n")
            f.write(f"- **File 2**: `{diff_data['file2_path']}`\n")
            f.write(f"- **Compared at**: {diff_data.get('timestamp', 'N/A')}\n")
            f.write(f"- **Comparison mode**: Unified Diff\n\n")
            # Statistics
            stats = diff_data['statistics']
            f.write("## Statistics\n\n")
            f.write(f"- Total differences: **{stats['total_differences']}**\n")
            f.write(f"- Deleted lines: **{stats['deletions']}**\n")
            f.write(f"- Added lines: **{stats['additions']}**\n")
            f.write(f"- Paragraphs in file 1: {diff_data['file1_paragraphs']}\n")
            f.write(f"- Paragraphs in file 2: {diff_data['file2_paragraphs']}\n\n")
            # Difference details
            if diff_data['differences']:
                f.write("## Difference Details\n\n")
                # Group by change type
                deletions = [d for d in diff_data['differences'] if d['change_type'] == 'deletion']
                additions = [d for d in diff_data['differences'] if d['change_type'] == 'addition']
                if deletions:
                    f.write(f"### 🗑️ Deleted content ({len(deletions)} items)\n\n")
                    for i, diff in enumerate(deletions, 1):
                        f.write(f"**{i}. Line {diff['line_number']}**\n")
                        f.write(f"```\n{diff['file1_value']}\n```\n\n")
                if additions:
                    f.write(f"### ➕ Added content ({len(additions)} items)\n\n")
                    for i, diff in enumerate(additions, 1):
                        f.write(f"**{i}. Line {diff['line_number']}**\n")
                        f.write(f"```\n{diff['file2_value']}\n```\n\n")
                # Detailed difference table
                f.write("## Detailed Difference List\n\n")
                f.write("| # | Type | Line | Change | Content | Description |\n")
                f.write("| --- | --- | --- | --- | --- | --- |\n")
                for i, diff in enumerate(diff_data['differences'], 1):
                    change_icon = "🗑️" if diff['change_type'] == 'deletion' else "➕"
                    content = diff['file1_value'] if diff['change_type'] == 'deletion' else diff['file2_value']
                    f.write(f"| {i} | {change_icon} | {diff['line_number']} | {diff['change_type']} | ")
                    f.write(f"`{content[:50]}{'...' if len(content) > 50 else ''}` | ")
                    f.write(f"{diff['description']} |\n")
            else:
                f.write("## Conclusion\n\n")
                f.write("🎉 **Perfect match! No differences found.**\n\n")

    def generate_json_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a JSON report."""
        # report_data = {
        #     'comparison_summary': {
        #         'timestamp': re.sub(r'[^\w\-_\.]', '_', str(comparison_result.get('timestamp', ''))),
        #         'file1': comparison_result['file1_path'],
        #         'file2': comparison_result['file2_path'],
        #         'statistics': comparison_result['statistics'],
        #         'file_info': {
        #             'file1_tables': comparison_result['file1_tables'],
        #             'file2_tables': comparison_result['file2_tables'],
        #             'file1_paragraphs': comparison_result['file1_paragraphs'],
        #             'file2_paragraphs': comparison_result['file2_paragraphs']
        #         }
        #     },
        #     'differences': comparison_result['differences']
        # }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(comparison_result, f, ensure_ascii=False, indent=2)

    def generate_markdown_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a Markdown report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR Result Comparison Report\n\n")
            # Basic information
            f.write("## Basic Information\n\n")
            f.write(f"- **File 1**: `{comparison_result['file1_path']}`\n")
            f.write(f"- **File 2**: `{comparison_result['file2_path']}`\n")
            f.write(f"- **Compared at**: {comparison_result.get('timestamp', 'N/A')}\n\n")
            # Statistics
            stats = comparison_result['statistics']
            f.write("## Statistics\n\n")
            f.write(f"- Total differences: **{stats['total_differences']}**\n")
            f.write(f"- Table differences: **{stats['table_differences']}**\n")
            f.write(f"- Amount differences: **{stats['amount_differences']}**\n")
            f.write(f"- Paragraph differences: **{stats['paragraph_differences']}**\n")
            f.write(f"- Tables in file 1: {comparison_result['file1_tables']}\n")
            f.write(f"- Tables in file 2: {comparison_result['file2_tables']}\n")
            f.write(f"- Paragraphs in file 1: {comparison_result['file1_paragraphs']}\n")
            f.write(f"- Paragraphs in file 2: {comparison_result['file2_paragraphs']}\n\n")
            # Summary
            if stats['total_differences'] == 0:
                f.write("## Conclusion\n\n")
                f.write("🎉 **Perfect match! No differences found.**\n\n")
            else:
                f.write("## Difference Summary\n\n")
                # Group differences by type
                diff_by_type = {}
                for diff in comparison_result['differences']:
                    diff_type = diff['type']
                    if diff_type not in diff_by_type:
                        diff_by_type[diff_type] = []
                    diff_by_type[diff_type].append(diff)
                for diff_type, diffs in diff_by_type.items():
                    type_name = {
                        'table_amount': '💰 Table amount differences',
                        'table_text': '📝 Table text differences',
                        'paragraph': '📄 Paragraph differences',
                        'table_structure': '🏗️ Table structure differences'
                    }.get(diff_type, f'❓ {diff_type}')
                    f.write(f"### {type_name} ({len(diffs)})\n\n")
                    for i, diff in enumerate(diffs, 1):
                        f.write(f"**{i}. {diff['position']}**\n")
                        f.write(f"- File 1: `{diff['file1_value']}`\n")
                        f.write(f"- File 2: `{diff['file2_value']}`\n")
                        f.write(f"- Note: {diff['description']}\n\n")
            # Detailed difference list
            if comparison_result['differences']:
                f.write("## Detailed Difference List\n\n")
                f.write("| # | Type | Position | File 1 content | File 2 content | Description |\n")
                f.write("| --- | --- | --- | --- | --- | --- |\n")
                for i, diff in enumerate(comparison_result['differences'], 1):
                    f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
                    f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
                    f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
                    f.write(f"{diff['description']} |\n")

def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
                        output_format: str = "markdown", ignore_images: bool = True):
    """
    Compare two OCR result files.

    Args:
        file1_path: path to the first OCR result file
        file2_path: path to the second OCR result file
        output_file: output file name without extension (default "comparison_report")
        output_format: 'json', 'markdown', or 'both' (default 'markdown')
        ignore_images: whether to ignore image content (default True)

    Returns:
        Dict: the comparison result
    """
    comparator = OCRResultComparator()
    print("🔍 Starting OCR result comparison...")
    print(f"📄 File 1: {file1_path}")
    print(f"📄 File 2: {file2_path}")
    print(f"📁 Output format: {output_format}")
    print(f"🖼️ Image handling: {'ignore' if ignore_images else 'compare'}")
    try:
        # Run the comparison
        result = comparator.compare_files(file1_path, file2_path)
        # Add a timestamp
        result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Write reports
        if output_format in ['json', 'both']:
            json_file = f"{output_file}.json"
            comparator.generate_json_report(result, json_file)
            print(f"📄 JSON report saved to: {json_file}")
        if output_format in ['markdown', 'both']:
            md_file = f"{output_file}.md"
            comparator.generate_markdown_report(result, md_file)
            print(f"📝 Markdown report saved to: {md_file}")
        # Print a short summary
        print(f"\n📊 Comparison finished!")
        print(f"   Total differences: {result['statistics']['total_differences']}")
        print(f"   Table differences: {result['statistics']['table_differences']}")
        print(f"   Amount differences: {result['statistics']['amount_differences']}")
        print(f"   Paragraph differences: {result['statistics']['paragraph_differences']}")
        # Show the first few notable differences
        if result['differences']:
            print(f"\n🔍 First 3 notable differences:")
            for i, diff in enumerate(result['differences'][:3], 1):
                print(f"   {i}. {diff['position']}: {diff['description']}")
                print(f"      File 1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
                print(f"      File 2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
        else:
            print(f"\n🎉 The two files are identical!")
        # Processing statistics (styled after ocr_by_vlm.py)
        print("\n📊 Comparison statistics")
        print(f"   File 1 path: {result['file1_path']}")
        print(f"   File 2 path: {result['file2_path']}")
        print(f"   Output file: {output_file}")
        print(f"   Output format: {output_format}")
        print(f"   Ignore images: {ignore_images}")
        print(f"   Processed at: {result['timestamp']}")
        print(f"   Tables in file 1: {result['file1_tables']}")
        print(f"   Tables in file 2: {result['file2_tables']}")
        print(f"   Paragraphs in file 1: {result['file1_paragraphs']}")
        print(f"   Paragraphs in file 2: {result['file2_paragraphs']}")
        return result
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise Exception(f"OCR comparison task failed: {e}") from e

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='OCR result comparison tool')
    parser.add_argument('file1', nargs='?', help='path to the first OCR result file')
    parser.add_argument('file2', nargs='?', help='path to the second OCR result file')
    parser.add_argument('-o', '--output', default='comparison_report',
                        help='output file name without extension')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
                        default='markdown', help='output format: json, markdown, or both')
    # Image filtering is always applied internally; this flag is informational,
    # so it defaults to True to match the actual behavior
    parser.add_argument('--ignore-images', action='store_true', default=True,
                        help='ignore image content (enabled by default)')
    args = parser.parse_args()
    if args.file1 and args.file2:
        result = compare_ocr_results(
            file1_path=args.file1,
            file2_path=args.file2,
            output_file=args.output,
            output_format=args.format,
            ignore_images=args.ignore_images
        )
    else:
        # Fall back to default test arguments when no CLI paths are given
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_DotsOCR_Results/2023年度报告母公司_page_001.md',
            file2_path='./output/pre_validation/2023年度报告母公司_page_001.md',
            # output_file=f'./output/comparison_result_{time.strftime("%Y%m%d_%H%M%S")}',
            output_file='./output/pre_validation/2023年度报告母公司_page_001_comparison_result',
            output_format='both',
            ignore_images=True
        )
    print("\n🎉 OCR comparison complete!")
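
# Example CLI invocation (file names are illustrative):
#   python compare_ocr_results.py ocr_a/page_001.md ocr_b/page_001.md -o report -f both
# writes report.json and report.md to the current working directory and prints
# a summary of table, amount, and paragraph differences.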