# compare_ocr_results.py
import re
import time
import json
import argparse
from typing import Dict, List

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

class OCRResultComparator:
    def __init__(self):
        self.differences = []
        self.similarity_threshold = 85
        self.max_paragraph_window = 6
        self.table_comparison_mode = 'standard'  # table comparison mode: 'standard' or 'flow_list'
        self.header_similarity_threshold = 80   # similarity threshold for header matching

    def normalize_text(self, text: str) -> str:
        """Normalize text: collapse whitespace and strip spaces around CJK punctuation."""
        if not text:
            return ""
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove spaces around common CJK punctuation marks
        text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
        return text

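    # A couple of worked examples of normalize_text (inputs are made up for
    # illustration): whitespace runs collapse to one space, and spaces around
    # CJK punctuation are dropped.
    #   normalize_text("金额 : 1,000.00")   -> "金额:1,000.00"
    #   normalize_text("  a\t b \n c ")      -> "a b c"
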
    def is_image_reference(self, text: str) -> bool:
        """Return True if the text looks like an image reference or description."""
        image_keywords = [
            '图', '图片', '图像', 'image', 'figure', 'fig',
            '照片', '截图', '示意图', '流程图', '结构图'
        ]
        # Check for image-related keywords
        for keyword in image_keywords:
            if keyword in text.lower():
                return True
        # Check for Markdown image syntax
        if re.search(r'!\[.*?\]\(.*?\)', text):
            return True
        # Check for HTML <img> tags
        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
            return True
        return False

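    # Behavior sketch (made-up inputs). Note that the keyword check is
    # deliberately broad: any text containing '图' is treated as an image
    # reference and excluded from comparison.
    #   is_image_reference("![logo](img/logo.png)")  -> True   (Markdown syntax)
    #   is_image_reference("见下图")                  -> True   (keyword '图')
    #   is_image_reference("合计金额")                -> False
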
    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Extract table data from Markdown (HTML tables embedded in it)."""
        tables = []
        # Parse HTML tables with BeautifulSoup
        soup = BeautifulSoup(md_content, 'html.parser')
        html_tables = soup.find_all('table')
        for table in html_tables:
            table_data = []
            rows = table.find_all('tr')
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_data = []
                for cell in cells:
                    cell_text = self.normalize_text(cell.get_text())
                    # Replace image content with a fixed marker so it is skipped later
                    if not self.is_image_reference(cell_text):
                        row_data.append(cell_text)
                    else:
                        row_data.append("[图片内容-忽略]")
                if row_data:  # keep non-empty rows only
                    table_data.append(row_data)
            if table_data:
                tables.append(table_data)
        return tables

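    # Minimal example of the returned structure, assuming a single embedded
    # HTML table (tables -> rows -> cells):
    #   extract_table_data(
    #       "<table><tr><th>日期</th><th>金额</th></tr>"
    #       "<tr><td>2024-01-01</td><td>100</td></tr></table>")
    #   -> [[['日期', '金额'], ['2024-01-01', '100']]]
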
    def merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge consecutive non-empty lines into paragraphs, filtering out image content."""
        merged_lines = []
        current_paragraph = ""
        for line in lines:
            # An empty line ends the current paragraph
            if not line:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                continue
            # Skip image content
            if self.is_image_reference(line):
                continue
            # Headings start with a Chinese ordinal, a numbered item, or a Markdown '#'
            is_title = (
                line.startswith(('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、')) or
                line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')) or
                line.startswith('#')
            )
            # A heading ends the current paragraph and stands on its own
            if is_title:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                merged_lines.append(line)
            else:
                # Merge with the previous line when the current paragraph is
                # non-empty and does not already end in whitespace
                if current_paragraph and not current_paragraph.endswith((' ', '\t')):
                    current_paragraph += line
                else:
                    current_paragraph = line
        # Flush the last paragraph
        if current_paragraph:
            merged_lines.append(current_paragraph)
        return merged_lines

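    # Worked example (made-up lines): headings break paragraphs, continuation
    # lines are joined, and blank lines flush the buffer.
    #   merge_split_paragraphs(["一、总则", "本合同", "自签字之日起生效。", "", "备注"])
    #   -> ["一、总则", "本合同自签字之日起生效。", "备注"]
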
    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Extract paragraph text from Markdown content."""
        # Remove tables (DOTALL so matches can span lines, IGNORECASE for tag case)
        content = re.sub(r'<table[^>]*>.*?</table>', '', md_content, flags=re.DOTALL | re.IGNORECASE)
        # Remove remaining HTML tags
        content = re.sub(r'<[^>]+>', '', content)
        # Remove HTML comments
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
        # Split into paragraphs
        paragraphs = []
        lines = content.split('\n')
        merged_lines = self.merge_split_paragraphs(lines)
        for line in merged_lines:
            normalized = self.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)
            else:
                print(f"跳过无效内容或图片段落: {line[0:30] if line else ''}...")
        return paragraphs

    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Compare two tables cell by cell."""
        differences = []
        max_rows = max(len(table1), len(table2))
        for i in range(max_rows):
            row1 = table1[i] if i < len(table1) else []
            row2 = table2[i] if i < len(table2) else []
            max_cols = max(len(row1), len(row2))
            for j in range(max_cols):
                cell1 = row1[j] if j < len(row1) else ""
                cell2 = row2[j] if j < len(row2) else ""
                # Skip cells that were marked as image content
                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
                    continue
                if cell1 != cell2:
                    # Treat numeric cells as amounts and compare numerically
                    if self.is_numeric(cell1) and self.is_numeric(cell2):
                        num1 = self.parse_number(cell1)
                        num2 = self.parse_number(cell2)
                        if abs(num1 - num2) > 0.001:  # tolerate small floating-point error
                            differences.append({
                                'type': 'table_amount',
                                'position': f'行{i+1}列{j+1}',
                                'file1_value': cell1,
                                'file2_value': cell2,
                                'description': f'金额不一致: {cell1} vs {cell2}',
                                'row_index': i,
                                'col_index': j
                            })
                    else:
                        differences.append({
                            'type': 'table_text',
                            'position': f'行{i+1}列{j+1}',
                            'file1_value': cell1,
                            'file2_value': cell2,
                            'description': f'文本不一致: {cell1} vs {cell2}',
                            'row_index': i,
                            'col_index': j
                        })
        return differences

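    # Example of the per-cell behavior (hypothetical tables). Numeric cells
    # that differ only by thousands separators parse to the same value and are
    # not reported; genuinely different amounts come back as 'table_amount':
    #   compare_tables([['金额'], ['1,000.00']], [['金额'], ['1000.00']]) -> []
    #   compare_tables([['金额'], ['100.00']],   [['金额'], ['1000.00']])
    #   -> one diff: {'type': 'table_amount', 'position': '行2列1', ...}
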
    def is_numeric(self, text: str) -> bool:
        """Return True if the text parses as a number."""
        if not text:
            return False
        # Strip thousands separators and minus signs before parsing
        clean_text = re.sub(r'[,,-]', '', text)
        try:
            float(clean_text)
            return True
        except ValueError:
            return False

    def parse_number(self, text: str) -> float:
        """Parse a number, ignoring thousands separators."""
        if not text:
            return 0.0
        clean_text = re.sub(r'[,,]', '', text)
        try:
            return float(clean_text)
        except ValueError:
            return 0.0

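    # Behavior sketch (made-up values). Because is_numeric strips '-', even
    # date-like strings pass the check; detect_column_type mitigates this by
    # testing for datetime columns first.
    #   is_numeric("1,234.56")   -> True   parse_number("1,234.56") -> 1234.56
    #   is_numeric("-3.2")       -> True   parse_number("-3.2")     -> -3.2
    #   is_numeric("2024-01-01") -> True   (date looks numeric once '-' is stripped)
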
    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """Similarity score between two texts, as a percentage."""
        if not text1 and not text2:
            return 100.0
        if not text1 or not text2:
            return 0.0
        # Identical after normalization: 100%
        if text1 == text2:
            return 100.0
        # Several fuzzywuzzy scorers could be combined here; only fuzz.ratio is
        # active by default (compare_ocr_results can swap in another scorer).
        similarity_scores = [
            fuzz.ratio(text1, text2),
            # fuzz.partial_ratio(text1, text2),
            # fuzz.token_sort_ratio(text1, text2),
            # fuzz.token_set_ratio(text1, text2)
        ]
        return max(similarity_scores)

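    # Example with fuzz.ratio (Levenshtein-based, returns an integer percentage):
    #   calculate_text_similarity("甲方:张三", "甲方:张叁") -> 80
    # which is below the default similarity_threshold of 85, so these two
    # strings would be reported as differing.
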
    def compare_paragraphs_with_flexible_matching(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Paragraph matching that tolerates paragraphs being split or merged differently."""
        differences = []
        meaningful_paras1 = paras1
        meaningful_paras2 = paras2
        used_paras1 = set()
        used_paras2 = set()
        # Both files are walked forward together: once a match is found, the
        # search window in file 2 starts after the matched paragraphs.
        paras2_idx = 0
        # Try combining up to max_paragraph_window consecutive paragraphs from file 1
        for window_size1 in range(1, min(self.max_paragraph_window, len(meaningful_paras1)) + 1):
            for i in range(len(meaningful_paras1) - window_size1 + 1):
                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
                    continue
                # Combine consecutive paragraphs from file 1
                combined_para1 = "".join(meaningful_paras1[i:i+window_size1])
                # Find the best match inside file 2's current window
                best_match = self._find_best_match_in_paras2_improved(
                    combined_para1,
                    meaningful_paras2[paras2_idx: min(paras2_idx + self.max_paragraph_window, len(meaningful_paras2))],
                    paras2_idx
                )
                if best_match and best_match['similarity'] >= self.similarity_threshold:
                    paras2_idx = best_match['indices'][-1] + 1  # advance file 2's window
                    # Mark both sides as matched
                    for idx in range(i, i + window_size1):
                        used_paras1.add(idx)
                    for idx in best_match['indices']:
                        used_paras2.add(idx)
                    # Record a difference only when similarity is clearly below 95%
                    if best_match['similarity'] < 95.0:
                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                        differences.append({
                            'type': 'paragraph',
                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_para1,
                            'file2_value': best_match['text'],
                            'description': f'段落格式差异 (相似度: {best_match["similarity"]:.1f}%)',
                            'similarity': best_match['similarity'],
                            'severity': severity
                        })
                if paras2_idx >= len(meaningful_paras2):
                    break  # file 2 fully consumed
        # Any paragraph left unmatched is unique to its file
        for i, para in enumerate(meaningful_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{i+1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': '文件1中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        for j, para in enumerate(meaningful_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{j+1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': '文件2中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        return differences

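    # Matching sketch (hypothetical paragraphs). If file 1 splits one sentence
    # across two paragraphs:
    #   paras1 = ["合同编号:123", "第一条 总则", "本合同自签字之日", "起生效。"]
    #   paras2 = ["合同编号:123", "第一条 总则", "本合同自签字之日起生效。"]
    # the single-paragraph pass matches the first two entries, and the
    # window-size-2 pass joins file 1's last two paragraphs into
    # "本合同自签字之日起生效。", matching file 2 at 100%, so no difference is
    # reported.
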
    def _find_best_match_in_paras2_improved(self, target_text: str, paras2: List[str],
                                            paras2_idx: int) -> Dict:
        """Find the best-matching combination of consecutive paragraphs in paras2."""
        best_match = None
        for window_size in range(1, len(paras2) + 1):
            for j in range(len(paras2) - window_size + 1):
                combined_para2 = "".join(paras2[j:j+window_size])
                similarity = self.calculate_text_similarity(target_text, combined_para2)
                if best_match and best_match['similarity'] == 100.0:
                    break  # perfect match already found
                if not best_match or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        'indices': list(range(j + paras2_idx, j + paras2_idx + window_size))
                    }
            if best_match and best_match['similarity'] == 100.0:
                break  # perfect match already found
        # Return an empty match if nothing was found
        if best_match is None:
            return {
                'text': '',
                'similarity': 0.0,
                'indices': []
            }
        return best_match

    def detect_column_type(self, column_values: List[str]) -> str:
        """Detect the data type of a column: 'datetime', 'numeric', or 'text'."""
        if not column_values:
            return 'text'
        # Ignore empty values
        non_empty_values = [v for v in column_values if v and v.strip()]
        if not non_empty_values:
            return 'text'
        # Datetime detection
        datetime_patterns = [
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',                            # YYYY-MM-DD
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',  # YYYY-MM-DD HH:MM:SS
            r'\d{4}年\d{1,2}月\d{1,2}日',                               # Chinese date
        ]
        datetime_count = 0
        for value in non_empty_values[:5]:  # sample the first 5 values
            for pattern in datetime_patterns:
                if re.search(pattern, value):
                    datetime_count += 1
                    break
        if datetime_count >= len(non_empty_values[:5]) * 0.6:
            return 'datetime'
        # Numeric/amount detection
        numeric_count = 0
        for value in non_empty_values[:5]:
            if self.is_numeric(value):
                numeric_count += 1
        if numeric_count >= len(non_empty_values[:5]) * 0.6:
            return 'numeric'
        # Default to text
        return 'text'

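    # Sampling sketch (hypothetical columns). Datetime is tested before
    # numeric, which keeps date-like strings out of the 'numeric' bucket:
    #   detect_column_type(['2024-01-01 10:00:00', '2024-01-02 13:30:00']) -> 'datetime'
    #   detect_column_type(['1,200.00', '-35.50'])                          -> 'numeric'
    #   detect_column_type(['商户消费', '转账'])                             -> 'text'
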
    def normalize_header_text(self, text: str) -> str:
        """Normalize header text for comparison."""
        # Drop parenthesized content (both ASCII and fullwidth parentheses)
        text = re.sub(r'[((].*?[))]', '', text)
        # Drop all whitespace
        text = re.sub(r'\s+', '', text)
        # Keep only word characters and CJK ideographs
        text = re.sub(r'[^\w\u4e00-\u9fff]', '', text)
        return text.lower().strip()

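    # Examples (made up): units in parentheses and punctuation are ignored,
    # so headers that differ only cosmetically normalize to the same key.
    #   normalize_header_text("金额(元)")      -> "金额"
    #   normalize_header_text("Amount (CNY)")  -> "amount"
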
    def compare_table_headers(self, headers1: List[str], headers2: List[str]) -> Dict:
        """Compare the header rows of two tables."""
        result = {
            'match': True,
            'differences': [],
            'column_mapping': {},  # mapping between columns of the two tables
            'similarity_scores': []
        }
        if len(headers1) != len(headers2):
            result['match'] = False
            result['differences'].append({
                'type': 'header_count',
                'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
                'severity': 'critical'
            })
            return result
        # Compare headers column by column
        for i, (h1, h2) in enumerate(zip(headers1, headers2)):
            norm_h1 = self.normalize_header_text(h1)
            norm_h2 = self.normalize_header_text(h2)
            similarity = self.calculate_text_similarity(norm_h1, norm_h2)
            result['similarity_scores'].append({
                'column_index': i,
                'header1': h1,
                'header2': h2,
                'similarity': similarity
            })
            if similarity < self.header_similarity_threshold:
                result['match'] = False
                result['differences'].append({
                    'type': 'header_mismatch',
                    'column_index': i,
                    'header1': h1,
                    'header2': h2,
                    'similarity': similarity,
                    'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
                    'severity': 'critical'
                })
            else:
                result['column_mapping'][i] = i  # identical column order assumed
        return result

    def compare_cell_value(self, value1: str, value2: str, column_type: str,
                           column_name: str = '') -> Dict:
        """Compare a single pair of cells, using a strategy based on the column type."""
        result = {
            'match': True,
            'difference': None
        }
        # Normalize both values
        v1 = self.normalize_text(value1)
        v2 = self.normalize_text(value2)
        if v1 == v2:
            return result
        if column_type == 'numeric':
            # Amount comparison
            if self.is_numeric(v1) and self.is_numeric(v2):
                num1 = self.parse_number(v1)
                num2 = self.parse_number(v2)
                if abs(num1 - num2) > 0.01:  # tolerate differences up to 0.01
                    result['match'] = False
                    result['difference'] = {
                        'type': 'table_amount',
                        'value1': value1,
                        'value2': value2,
                        'diff_amount': abs(num1 - num2),
                        'description': f'金额不一致: {value1} vs {value2}'
                    }
            else:
                # A malformed number in a numeric column also counts as an amount difference
                result['match'] = False
                result['difference'] = {
                    'type': 'table_amount',
                    'value1': value1,
                    'value2': value2,
                    'description': f'数字格式错误: {value1} vs {value2}'
                }
        elif column_type == 'datetime':
            # Datetime comparison on the normalized representation
            datetime1 = self.extract_datetime(v1)
            datetime2 = self.extract_datetime(v2)
            if datetime1 != datetime2:
                result['match'] = False
                result['difference'] = {
                    'type': 'table_datetime',
                    'value1': value1,
                    'value2': value2,
                    'description': f'日期时间不一致: {value1} vs {value2}'
                }
        else:
            # Text comparison via fuzzy similarity
            similarity = self.calculate_text_similarity(v1, v2)
            if similarity < self.similarity_threshold:
                result['match'] = False
                result['difference'] = {
                    'type': 'table_text',
                    'value1': value1,
                    'value2': value2,
                    'similarity': similarity,
                    'description': f'文本不一致: {value1} vs {value2} (相似度: {similarity:.1f}%)'
                }
        return result

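    # Per-type outcomes (hypothetical cells):
    #   compare_cell_value("1,000.00", "1000.00", 'numeric')     -> match (same parsed value)
    #   compare_cell_value("1,000.00", "1000.02", 'numeric')     -> 'table_amount' diff
    #   compare_cell_value("2024/1/5", "2024-01-05", 'datetime') -> match (normalized dates equal)
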
    def extract_datetime(self, text: str) -> str:
        """Extract and normalize a datetime from text."""
        # Try the supported datetime formats in order
        patterns = [
            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})\s*(\d{1,2}):(\d{1,2}):(\d{1,2})',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)} {m.group(4).zfill(2)}:{m.group(5).zfill(2)}:{m.group(6).zfill(2)}"),
            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
            (r'(\d{4})年(\d{1,2})月(\d{1,2})日',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
        ]
        for pattern, formatter in patterns:
            match = re.search(pattern, text)
            if match:
                return formatter(match)
        return text  # fall back to the raw text if nothing matches

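    # Normalization examples (made up): all supported formats collapse to
    # zero-padded ISO-style strings, so equivalent dates compare equal.
    #   extract_datetime("2024/1/5 9:05:07") -> "2024-01-05 09:05:07"
    #   extract_datetime("2024年1月5日")      -> "2024-01-05"
    #   extract_datetime("无日期")            -> "无日期"  (unchanged)
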
    def detect_table_header_row(self, table: List[List[str]]) -> int:
        """
        Heuristically locate the header row of a table.
        Strategy:
        1. Look for a row containing typical header keywords (序号, 时间, 金额, ...).
        2. Verify that the following row looks like a data row (numbers, dates, ...).
        3. Return the header row index, or 0 if none is found.
        """
        # Common header keywords
        header_keywords = [
            # Generic headers
            '序号', '编号', '时间', '日期', '名称', '类型', '金额', '数量', '单价',
            '备注', '说明', '状态', '类别', '方式', '账号', '单号', '订单',
            # Transaction-list specific
            '交易单号', '交易时间', '交易类型', '收/支', '支出', '收入',
            '交易方式', '交易对方', '商户单号', '付款方式', '收款方',
            # English headers
            'no', 'id', 'time', 'date', 'name', 'type', 'amount', 'status'
        ]
        for row_idx, row in enumerate(table):
            if not row:
                continue
            # Count cells in this row that contain a header keyword
            keyword_count = 0
            for cell in row:
                cell_lower = cell.lower().strip()
                for keyword in header_keywords:
                    if keyword in cell_lower:
                        keyword_count += 1
                        break
            # Enough keyword cells (at least 40% of the row, and at least 2) suggest a header
            if keyword_count >= len(row) * 0.4 and keyword_count >= 2:
                # Verify that the next row looks like data
                if row_idx + 1 < len(table):
                    next_row = table[row_idx + 1]
                    if self.is_data_row(next_row):
                        print(f" 📍 检测到表头在第 {row_idx + 1} 行")
                        return row_idx
        # No clear header row found: default to the first row
        print(f" ⚠️ 未检测到明确表头,默认使用第1行")
        return 0

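    # Detection sketch (made-up table): a title row above the real header is
    # skipped because it contains no header keywords; the keyword-dense second
    # row, followed by a data-looking row, wins.
    #   detect_table_header_row([
    #       ['微信支付交易明细'],
    #       ['交易时间', '交易类型', '金额'],
    #       ['2024-01-01 10:00:00', '商户消费', '-25.00'],
    #   ]) -> 1
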
    def is_data_row(self, row: List[str]) -> bool:
        """Return True if the row looks like a data row (numbers, dates, ...)."""
        data_pattern_count = 0
        for cell in row:
            if not cell:
                continue
            # Contains digits
            if re.search(r'\d', cell):
                data_pattern_count += 1
            # Looks like a date (counted in addition to the digit check)
            if re.search(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}', cell):
                data_pattern_count += 1
        # Count as a data row when at least half of the cells show data features
        return data_pattern_count >= len(row) * 0.5

    def compare_table_flow_list(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Transaction-list comparison that tolerates headers not being in the first row."""
        differences = []
        if not table1 or not table2:
            return [{
                'type': 'table_empty',
                'description': '表格为空',
                'severity': 'critical'
            }]
        print(f"\n📋 开始流水表格对比...")
        # Step 1: locate the header row in each table
        header_row_idx1 = self.detect_table_header_row(table1)
        header_row_idx2 = self.detect_table_header_row(table2)
        if header_row_idx1 != header_row_idx2:
            differences.append({
                'type': 'table_header_position',
                'position': '表头位置',
                'file1_value': f'第{header_row_idx1 + 1}行',
                'file2_value': f'第{header_row_idx2 + 1}行',
                'description': f'表头位置不一致: 文件1在第{header_row_idx1 + 1}行,文件2在第{header_row_idx2 + 1}行',
                'severity': 'high'
            })
        # Step 2: compare any content that precedes the header, cell by cell
        if header_row_idx1 > 0 or header_row_idx2 > 0:
            print(f"\n📝 对比表头前的内容...")
            # Treat the pre-header rows as a standalone table
            pre_header_table1 = table1[:header_row_idx1] if header_row_idx1 > 0 else []
            pre_header_table2 = table2[:header_row_idx2] if header_row_idx2 > 0 else []
            if pre_header_table1 or pre_header_table2:
                # Reuse compare_tables for the cell-level comparison
                pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
                # Re-tag these diffs as pre-header differences
                for diff in pre_header_diffs:
                    diff['type'] = 'table_pre_header'
                    diff['position'] = f"表头前{diff['position']}"
                    diff['severity'] = 'medium'
                    print(f" ⚠️ {diff['position']}: {diff['description']}")
                differences.extend(pre_header_diffs)
        # Step 3: compare the headers themselves
        headers1 = table1[header_row_idx1]
        headers2 = table2[header_row_idx2]
        print(f"\n📋 对比表头...")
        print(f" 文件1表头 (第{header_row_idx1 + 1}行): {headers1}")
        print(f" 文件2表头 (第{header_row_idx2 + 1}行): {headers2}")
        header_result = self.compare_table_headers(headers1, headers2)
        if not header_result['match']:
            print(f"\n❌ 表头不匹配,严重错误!")
            # Record every header mismatch as a critical difference
            for diff in header_result['differences']:
                print(f" - {diff['description']}")
                differences.append({
                    'type': 'table_header_critical',
                    'position': '表头',
                    'file1_value': ', '.join(headers1),
                    'file2_value': ', '.join(headers2),
                    'description': diff['description'],
                    'severity': 'critical'
                })
            return differences
        print(f"✅ 表头匹配成功")
        # Step 4: infer a type for each column from file 1's data rows
        column_types = []
        for col_idx in range(len(headers1)):
            col_values1 = [
                row[col_idx]
                for row in table1[header_row_idx1 + 1:]
                if col_idx < len(row)
            ]
            col_type = self.detect_column_type(col_values1)
            column_types.append(col_type)
            print(f" 列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
        # Step 5: compare the data rows one by one
        data_rows1 = table1[header_row_idx1 + 1:]
        data_rows2 = table2[header_row_idx2 + 1:]
        max_rows = max(len(data_rows1), len(data_rows2))
        print(f"\n📊 开始逐行对比数据 (共{max_rows}行)...")
        for row_idx in range(max_rows):
            row1 = data_rows1[row_idx] if row_idx < len(data_rows1) else []
            row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
            # Actual row number within the table (offset by the header row)
            actual_row_num = header_row_idx1 + row_idx + 2
            if not row1:
                differences.append({
                    'type': 'table_row_missing',
                    'position': f'第{actual_row_num}行',
                    'file1_value': '',
                    'file2_value': ', '.join(row2),
                    'description': f'文件1缺少第{actual_row_num}行',
                    'severity': 'high',
                    'row_index': actual_row_num
                })
                continue
            if not row2:
                differences.append({
                    'type': 'table_row_missing',
                    'position': f'第{actual_row_num}行',
                    'file1_value': ', '.join(row1),
                    'file2_value': '',
                    'description': f'文件2缺少第{actual_row_num}行',
                    'severity': 'high',
                    'row_index': actual_row_num
                })
                continue
            # Compare column by column; every cell difference is reported separately
            max_cols = max(len(row1), len(row2))
            for col_idx in range(max_cols):
                cell1 = row1[col_idx] if col_idx < len(row1) else ''
                cell2 = row2[col_idx] if col_idx < len(row2) else ''
                # Skip cells marked as image content
                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
                    continue
                column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
                column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
                compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
                if not compare_result['match']:
                    diff_info = compare_result['difference']
                    differences.append({
                        'type': diff_info['type'],  # original type: table_amount, table_text, ...
                        'position': f'第{actual_row_num}行第{col_idx + 1}列',
                        'file1_value': diff_info['value1'],
                        'file2_value': diff_info['value2'],
                        'description': diff_info['description'],
                        'severity': 'medium',
                        'row_index': actual_row_num,
                        'col_index': col_idx,
                        'column_name': column_name,
                        'column_type': column_type,
                        # Carry over any extra fields (e.g. diff_amount, similarity)
                        **{k: v for k, v in diff_info.items() if k not in ['type', 'value1', 'value2', 'description']}
                    })
                    print(f" ⚠️ 第{actual_row_num}行第{col_idx + 1}列({column_name}): {diff_info['description']}")
        print(f"\n✅ 流水表格对比完成,发现 {len(differences)} 个差异")
        return differences

    def compare_tables_with_mode(self, table1: List[List[str]], table2: List[List[str]],
                                 mode: str = 'standard') -> List[Dict]:
        """Dispatch to the table comparison algorithm selected by mode."""
        if mode == 'flow_list':
            return self.compare_table_flow_list(table1, table2)
        else:
            return self.compare_tables(table1, table2)

    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
        """Compare two files, honoring the configured table comparison mode."""
        # Read both files
        with open(file1_path, 'r', encoding='utf-8') as f:
            content1 = f.read()
        with open(file2_path, 'r', encoding='utf-8') as f:
            content2 = f.read()
        # Extract tables and paragraphs
        tables1 = self.extract_table_data(content1)
        tables2 = self.extract_table_data(content2)
        paras1 = self.extract_paragraphs(content1)
        paras2 = self.extract_paragraphs(content2)
        all_differences = []
        # Compare tables in the selected mode (only the first table of each file)
        if tables1 and tables2:
            table_diffs = self.compare_tables_with_mode(
                tables1[0], tables2[0],
                mode=self.table_comparison_mode
            )
            all_differences.extend(table_diffs)
        elif tables1 and not tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': '表格结构',
                'file1_value': f'包含{len(tables1)}个表格',
                'file2_value': '无表格',
                'description': '文件1包含表格但文件2无表格',
                'severity': 'high'
            })
        elif not tables1 and tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': '表格结构',
                'file1_value': '无表格',
                'file2_value': f'包含{len(tables2)}个表格',
                'description': '文件2包含表格但文件1无表格',
                'severity': 'high'
            })
        # Paragraph comparison with flexible matching
        para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
        all_differences.extend(para_diffs)
        # Statistics, broken down by difference type and severity
        stats = {
            'total_differences': len(all_differences),
            'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
            'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
            'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
            'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
            'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
            'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
            'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
            'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
            'high_severity': len([d for d in all_differences if d.get('severity') in ('critical', 'high')]),
            'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
            'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
        }
        result = {
            'differences': all_differences,
            'statistics': stats,
            'file1_tables': len(tables1),
            'file2_tables': len(tables2),
            'file1_paragraphs': len(paras1),
            'file2_paragraphs': len(paras2),
            'file1_path': file1_path,
            'file2_path': file2_path,
        }
        return result

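    # Shape of the returned dict (values below are illustrative, not real output):
    #   {
    #     'differences': [{'type': 'table_amount', 'position': '第3行第5列', ...}, ...],
    #     'statistics': {'total_differences': 2, 'table_differences': 1, ...},
    #     'file1_tables': 1, 'file2_tables': 1,
    #     'file1_paragraphs': 12, 'file2_paragraphs': 11,
    #     'file1_path': '...', 'file2_path': '...',
    #   }
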
    def generate_json_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a JSON report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(comparison_result, f, ensure_ascii=False, indent=2)

    def generate_markdown_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a Markdown report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR结果对比报告\n\n")
            # Basic information
            f.write("## 基本信息\n\n")
            f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
            f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
            f.write(f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n")
            # Statistics
            stats = comparison_result['statistics']
            f.write("## 统计信息\n\n")
            f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
            f.write(f"- 表格差异: **{stats['table_differences']}**\n")
            f.write(f"- 其中表格金额差异: **{stats['amount_differences']}**\n")
            f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
            f.write(f"- 高严重度: **{stats['high_severity']}**\n")
            f.write(f"- 中严重度: **{stats['medium_severity']}**\n")
            f.write(f"- 低严重度: **{stats['low_severity']}**\n")
            f.write(f"- 文件1表格数: {comparison_result['file1_tables']}\n")
            f.write(f"- 文件2表格数: {comparison_result['file2_tables']}\n")
            f.write(f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n")
            f.write(f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n")
            # Summary of differences
            if stats['total_differences'] == 0:
                f.write("## 结论\n\n")
                f.write("🎉 **完美匹配!没有发现任何差异。**\n\n")
            else:
                f.write("## 差异摘要\n\n")
                # Human-readable names for each difference type
                type_name_map = {
                    'table_amount': '💰 表格金额差异',
                    'table_text': '📝 表格文本差异',
                    'table_pre_header': '📋 表头前内容差异',
                    'table_header_position': '📍 表头位置差异',
                    'table_header_critical': '❌ 表头严重错误',
                    'table_row_missing': '🚫 表格行缺失',
                    'table_row_data': '📊 表格数据差异',
                    'table_structure': '🏗️ 表格结构差异',
                    'paragraph': '📄 段落差异'
                }
                # Group differences by type
                diff_by_type = {}
                for diff in comparison_result['differences']:
                    diff_type = diff['type']
                    if diff_type not in diff_by_type:
                        diff_by_type[diff_type] = []
                    diff_by_type[diff_type].append(diff)
                for diff_type, diffs in diff_by_type.items():
                    type_name = type_name_map.get(diff_type, f'❓ {diff_type}')
                    f.write(f"### {type_name} ({len(diffs)}个)\n\n")
                    for i, diff in enumerate(diffs, 1):
                        f.write(f"**{i}. {diff['position']}**\n")
                        f.write(f"- 文件1: `{diff['file1_value']}`\n")
                        f.write(f"- 文件2: `{diff['file2_value']}`\n")
                        f.write(f"- 说明: {diff['description']}\n")
                        if 'severity' in diff:
                            severity_icon = {'critical': '🔴', 'high': '🟠', 'medium': '🟡', 'low': '🟢'}
                            f.write(f"- 严重度: {severity_icon.get(diff['severity'], '⚪')} {diff['severity']}\n")
                        f.write("\n")
            # Full table of differences
            if comparison_result['differences']:
                f.write("## 详细差异列表\n\n")
                f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 | 严重度 |\n")
                f.write("| --- | --- | --- | --- | --- | --- | --- |\n")
                for i, diff in enumerate(comparison_result['differences'], 1):
                    severity = diff.get('severity', 'N/A')
                    f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
                    f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
                    f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
                    f.write(f"{diff['description']} | {severity} |\n")

def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
                        output_format: str = "markdown", ignore_images: bool = True,
                        table_mode: str = 'standard', similarity_algorithm: str = 'ratio'):
    """
    Compare two OCR result files.

    Args:
        file1_path: path to the first OCR result file
        file2_path: path to the second OCR result file
        output_file: output file name (without extension)
        output_format: output format ('json', 'markdown', 'both')
        ignore_images: whether image content is ignored (note: the comparator
            currently always skips image content, so this flag is informational)
        table_mode: table comparison mode ('standard', 'flow_list')
        similarity_algorithm: similarity scorer ('ratio', 'partial_ratio',
            'token_sort_ratio', 'token_set_ratio')
    """
    comparator = OCRResultComparator()
    comparator.table_comparison_mode = table_mode
    # Swap in the requested fuzzywuzzy scorer
    if similarity_algorithm == 'partial_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.partial_ratio(t1, t2)
    elif similarity_algorithm == 'token_sort_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.token_sort_ratio(t1, t2)
    elif similarity_algorithm == 'token_set_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.token_set_ratio(t1, t2)
    print("🔍 开始对比OCR结果...")
    print(f"📄 文件1: {file1_path}")
    print(f"📄 文件2: {file2_path}")
    print(f"📊 表格模式: {table_mode}")
    print(f"🔧 相似度算法: {similarity_algorithm}")
    try:
        # Run the comparison
        result = comparator.compare_files(file1_path, file2_path)
        # Add a timestamp
        import datetime
        result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Generate the requested reports
        if output_format in ['json', 'both']:
            json_file = f"{output_file}.json"
            comparator.generate_json_report(result, json_file)
            print(f"📄 JSON报告已保存至: {json_file}")
        if output_format in ['markdown', 'both']:
            md_file = f"{output_file}.md"
            comparator.generate_markdown_report(result, md_file)
            print(f"📝 Markdown报告已保存至: {md_file}")
        # Print a short summary
        print(f"\n📊 对比完成!")
        print(f" 总差异数: {result['statistics']['total_differences']}")
        print(f" 表格差异: {result['statistics']['table_differences']}")
        print(f" 其中表格金额差异: {result['statistics']['amount_differences']}")
        print(f" 段落差异: {result['statistics']['paragraph_differences']}")
        # Show the first few important differences
        if result['differences']:
            print(f"\n🔍 前3个重要差异:")
            for i, diff in enumerate(result['differences'][:3], 1):
                print(f" {i}. {diff['position']}: {diff['description']}")
                print(f" 文件1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
                print(f" 文件2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
        else:
            print(f"\n🎉 恭喜!两个文件内容完全一致!")
        # Processing statistics (in the style of ocr_by_vlm.py)
        print("\n📊 对比处理统计")
        print(f" 文件1路径: {result['file1_path']}")
        print(f" 文件2路径: {result['file2_path']}")
        print(f" 输出文件: {output_file}")
        print(f" 输出格式: {output_format}")
        print(f" 忽略图片: {ignore_images}")
        print(f" 处理时间: {result['timestamp']}")
        print(f" 文件1表格数: {result['file1_tables']}")
        print(f" 文件2表格数: {result['file2_tables']}")
        print(f" 文件1段落数: {result['file1_paragraphs']}")
        print(f" 文件2段落数: {result['file2_paragraphs']}")
        return result
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise RuntimeError(f"OCR对比任务失败: {e}") from e

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
                        default='markdown', help='输出格式')
    parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
    parser.add_argument('--table-mode', choices=['standard', 'flow_list'],
                        default='standard', help='表格比较模式')
    parser.add_argument('--similarity-algorithm',
                        choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
                        default='ratio', help='相似度算法')
    args = parser.parse_args()
    if args.file1 and args.file2:
        result = compare_ocr_results(
            file1_path=args.file1,
            file2_path=args.file2,
            output_file=args.output,
            output_format=args.format,
            ignore_images=args.ignore_images,
            table_mode=args.table_mode,
            similarity_algorithm=args.similarity_algorithm
        )
    else:
        # Fallback: exercise the flow-list comparison on a sample pair of files
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results/A用户_单元格扫描流水_page_001.md',
            file2_path='/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results/A用户_单元格扫描流水_page_001.md',
            output_file=f'./output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
            output_format='both',
            ignore_images=True,
            table_mode='flow_list',  # use the flow-list table mode
            similarity_algorithm='ratio'
        )
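
# Typical CLI invocation (file names below are placeholders):
#   python compare_ocr_results.py result_a.md result_b.md \
#       -o report -f both --table-mode flow_list --similarity-algorithm token_sort_ratio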