# compare_ocr_results.py

import re
import time
import json
import argparse
import datetime
from typing import Dict, List

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz


class OCRResultComparator:
    def __init__(self):
        self.differences = []
        # Paragraph similarity threshold: a score >= 80 means the paragraphs match
        # (anything below 100 still implies some difference); below 80 means no match.
        self.paragraph_match_threshold = 80
        # For matched paragraphs, content with similarity above 95 is treated as identical.
        self.content_similarity_threshold = 95
        self.max_paragraph_window = 6
        self.table_comparison_mode = 'standard'  # table comparison mode
        self.header_similarity_threshold = 90  # header similarity threshold

    def normalize_text(self, text: str) -> str:
        """Normalize text: collapse whitespace and strip spaces around punctuation."""
        if not text:
            return ""
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove spaces around full-width punctuation
        text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
        return text
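
    # Illustrative example (input assumed for demonstration):
    #   >>> OCRResultComparator().normalize_text("合计 : 100   元")
    #   '合计:100 元'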

    def is_image_reference(self, text: str) -> bool:
        """Return True if the text is an image reference or image description."""
        image_keywords = [
            '图', '图片', '图像', 'image', 'figure', 'fig',
            '照片', '截图', '示意图', '流程图', '结构图'
        ]
        # Check for image-related keywords
        for keyword in image_keywords:
            if keyword in text.lower():
                return True
        # Check for Markdown image syntax
        if re.search(r'!\[.*?\]\(.*?\)', text):
            return True
        # Check for HTML <img> tags
        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
            return True
        return False
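
    # Illustrative examples (inputs assumed): Markdown image syntax and keyword
    # hits both count as image references.
    #   >>> c = OCRResultComparator()
    #   >>> c.is_image_reference("![](page_005.png)")
    #   True
    #   >>> c.is_image_reference("合计:100")
    #   False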

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Extract table data from Markdown content."""
        tables = []
        # Parse HTML tables with BeautifulSoup
        soup = BeautifulSoup(md_content, 'html.parser')
        html_tables = soup.find_all('table')
        for table in html_tables:
            table_data = []
            rows = table.find_all('tr')
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_data = []
                for cell in cells:
                    cell_text = self.normalize_text(cell.get_text())
                    # Mask image content so it is skipped during comparison
                    if not self.is_image_reference(cell_text):
                        row_data.append(cell_text)
                    else:
                        row_data.append("[图片内容-忽略]")
                if row_data:  # only keep non-empty rows
                    table_data.append(row_data)
            if table_data:
                tables.append(table_data)
        return tables
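
    # Illustrative example (input assumed; only HTML-style tables are parsed):
    #   >>> OCRResultComparator().extract_table_data("<table><tr><td>金额</td><td>100</td></tr></table>")
    #   [[['金额', '100']]]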

    def merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge consecutive non-empty lines into paragraphs, filtering out image content."""
        merged_lines = []
        current_paragraph = ""
        for line in lines:
            # An empty line ends the current paragraph
            if not line:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                continue
            # Skip image content
            if self.is_image_reference(line):
                continue
            # Detect headings (lines starting with Chinese numerals, digits, or '#')
            is_title = (
                line.startswith(('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、')) or
                line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')) or
                line.startswith('#')
            )
            # A heading terminates the current paragraph
            if is_title:
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                merged_lines.append(line)
            else:
                # Continue the current paragraph when it does not end in whitespace;
                # otherwise start a new one
                if current_paragraph and not current_paragraph.endswith((' ', '\t')):
                    current_paragraph += line
                else:
                    current_paragraph = line
        # Flush the last paragraph
        if current_paragraph:
            merged_lines.append(current_paragraph)
        return merged_lines

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Extract paragraph text from Markdown content."""
        # Remove tables (DOTALL | IGNORECASE so multi-line tables are matched)
        content = re.sub(r'<table[^>]*>.*?</table>', '', md_content, flags=re.DOTALL | re.IGNORECASE)
        # Remove remaining HTML tags
        content = re.sub(r'<[^>]+>', '', content)
        # Remove HTML comments
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
        # Split into paragraphs
        paragraphs = []
        lines = content.split('\n')
        merged_lines = self.merge_split_paragraphs(lines)
        for line in merged_lines:
            normalized = self.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)
            else:
                print(f"跳过的内容无效或图片段落: {line[0:30] if line else ''}...")
        return paragraphs

    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Compare two tables cell by cell."""
        differences = []
        # Walk over the larger row count
        max_rows = max(len(table1), len(table2))
        for i in range(max_rows):
            row1 = table1[i] if i < len(table1) else []
            row2 = table2[i] if i < len(table2) else []
            # Walk over the larger column count
            max_cols = max(len(row1), len(row2))
            for j in range(max_cols):
                cell1 = row1[j] if j < len(row1) else ""
                cell2 = row2[j] if j < len(row2) else ""
                # Skip image placeholders
                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
                    continue
                if cell1 != cell2:
                    # Treat numeric cells as amounts
                    if self.is_numeric(cell1) and self.is_numeric(cell2):
                        num1 = self.parse_number(cell1)
                        num2 = self.parse_number(cell2)
                        if abs(num1 - num2) > 0.001:  # tolerate floating-point noise
                            differences.append({
                                'type': 'table_amount',
                                'position': f'行{i+1}列{j+1}',
                                'file1_value': cell1,
                                'file2_value': cell2,
                                'description': f'金额不一致: {cell1} vs {cell2}',
                                'row_index': i,
                                'col_index': j
                            })
                    else:
                        differences.append({
                            'type': 'table_text',
                            'position': f'行{i+1}列{j+1}',
                            'file1_value': cell1,
                            'file2_value': cell2,
                            'description': f'文本不一致: {cell1} vs {cell2}',
                            'row_index': i,
                            'col_index': j
                        })
        return differences

    def parse_number(self, text: str) -> float:
        """Parse a number, handling thousands separators and currency symbols."""
        if not text:
            return 0.0
        # Strip currency symbols, thousands separators, and whitespace
        clean_text = re.sub(r'[¥$€£,,\s]', '', text)
        # Handle a leading minus sign (ASCII or Unicode)
        is_negative = False
        if clean_text.startswith('-') or clean_text.startswith('−'):
            is_negative = True
            clean_text = clean_text[1:]
        # Handle accounting-style negatives: (123.45) -> -123.45
        if clean_text.startswith('(') and clean_text.endswith(')'):
            is_negative = True
            clean_text = clean_text[1:-1]
        try:
            number = float(clean_text)
            return -number if is_negative else number
        except ValueError:
            return 0.0
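
    # Illustrative examples (inputs assumed):
    #   >>> c = OCRResultComparator()
    #   >>> c.parse_number("¥1,234.56")
    #   1234.56
    #   >>> c.parse_number("(123.45)")    # accounting-style negative
    #   -123.45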

    def extract_datetime(self, text: str) -> str:
        """Extract and normalize a date/time string."""
        # Try the supported date/time formats in order
        patterns = [
            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})\s*(\d{1,2}):(\d{1,2}):(\d{1,2})',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)} {m.group(4).zfill(2)}:{m.group(5).zfill(2)}:{m.group(6).zfill(2)}"),
            (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
            (r'(\d{4})年(\d{1,2})月(\d{1,2})日',
             lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"),
        ]
        for pattern, formatter in patterns:
            match = re.search(pattern, text)
            if match:
                return formatter(match)
        return text
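
    # Illustrative examples (inputs assumed); all formats normalize to ISO style:
    #   >>> c = OCRResultComparator()
    #   >>> c.extract_datetime("2023年1月5日")
    #   '2023-01-05'
    #   >>> c.extract_datetime("2023/1/5 9:30:05")
    #   '2023-01-05 09:30:05'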

    def is_numeric(self, text: str) -> bool:
        """Return True if the text is a numeric value.

        Digit strings longer than 15 characters are treated as text-style
        numbers (account numbers, order IDs, ...), not numeric values.
        """
        if not text:
            return False
        # Strip thousands separators, whitespace, and hyphens
        clean_text = re.sub(r'[,,\s-]', '', text)
        # Long digit strings are text-style numbers, not values
        if len(clean_text) > 15:
            return False
        try:
            float(clean_text)
            return True
        except ValueError:
            return False

    def is_text_number(self, text: str) -> bool:
        """Return True for text-style numbers (account numbers, order IDs, serial numbers).

        Characteristics:
        1. a pure digit string longer than 15 characters, or
        2. a digit sequence containing spaces/hyphens (more than 10 digits).
        """
        if not text:
            return False
        # Remove spaces and hyphens
        clean_text = re.sub(r'[\s-]', '', text)
        # Pure digits longer than 15 characters
        if clean_text.isdigit() and len(clean_text) > 15:
            return True
        # Digit sequence with spaces/hyphens
        if re.match(r'^[\d\s-]+$', text) and len(clean_text) > 10:
            return True
        return False
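
    # Illustrative examples (inputs assumed): a 19-digit card number is a
    # text-style number, while a short amount is numeric.
    #   >>> c = OCRResultComparator()
    #   >>> c.is_text_number("6222 0202 0000 1234567")
    #   True
    #   >>> c.is_numeric("1,234.56")
    #   True
    #   >>> c.is_numeric("6222020200001234567")   # 19 digits -> text-style
    #   False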

    def detect_column_type(self, column_values: List[str]) -> str:
        """Detect the data type of a column, distinguishing values from text-style numbers."""
        if not column_values:
            return 'text'
        # Drop empty values; single placeholder characters such as "/" or "-" also count as empty
        non_empty_values = [v for v in column_values if v and v.strip() and v not in ['/', '-']]
        if not non_empty_values:
            return 'text'
        # Check for text-style numbers first (account numbers, order IDs, ...)
        text_number_count = 0
        for value in non_empty_values[:5]:
            if self.is_text_number(value):
                text_number_count += 1
        if text_number_count >= len(non_empty_values[:5]) * 0.6:
            return 'text_number'  # text-style number column; compare_cell_value handles this type
        # Check for date/time values
        datetime_patterns = [
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',  # YYYY-MM-DD
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',  # YYYY-MM-DD HH:MM:SS
            r'\d{4}年\d{1,2}月\d{1,2}日',  # Chinese date
        ]
        datetime_count = 0
        for value in non_empty_values[:5]:
            for pattern in datetime_patterns:
                if re.search(pattern, value):
                    datetime_count += 1
                    break
        if datetime_count >= len(non_empty_values[:5]) * 0.6:
            return 'datetime'
        # Check for numeric/amount values (short numbers)
        numeric_count = 0
        for value in non_empty_values[:5]:
            if self.is_numeric(value) and not self.is_text_number(value):
                numeric_count += 1
        if numeric_count >= len(non_empty_values[:5]) * 0.6:
            return 'numeric'
        # Fall back to text
        return 'text'
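
    # Illustrative examples (column values assumed): short amounts are detected
    # as 'numeric', long account numbers as 'text_number'.
    #   >>> c = OCRResultComparator()
    #   >>> c.detect_column_type(["100.00", "200.50", "-30"])
    #   'numeric'
    #   >>> c.detect_column_type(["6222 0202 0000 1234567", "6222 0202 0000 7654321"])
    #   'text_number'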

    def normalize_text_number(self, text: str) -> str:
        """Normalize a text-style number by removing spaces and hyphens.

        Args:
            text: raw text
        Returns:
            normalized text
        """
        if not text:
            return ""
        # Remove spaces, hyphens, and full-width spaces
        text = re.sub(r'[\s\-\u3000]', '', text)
        return text
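
    # Illustrative example (input assumed):
    #   >>> OCRResultComparator().normalize_text_number("6222-0202 0000")
    #   '622202020000'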

    def compare_cell_value(self, value1: str, value2: str, column_type: str,
                           column_name: str = '') -> Dict:
        """Compare a single cell value; supports text-style numbers."""
        result = {
            'match': True,
            'difference': None
        }
        # Normalize both values
        v1 = self.normalize_text(value1)
        v2 = self.normalize_text(value2)
        if v1 == v2:
            return result
        # Text-style number comparison
        if column_type == 'text_number':
            # Compare after removing spaces and hyphens
            norm_v1 = self.normalize_text_number(v1)
            norm_v2 = self.normalize_text_number(v2)
            if norm_v1 == norm_v2:
                # Same content, different formatting (whitespace only)
                result['match'] = False
                result['difference'] = {
                    'type': 'table_text',
                    'value1': value1,
                    'value2': value2,
                    'description': f'文本型数字格式差异: "{value1}" vs "{value2}" (内容相同,空格不同)',
                    'severity': 'low'
                }
            else:
                # Genuinely different content
                result['match'] = False
                result['difference'] = {
                    'type': 'table_text',
                    'value1': value1,
                    'value2': value2,
                    'description': f'文本型数字不一致: {value1} vs {value2}',
                    'severity': 'high'
                }
            return result
        # Choose a comparison strategy based on the column type
        if column_type == 'numeric':
            # Numeric/amount comparison
            if self.is_numeric(v1) and self.is_numeric(v2):
                num1 = self.parse_number(v1)
                num2 = self.parse_number(v2)
                if abs(num1 - num2) > 0.01:  # tolerate differences up to 0.01
                    result['match'] = False
                    result['difference'] = {
                        'type': 'table_amount',
                        'value1': value1,
                        'value2': value2,
                        'diff_amount': abs(num1 - num2),
                        'description': f'金额不一致: {value1} vs {value2}'
                    }
            else:
                # Detected as numeric, but actually a long digit string; compare as text
                result['match'] = False
                result['difference'] = {
                    'type': 'table_text',
                    'value1': value1,
                    'value2': value2,
                    'description': f'长数字字符串不一致: {value1} vs {value2}'
                }
        elif column_type == 'datetime':
            # Date/time comparison
            datetime1 = self.extract_datetime(v1)
            datetime2 = self.extract_datetime(v2)
            if datetime1 != datetime2:
                result['match'] = False
                result['difference'] = {
                    'type': 'table_datetime',
                    'value1': value1,
                    'value2': value2,
                    'description': f'日期时间不一致: {value1} vs {value2}'
                }
        else:
            # Plain text comparison
            similarity = self.calculate_text_similarity(v1, v2)
            if similarity < self.content_similarity_threshold:
                result['match'] = False
                result['difference'] = {
                    'type': 'table_text',
                    'value1': value1,
                    'value2': value2,
                    'similarity': similarity,
                    'description': f'文本不一致: {value1} vs {value2} (相似度: {similarity:.1f}%)'
                }
        return result

    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """Similarity score between two texts, on a 0-100 scale."""
        if not text1 and not text2:
            return 100.0
        if not text1 or not text2:
            return 0.0
        # Identical strings: exact match
        if text1 == text2:
            return 100.0
        # fuzz.ratio is the default scorer; partial_ratio, token_sort_ratio and
        # token_set_ratio can be swapped in via the similarity_algorithm argument
        # of compare_ocr_results().
        return fuzz.ratio(text1, text2)
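
    # Illustrative example of the default scorer, matching fuzzywuzzy's documented behavior:
    #   >>> fuzz.ratio("this is a test", "this is a test!")
    #   97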

    def strip_markdown_formatting(self, text: str) -> str:
        """Remove Markdown formatting marks, keeping only the plain text."""
        if not text:
            return ""
        # Headings (#, ##, ###, ...)
        text = re.sub(r'^#+\s*', '', text)
        # Bold (**text** or __text__)
        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
        text = re.sub(r'__(.+?)__', r'\1', text)
        # Italic (*text* or _text_)
        text = re.sub(r'\*(.+?)\*', r'\1', text)
        text = re.sub(r'_(.+?)_', r'\1', text)
        # Image references ![alt](url) -- stripped before links, otherwise the
        # link pattern would match the [alt](url) part and leave the alt text behind
        text = re.sub(r'!\[.*?\]\(.+?\)', '', text)
        # Links [text](url)
        text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
        # Inline code `code`
        text = re.sub(r'`(.+?)`', r'\1', text)
        # HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # List markers (-, *, +, 1., 2., ...)
        text = re.sub(r'^\s*[-*+]\s+', '', text)
        text = re.sub(r'^\s*\d+\.\s+', '', text)
        # Blockquote markers (>)
        text = re.sub(r'^\s*>\s+', '', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text.strip())
        return text
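
    # Illustrative example (input assumed):
    #   >>> OCRResultComparator().strip_markdown_formatting("## **合计**:`100`")
    #   '合计:100'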

    def normalize_text_for_comparison(self, text: str) -> str:
        """Normalize text for comparison: strip formatting, unify punctuation, normalize whitespace.

        Args:
            text: raw text
        Returns:
            normalized plain text
        """
        # Step 1: strip Markdown formatting
        text = self.strip_markdown_formatting(text)
        # Step 2: unify punctuation (full-width -> half-width)
        text = self.normalize_punctuation(text)
        # Step 3: normalize whitespace
        text = self.normalize_text(text)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Unify punctuation by mapping full-width (Chinese) marks to ASCII equivalents.

        Args:
            text: raw text
        Returns:
            text with unified punctuation
        """
        if not text:
            return ""
        # Full-width to half-width punctuation map
        punctuation_map = {
            ':': ':',        # colon
            ';': ';',        # semicolon
            ',': ',',        # comma
            '。': '.',        # full stop
            '!': '!',        # exclamation mark
            '?': '?',        # question mark
            '(': '(',        # left parenthesis
            ')': ')',        # right parenthesis
            '【': '[',        # left bracket
            '】': ']',        # right bracket
            '《': '<',        # left title mark
            '》': '>',        # right title mark
            '\u201c': '"',    # left double quotation mark
            '\u201d': '"',    # right double quotation mark
            '\u2018': "'",    # left single quotation mark
            '\u2019': "'",    # right single quotation mark
            '、': ',',        # enumeration comma
            '—': '-',         # dash
            '…': '...',       # ellipsis
            '~': '~',        # tilde
        }
        for cn_punct, en_punct in punctuation_map.items():
            text = text.replace(cn_punct, en_punct)
        return text
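
    # Illustrative example (input assumed): full-width marks map to ASCII.
    #   >>> OCRResultComparator().normalize_punctuation("合计:100。")
    #   '合计:100.'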

    def check_punctuation_differences(self, text1: str, text2: str) -> List[Dict]:
        """Find punctuation-only differences between two texts.

        Args:
            text1: first text
            text2: second text
        Returns:
            list of punctuation differences
        """
        differences = []
        # If the texts are identical after punctuation normalization,
        # any difference must be punctuation-only
        normalized1 = self.normalize_punctuation(text1)
        normalized2 = self.normalize_punctuation(text2)
        if normalized1 == normalized2 and text1 != text2:
            # Locate the differing positions
            min_len = min(len(text1), len(text2))
            for i in range(min_len):
                if text1[i] != text2[i]:
                    char1 = text1[i]
                    char2 = text2[i]
                    # A full-width/half-width pair normalizes to the same character
                    if self.normalize_punctuation(char1) == self.normalize_punctuation(char2):
                        # Extract context (3 characters on each side)
                        start = max(0, i - 3)
                        end = min(len(text1), i + 4)
                        context1 = text1[start:end]
                        context2 = text2[start:end]
                        differences.append({
                            'position': i,
                            'char1': char1,
                            'char2': char2,
                            'context1': context1,
                            'context2': context2,
                            'type': 'full_half_width'
                        })
        return differences
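
    # Illustrative example (inputs assumed): the texts differ only in colon width
    # (full-width ':' vs ASCII ':'), so one full_half_width difference is found.
    #   >>> diffs = OCRResultComparator().check_punctuation_differences("合计:100", "合计:100")
    #   >>> diffs[0]['position'], diffs[0]['type']
    #   (2, 'full_half_width')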

    def compare_paragraphs_with_flexible_matching(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Paragraph matching that copes with paragraph reordering, merging, and splitting.

        Args:
            paras1: paragraphs from file 1
            paras2: paragraphs from file 2

        Paragraph order may differ between the two files, so matching is done
        within a sliding window whose paragraphs may appear in any order.
        Paragraphs may also have been merged or split by OCR; combining windows
        of paragraphs on both sides accounts for that.
        """
        differences = []
        # Preprocess: strip formatting and unify punctuation (used for matching)
        normalized_paras1 = [self.normalize_text_for_comparison(p) for p in paras1]
        normalized_paras2 = [self.normalize_text_for_comparison(p) for p in paras2]
        # Keep the original text (used for difference reporting)
        original_paras1 = [self.strip_markdown_formatting(p) for p in paras1]
        original_paras2 = [self.strip_markdown_formatting(p) for p in paras2]
        # Match against the preprocessed paragraphs
        used_paras1 = set()
        used_paras2 = set()
        # Walk both files forward together
        start_index2 = 0
        last_match_index2 = 0
        for window_size1 in range(1, min(self.max_paragraph_window, len(normalized_paras1) + 1)):
            for i in range(len(normalized_paras1) - window_size1 + 1):
                # Skip paragraphs that are already matched
                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
                    continue
                # Combine the window of file-1 paragraphs (normalized, for matching)
                combined_normalized1 = "".join(normalized_paras1[i:i+window_size1])
                # Combine the window of file-1 paragraphs (original, for reporting)
                combined_original1 = "".join(original_paras1[i:i+window_size1])
                # Find the best match in file 2
                best_match = self._find_best_match_in_paras2_improved(
                    combined_normalized1,
                    normalized_paras2,
                    start_index2,
                    last_match_index2,
                    used_paras2
                )
                if best_match and best_match['similarity'] >= self.paragraph_match_threshold:
                    # Advance the search position
                    matched_indices = best_match['indices']
                    last_match_index2 = matched_indices[-1]
                    start_index2 = last_match_index2 + 1
                    # Record the match
                    for idx in range(i, i + window_size1):
                        used_paras1.add(idx)
                    for idx in matched_indices:
                        used_paras2.add(idx)
                    # Original text of the matched file-2 window (punctuation untouched)
                    combined_original2 = "".join([original_paras2[idx] for idx in matched_indices])
                    # Check for punctuation-only differences
                    punctuation_diffs = self.check_punctuation_differences(
                        combined_original1,
                        combined_original2
                    )
                    if punctuation_diffs:
                        # Punctuation differences only
                        diff_description = []
                        for pdiff in punctuation_diffs:
                            diff_description.append(
                                f"位置{pdiff['position']}: '{pdiff['char1']}' vs '{pdiff['char2']}' "
                                f"(上下文: ...{pdiff['context1']}... vs ...{pdiff['context2']}...)"
                            )
                        differences.append({
                            'type': 'paragraph_punctuation',
                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'段落全角半角标点差异: {"; ".join(diff_description)}',
                            'punctuation_differences': punctuation_diffs,
                            'similarity': 100.0,  # content itself is identical
                            'severity': 'low'
                        })
                    elif best_match['similarity'] < self.content_similarity_threshold:
                        # Genuine content differences
                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                        differences.append({
                            'type': 'paragraph',
                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'段落内容差异 (相似度: {best_match["similarity"]:.1f}%)',
                            'similarity': best_match['similarity'],
                            'severity': severity
                        })
                # Stop early once every file-2 paragraph has been matched
                if len(used_paras2) >= len(normalized_paras2):
                    return differences
        # Report unmatched paragraphs
        for i, para in enumerate(original_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{i+1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': '文件1中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        for j, para in enumerate(original_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{j+1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': '文件2中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        return differences

    def _find_best_match_in_paras2_improved(self, target_text: str, paras2: List[str],
                                            start_index: int, last_match_index: int,
                                            used_paras2: set) -> Dict:
        """Window-based paragraph matching, borrowing the search logic of _find_matching_bbox.

        Args:
            target_text: target text (already normalized)
            paras2: paragraphs from file 2 (already normalized)
            start_index: search start index (one past the previous match)
            last_match_index: index of the previous successful match
            used_paras2: set of already-used paragraph indices
        Returns:
            the best match found
        """
        # Look-behind window (similar to _find_matching_bbox)
        search_start = last_match_index - 1
        unused_count = 0
        # Walk backwards until max_paragraph_window unused paragraphs are found
        while search_start >= 0:
            if search_start not in used_paras2:
                unused_count += 1
                if unused_count >= self.max_paragraph_window:
                    break
            search_start -= 1
        if search_start < 0:
            search_start = 0
        # Skip used paragraphs at the start
        while search_start < start_index and search_start in used_paras2:
            search_start += 1
        # Search range: from search_start to start_index + window
        search_end = min(start_index + self.max_paragraph_window, len(paras2))
        best_match = None
        # Try every window size
        for window_size in range(1, self.max_paragraph_window + 1):
            for j in range(search_start, search_end):
                # Skip windows that contain already-used paragraphs
                if any(idx in used_paras2 for idx in range(j, min(j + window_size, len(paras2)))):
                    continue
                # Stay within bounds
                if j + window_size > len(paras2):
                    break
                # Combine the window
                combined_para2 = "".join(paras2[j:j+window_size])
                # Compute similarity
                if target_text == combined_para2:
                    similarity = 100.0
                else:
                    similarity = self.calculate_text_similarity(target_text, combined_para2)
                # Track the best match
                if not best_match or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        'indices': list(range(j, j + window_size))
                    }
                # Perfect match: return immediately
                if similarity == 100.0:
                    return best_match
        # No match found
        if best_match is None:
            return {
                'text': '',
                'similarity': 0.0,
                'indices': []
            }
        return best_match

    def normalize_header_text(self, text: str) -> str:
        """Normalize a table header cell."""
        # Remove parenthesized content
        text = re.sub(r'[((].*?[))]', '', text)
        # Remove all whitespace
        text = re.sub(r'\s+', '', text)
        # Remove special characters (keep word characters and CJK)
        text = re.sub(r'[^\w\u4e00-\u9fff]', '', text)
        return text.lower().strip()

    def compare_table_headers(self, headers1: List[str], headers2: List[str]) -> Dict:
        """Compare the header rows of two tables."""
        result = {
            'match': True,
            'differences': [],
            'column_mapping': {},  # column index mapping
            'similarity_scores': []
        }
        if len(headers1) != len(headers2):
            result['match'] = False
            result['differences'].append({
                'type': 'table_header_critical',
                'description': f'表头列数不一致: {len(headers1)} vs {len(headers2)}',
                'severity': 'critical'
            })
            return result
        # Compare the headers column by column
        for i, (h1, h2) in enumerate(zip(headers1, headers2)):
            norm_h1 = self.normalize_header_text(h1)
            norm_h2 = self.normalize_header_text(h2)
            similarity = self.calculate_text_similarity(norm_h1, norm_h2)
            result['similarity_scores'].append({
                'column_index': i,
                'header1': h1,
                'header2': h2,
                'similarity': similarity
            })
            if similarity < self.header_similarity_threshold:
                result['match'] = False
                result['differences'].append({
                    'type': 'table_header_mismatch',
                    'column_index': i,
                    'header1': h1,
                    'header2': h2,
                    'similarity': similarity,
                    'description': f'第{i+1}列表头不匹配: "{h1}" vs "{h2}" (相似度: {similarity:.1f}%)',
                    'severity': 'medium' if similarity < 50 else 'high'
                })
            else:
                result['column_mapping'][i] = i  # record the column mapping
        return result
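
    # Illustrative example (headers assumed): parenthesized units are stripped by
    # normalize_header_text, so "金额" and "金额(元)" compare as identical.
    #   >>> r = OCRResultComparator().compare_table_headers(["交易时间", "金额"], ["交易时间", "金额(元)"])
    #   >>> r['match']
    #   True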

    def detect_table_header_row(self, table: List[List[str]]) -> int:
        """Heuristically detect the index of the header row.

        Strategy:
        1. Look for a row containing typical header keywords (序号, 编号, 时间, 日期, 金额, ...).
        2. Verify that the following row looks like a data row (digits, dates, ...).
        3. Return the header row index; fall back to 0 if nothing is found.
        """
        # Common header keywords
        header_keywords = [
            # generic headers
            '序号', '编号', '时间', '日期', '名称', '类型', '金额', '数量', '单价',
            '备注', '说明', '状态', '类别', '方式', '账号', '单号', '订单',
            # bank-statement specific
            '交易单号', '交易时间', '交易类型', '收/支', '支出', '收入',
            '交易方式', '交易对方', '商户单号', '付款方式', '收款方',
            # English headers
            'no', 'id', 'time', 'date', 'name', 'type', 'amount', 'status'
        ]
        for row_idx, row in enumerate(table):
            if not row:
                continue
            # Count the cells in this row that contain a header keyword
            keyword_count = 0
            for cell in row:
                cell_lower = cell.lower().strip()
                for keyword in header_keywords:
                    if keyword in cell_lower:
                        keyword_count += 1
                        break
            # Treat the row as a header when at least 40% of its cells (and no
            # fewer than two) contain header keywords
            if keyword_count >= len(row) * 0.4 and keyword_count >= 2:
                # Verify: the next row should look like a data row
                if row_idx + 1 < len(table):
                    next_row = table[row_idx + 1]
                    if self.is_data_row(next_row):
                        print(f" 📍 检测到表头在第 {row_idx + 1} 行")
                        return row_idx
        # No clear header row found; default to the first row
        print(f" ⚠️ 未检测到明确表头,默认使用第1行")
        return 0
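
    # Illustrative example (table assumed): the bank-name title row contains no
    # header keywords, so the second row (index 1) is detected as the header
    # (the method also prints a progress message):
    #   table = [["招商银行交易流水"],
    #            ["交易时间", "交易类型", "金额"],
    #            ["2023-01-05", "转账", "100.00"]]
    #   OCRResultComparator().detect_table_header_row(table)  # -> 1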

    def is_data_row(self, row: List[str]) -> bool:
        """Return True if the row looks like a data row (contains digits, dates, ...)."""
        data_pattern_count = 0
        for cell in row:
            if not cell:
                continue
            # Contains digits
            if re.search(r'\d', cell):
                data_pattern_count += 1
            # Looks like a date/time
            if re.search(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}', cell):
                data_pattern_count += 1
        # A row counts as data when at least half of its cells show data features
        return data_pattern_count >= len(row) * 0.5

    def compare_table_flow_list(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
        """Dedicated comparison for transaction-list tables; the header need not be the first row."""
        differences = []
        if not table1 or not table2:
            return [{
                'type': 'table_empty',
                'description': '表格为空',
                'severity': 'critical'
            }]
        print(f"\n📋 开始流水表格对比...")
        # Step 1: detect the header row position in each table
        header_row_idx1 = self.detect_table_header_row(table1)
        header_row_idx2 = self.detect_table_header_row(table2)
        if header_row_idx1 != header_row_idx2:
            differences.append({
                'type': 'table_header_position',
                'position': '表头位置',
                'file1_value': f'第{header_row_idx1 + 1}行',
                'file2_value': f'第{header_row_idx2 + 1}行',
                'description': f'表头位置不一致: 文件1在第{header_row_idx1 + 1}行,文件2在第{header_row_idx2 + 1}行',
                'severity': 'high'
            })
        # Step 2: compare the content before the header (cell by cell)
        if header_row_idx1 > 0 or header_row_idx2 > 0:
            print(f"\n📝 对比表头前的内容...")
            # Treat the pre-header rows as a small table of their own
            pre_header_table1 = table1[:header_row_idx1] if header_row_idx1 > 0 else []
            pre_header_table2 = table2[:header_row_idx2] if header_row_idx2 > 0 else []
            if pre_header_table1 or pre_header_table2:
                # Reuse compare_tables for the pre-header rows
                pre_header_diffs = self.compare_tables(pre_header_table1, pre_header_table2)
                # Re-tag these differences as table_pre_header
                for diff in pre_header_diffs:
                    diff['type'] = 'table_pre_header'
                    diff['position'] = f"表头前{diff['position']}"
                    diff['severity'] = 'medium'
                    print(f" ⚠️ {diff['position']}: {diff['description']}")
                differences.extend(pre_header_diffs)
        # Step 3: compare the headers
        headers1 = table1[header_row_idx1]
        headers2 = table2[header_row_idx2]
        print(f"\n📋 对比表头...")
        print(f" 文件1表头 (第{header_row_idx1 + 1}行): {headers1}")
        print(f" 文件2表头 (第{header_row_idx2 + 1}行): {headers2}")
        header_result = self.compare_table_headers(headers1, headers2)
        if not header_result['match']:
            print(f"\n⚠️ 表头文字存在差异")
            for diff in header_result['differences']:
                print(f" - {diff['description']}")
                differences.append({
                    'type': diff.get('type', 'table_header_mismatch'),
                    'position': '表头',
                    'file1_value': diff.get('header1', ''),
                    'file2_value': diff.get('header2', ''),
                    'description': diff['description'],
                    'severity': diff.get('severity', 'high'),
                })
                if diff.get('severity', 'high') == 'critical':
                    return differences
        else:
            print(f"✅ 表头匹配成功")
        # Step 4: detect the column types
        column_types1 = []
        column_types2 = []
        # Column types for file 1
        for col_idx in range(len(headers1)):
            col_values1 = [
                row[col_idx]
                for row in table1[header_row_idx1 + 1:]
                if col_idx < len(row)
            ]
            col_type = self.detect_column_type(col_values1)
            column_types1.append(col_type)
            print(f" 文件1列 {col_idx + 1} ({headers1[col_idx]}): {col_type}")
        # Column types for file 2
        for col_idx in range(len(headers2)):
            col_values2 = [
                row[col_idx]
                for row in table2[header_row_idx2 + 1:]
                if col_idx < len(row)
            ]
            col_type = self.detect_column_type(col_values2)
            column_types2.append(col_type)
            print(f" 文件2列 {col_idx + 1} ({headers2[col_idx]}): {col_type}")
        # Collect column-type mismatches; comparison only stops if too many columns disagree
        mismatched_columns = []
        for col_idx in range(min(len(column_types1), len(column_types2))):
            if column_types1[col_idx] != column_types2[col_idx]:
                mismatched_columns.append(col_idx)
                differences.append({
                    'type': 'table_column_type_mismatch',
                    'position': f'第{col_idx + 1}列',
                    'file1_value': f'{headers1[col_idx]} ({column_types1[col_idx]})',
                    'file2_value': f'{headers2[col_idx]} ({column_types2[col_idx]})',
                    'description': f'列类型不一致: {column_types1[col_idx]} vs {column_types2[col_idx]}',
                    'severity': 'high',
                    'column_index': col_idx
                })
        # Ratio of mismatched column types
        total_columns = min(len(column_types1), len(column_types2))
        mismatch_ratio = len(mismatched_columns) / total_columns if total_columns > 0 else 0
        # Stop comparing cells only when more than 50% of the columns disagree
        if mismatch_ratio > 0.5:
            print(f"\n⚠️ 列类型差异过大 ({len(mismatched_columns)}/{total_columns} = {mismatch_ratio:.1%}),不再比较单元格内容...")
            # Add a summary difference
            differences.append({
                'type': 'table_header_critical',
                'position': '表格列类型',
                'file1_value': f'{len(mismatched_columns)}列类型不一致',
                'file2_value': f'共{total_columns}列',
                'description': f'列类型差异过大: {len(mismatched_columns)}/{total_columns}列不匹配 ({mismatch_ratio:.1%})',
                'severity': 'critical'
            })
            return differences
        elif mismatched_columns:
            print(f"\n⚠️ 检测到 {len(mismatched_columns)} 列类型差异,但仍继续比较单元格...")
            print(f" 不匹配的列: {[col_idx + 1 for col_idx in mismatched_columns]}")
        # Pick a type for each column, preferring the more general one
        column_types = []
        for col_idx in range(max(len(column_types1), len(column_types2))):
            if col_idx >= len(column_types1):
                column_types.append(column_types2[col_idx])
            elif col_idx >= len(column_types2):
                column_types.append(column_types1[col_idx])
            elif col_idx in mismatched_columns:
                # For conflicting columns, fall back to the more general type
                type1 = column_types1[col_idx]
                type2 = column_types2[col_idx]
                # Type priority: text > text_number > numeric/datetime
                if type1 == 'text' or type2 == 'text':
                    column_types.append('text')
                elif type1 == 'text_number' or type2 == 'text_number':
                    column_types.append('text_number')
                else:
                    # Default to file 1's type
                    column_types.append(type1)
                print(f" 📝 第{col_idx + 1}列类型冲突,使用通用类型: {column_types[-1]}")
            else:
                column_types.append(column_types1[col_idx])
        # Step 5: compare the data rows one by one
        data_rows1 = table1[header_row_idx1 + 1:]
        data_rows2 = table2[header_row_idx2 + 1:]
        max_rows = max(len(data_rows1), len(data_rows2))
        print(f"\n📊 开始逐行对比数据 (共{max_rows}行)...")
        for row_idx in range(max_rows):
            row1 = data_rows1[row_idx] if row_idx < len(data_rows1) else []
            row2 = data_rows2[row_idx] if row_idx < len(data_rows2) else []
            # Actual row number in the source table (offset by the header row)
            actual_row_num = header_row_idx1 + row_idx + 2
            if not row1:
                differences.append({
                    'type': 'table_row_missing',
                    'position': f'第{actual_row_num}行',
                    'file1_value': '',
                    'file2_value': ', '.join(row2),
                    'description': f'文件1缺少第{actual_row_num}行',
                    'severity': 'high',
                    'row_index': actual_row_num
                })
                continue
            if not row2:
                # Report a whole missing row as a single difference
                differences.append({
                    'type': 'table_row_missing',
                    'position': f'第{actual_row_num}行',
                    'file1_value': ', '.join(row1),
                    'file2_value': '',
                    'description': f'文件2缺少第{actual_row_num}行',
                    'severity': 'high',
                    'row_index': actual_row_num
                })
                continue
            # Compare column by column; each cell difference is reported individually
            max_cols = max(len(row1), len(row2))
            for col_idx in range(max_cols):
                cell1 = row1[col_idx] if col_idx < len(row1) else ''
                cell2 = row2[col_idx] if col_idx < len(row2) else ''
                # Skip image placeholders
                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
                    continue
                # Use the merged column type
                column_type = column_types[col_idx] if col_idx < len(column_types) else 'text'
                # Resolve the column name
                if header_result['match']:
                    column_name = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
                else:
                    col_name1 = headers1[col_idx] if col_idx < len(headers1) else f'列{col_idx + 1}'
                    col_name2 = headers2[col_idx] if col_idx < len(headers2) else f'列{col_idx + 1}'
                    column_name = f"{col_name1}/{col_name2}"
                # Note a type conflict in the description, if any
                type_mismatch_note = ""
                if col_idx in mismatched_columns:
                    type_mismatch_note = f" [列类型冲突: {column_types1[col_idx]} vs {column_types2[col_idx]}]"
                compare_result = self.compare_cell_value(cell1, cell2, column_type, column_name)
                if not compare_result['match']:
                    # Append the cell difference directly
                    diff_info = compare_result['difference']
                    differences.append({
                        'type': diff_info['type'],  # original type (table_amount, table_text, ...)
                        'position': f'第{actual_row_num}行第{col_idx + 1}列',
                        'file1_value': diff_info['value1'],
                        'file2_value': diff_info['value2'],
                        'description': diff_info['description'] + type_mismatch_note,
                        'severity': 'high' if col_idx in mismatched_columns else 'medium',  # type-conflicting cells are more severe
                        'row_index': actual_row_num,
                        'col_index': col_idx,
                        'column_name': column_name,
                        'column_type': column_type,
                        'column_type_mismatch': col_idx in mismatched_columns,
                        **{k: v for k, v in diff_info.items() if k not in ['type', 'value1', 'value2', 'description']}
                    })
                    print(f" ⚠️ 第{actual_row_num}行第{col_idx + 1}列({column_name}): {diff_info['description']}{type_mismatch_note}")
        print(f"\n✅ 流水表格对比完成,发现 {len(differences)} 个差异")
        return differences
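
    # Minimal usage sketch (assumed two-row tables with the header in the first row):
    #   t1 = [["交易时间", "金额"], ["2023-01-05", "100.00"]]
    #   t2 = [["交易时间", "金额"], ["2023-01-05", "100.50"]]
    #   OCRResultComparator().compare_table_flow_list(t1, t2)
    #   # -> one 'table_amount' difference at 第2行第2列 (amounts differ by 0.50)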

    def compare_tables_with_mode(self, table1: List[List[str]], table2: List[List[str]],
                                 mode: str = 'standard') -> List[Dict]:
        """Dispatch to the table comparison algorithm selected by mode."""
        if mode == 'flow_list':
            return self.compare_table_flow_list(table1, table2)
        else:
            return self.compare_tables(table1, table2)

    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
        """Compare two files, honoring the configured table comparison mode."""
        # Read both files
        with open(file1_path, 'r', encoding='utf-8') as f:
            content1 = f.read()
        with open(file2_path, 'r', encoding='utf-8') as f:
            content2 = f.read()
        # Extract tables and paragraphs
        tables1 = self.extract_table_data(content1)
        tables2 = self.extract_table_data(content2)
        paras1 = self.extract_paragraphs(content1)
        paras2 = self.extract_paragraphs(content2)
        # Collected differences
        all_differences = []
        # Compare tables with the selected mode
        # (note: only the first table of each file is compared)
        if tables1 and tables2:
            table_diffs = self.compare_tables_with_mode(
                tables1[0], tables2[0],
                mode=self.table_comparison_mode
            )
            all_differences.extend(table_diffs)
        elif tables1 and not tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': '表格结构',
                'file1_value': f'包含{len(tables1)}个表格',
                'file2_value': '无表格',
                'description': '文件1包含表格但文件2无表格',
                'severity': 'high'
            })
        elif not tables1 and tables2:
            all_differences.append({
                'type': 'table_structure',
                'position': '表格结构',
                'file1_value': '无表格',
                'file2_value': f'包含{len(tables2)}个表格',
                'description': '文件2包含表格但文件1无表格',
                'severity': 'high'
            })
        # Paragraph comparison with flexible matching
        para_diffs = self.compare_paragraphs_with_flexible_matching(paras1, paras2)
        all_differences.extend(para_diffs)
        # Statistics, broken down by difference type
        stats = {
            'total_differences': len(all_differences),
            'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
            'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
            'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount']),
            'datetime_differences': len([d for d in all_differences if d['type'] == 'table_datetime']),
            'text_differences': len([d for d in all_differences if d['type'] == 'table_text']),
            'table_pre_header': len([d for d in all_differences if d['type'] == 'table_pre_header']),
            'table_header_mismatch': len([d for d in all_differences if d['type'] == 'table_header_mismatch']),
            'table_header_critical': len([d for d in all_differences if d['type'] == 'table_header_critical']),
            'table_header_position': len([d for d in all_differences if d['type'] == 'table_header_position']),
            'table_row_missing': len([d for d in all_differences if d['type'] == 'table_row_missing']),
            'high_severity': len([d for d in all_differences if d.get('severity') in ('critical', 'high')]),
            'medium_severity': len([d for d in all_differences if d.get('severity') == 'medium']),
            'low_severity': len([d for d in all_differences if d.get('severity') == 'low'])
        }
        result = {
            'differences': all_differences,
            'statistics': stats,
            'file1_tables': len(tables1),
            'file2_tables': len(tables2),
            'file1_paragraphs': len(paras1),
            'file2_paragraphs': len(paras2),
            'file1_path': file1_path,
            'file2_path': file2_path,
        }
        return result

    def generate_json_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a JSON report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(comparison_result, f, ensure_ascii=False, indent=2)

    def generate_markdown_report(self, comparison_result: Dict, output_file: str):
        """Write the comparison result as a Markdown report."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR结果对比报告\n\n")
            # Basic information
            f.write("## 基本信息\n\n")
            f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
            f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
            f.write(f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n")
            # Statistics
            stats = comparison_result['statistics']
            f.write("## 统计信息\n\n")
            f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
            f.write(f"- 表格差异: **{stats['table_differences']}**\n")
            f.write(f"- 其中表格金额差异: **{stats['amount_differences']}**\n")
            f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
            f.write(f"- 高严重度: **{stats['high_severity']}**\n")
            f.write(f"- 中严重度: **{stats['medium_severity']}**\n")
            f.write(f"- 低严重度: **{stats['low_severity']}**\n")
            f.write(f"- 文件1表格数: {comparison_result['file1_tables']}\n")
            f.write(f"- 文件2表格数: {comparison_result['file2_tables']}\n")
            f.write(f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n")
            f.write(f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n")
            # Summary
            if stats['total_differences'] == 0:
                f.write("## 结论\n\n")
                f.write("🎉 **完美匹配!没有发现任何差异。**\n\n")
            else:
                f.write("## 差异摘要\n\n")
                # Map difference types to display names
                type_name_map = {
                    'table_amount': '💰 表格金额差异',
                    'table_text': '📝 表格文本差异',
                    'table_pre_header': '📋 表头前内容差异',
                    'table_header_position': '📍 表头位置差异',
                    'table_header_critical': '❌ 表头严重错误',
                    'table_row_missing': '🚫 表格行缺失',
                    'table_row_data': '📊 表格数据差异',
                    'table_structure': '🏗️ 表格结构差异',
                    'paragraph': '📄 段落差异'
                }
                # Group differences by type
                diff_by_type = {}
                for diff in comparison_result['differences']:
                    diff_type = diff['type']
                    if diff_type not in diff_by_type:
                        diff_by_type[diff_type] = []
                    diff_by_type[diff_type].append(diff)
                for diff_type, diffs in diff_by_type.items():
                    type_name = type_name_map.get(diff_type, f'❓ {diff_type}')
                    f.write(f"### {type_name} ({len(diffs)}个)\n\n")
                    for i, diff in enumerate(diffs, 1):
                        f.write(f"**{i}. {diff['position']}**\n")
                        f.write(f"- 文件1: `{diff['file1_value']}`\n")
                        f.write(f"- 文件2: `{diff['file2_value']}`\n")
                        f.write(f"- 说明: {diff['description']}\n")
                        if 'severity' in diff:
                            severity_icon = {'critical': '🔴', 'high': '🟠', 'medium': '🟡', 'low': '🟢'}
                            f.write(f"- 严重度: {severity_icon.get(diff['severity'], '⚪')} {diff['severity']}\n")
                        f.write("\n")
            # Full difference table
            if comparison_result['differences']:
                f.write("## 详细差异列表\n\n")
                f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 | 严重度 |\n")
                f.write("| --- | --- | --- | --- | --- | --- | --- |\n")
                for i, diff in enumerate(comparison_result['differences'], 1):
                    severity = diff.get('severity', 'N/A')
                    f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
                    f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
                    f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
                    f.write(f"{diff['description']} | {severity} |\n")


def compare_ocr_results(file1_path: str, file2_path: str, output_file: str = "comparison_report",
                        output_format: str = "markdown", ignore_images: bool = True,
                        table_mode: str = 'standard', similarity_algorithm: str = 'ratio') -> Dict:
    """Compare two OCR result files.

    Args:
        file1_path: path to the first OCR result file
        file2_path: path to the second OCR result file
        output_file: output file name (without extension)
        output_format: output format ('json', 'markdown', 'both')
        ignore_images: whether to ignore image content (currently informational
            only; image content is always masked out during extraction)
        table_mode: table comparison mode ('standard', 'flow_list')
        similarity_algorithm: similarity algorithm ('ratio', 'partial_ratio',
            'token_sort_ratio', 'token_set_ratio')
    """
    comparator = OCRResultComparator()
    comparator.table_comparison_mode = table_mode
    # Select the similarity algorithm
    if similarity_algorithm == 'partial_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.partial_ratio(t1, t2)
    elif similarity_algorithm == 'token_sort_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.token_sort_ratio(t1, t2)
    elif similarity_algorithm == 'token_set_ratio':
        comparator.calculate_text_similarity = lambda t1, t2: fuzz.token_set_ratio(t1, t2)
    print("🔍 开始对比OCR结果...")
    print(f"📄 文件1: {file1_path}")
    print(f"📄 文件2: {file2_path}")
    print(f"📊 表格模式: {table_mode}")
    print(f"🔧 相似度算法: {similarity_algorithm}")
    try:
        # Run the comparison
        result = comparator.compare_files(file1_path, file2_path)
        # Add a timestamp
        result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Generate the reports
        if output_format in ['json', 'both']:
            json_file = f"{output_file}.json"
            comparator.generate_json_report(result, json_file)
            print(f"📄 JSON报告已保存至: {json_file}")
        if output_format in ['markdown', 'both']:
            md_file = f"{output_file}.md"
            comparator.generate_markdown_report(result, md_file)
            print(f"📝 Markdown报告已保存至: {md_file}")
        # Print a short summary
        print(f"\n📊 对比完成!")
        print(f" 总差异数: {result['statistics']['total_differences']}")
        print(f" 表格差异: {result['statistics']['table_differences']}")
        print(f" 其中表格金额差异: {result['statistics']['amount_differences']}")
        print(f" 段落差异: {result['statistics']['paragraph_differences']}")
        # Print the first few important differences
        if result['differences']:
            print(f"\n🔍 前3个重要差异:")
            for i, diff in enumerate(result['differences'][:3], 1):
                print(f" {i}. {diff['position']}: {diff['description']}")
                print(f" 文件1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
                print(f" 文件2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
        else:
            print(f"\n🎉 恭喜!两个文件内容完全一致!")
        # Processing statistics (in the style of ocr_by_vlm.py)
        print("\n📊 对比处理统计")
        print(f" 文件1路径: {result['file1_path']}")
        print(f" 文件2路径: {result['file2_path']}")
        print(f" 输出文件: {output_file}")
        print(f" 输出格式: {output_format}")
        print(f" 忽略图片: {ignore_images}")
        print(f" 处理时间: {result['timestamp']}")
        print(f" 文件1表格数: {result['file1_tables']}")
        print(f" 文件2表格数: {result['file2_tables']}")
        print(f" 文件1段落数: {result['file1_paragraphs']}")
        print(f" 文件2段落数: {result['file2_paragraphs']}")
        return result
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise RuntimeError(f"OCR对比任务失败: {e}") from e
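

# Minimal usage sketch (hypothetical paths; compare_ocr_results is the public
# entry point and writes the reports next to the given output name):
#   compare_ocr_results("ocr_a.md", "ocr_b.md", output_file="report",
#                       output_format="both", table_mode="flow_list")
#   # -> writes report.json and report.md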


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='OCR结果对比工具')
    parser.add_argument('file1', nargs='?', help='第一个OCR结果文件路径')
    parser.add_argument('file2', nargs='?', help='第二个OCR结果文件路径')
    parser.add_argument('-o', '--output', default='comparison_report', help='输出文件名')
    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'],
                        default='markdown', help='输出格式')
    parser.add_argument('--ignore-images', action='store_true', help='忽略图片内容')
    parser.add_argument('--table-mode', choices=['standard', 'flow_list'],
                        default='standard', help='表格比较模式')
    parser.add_argument('--similarity-algorithm',
                        choices=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
                        default='ratio', help='相似度算法')
    args = parser.parse_args()
    if args.file1 and args.file2:
        result = compare_ocr_results(
            file1_path=args.file1,
            file2_path=args.file2,
            output_file=args.output,
            output_format=args.format,
            ignore_images=args.ignore_images,
            table_mode=args.table_mode,
            similarity_algorithm=args.similarity_algorithm
        )
    else:
        # Fallback: local smoke test for the flow-list table comparison
        result = compare_ocr_results(
            file1_path='/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results/A用户_单元格扫描流水_page_005.md',
            file2_path='/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_DotsOCR_Results/A用户_单元格扫描流水_page_005.md',
            output_file=f'./output/flow_list_comparison_{time.strftime("%Y%m%d_%H%M%S")}',
            output_format='both',
            ignore_images=True,
            table_mode='flow_list',  # flow-list table mode
            similarity_algorithm='ratio'
        )