# streamlit_validator_result.py
  1. """
  2. 验证结果展示模块
  3. """
  4. import streamlit as st
  5. import pandas as pd
  6. import plotly.express as px
  7. from io import BytesIO
  8. import json
  9. from pathlib import Path
  10. from ocr_validator_utils import process_all_images_in_content
  11. def display_single_page_cross_validation(validator, config):
  12. """显示单页交叉验证结果
  13. Args:
  14. validator: OCR验证器实例
  15. config: 配置字典
  16. """
  17. current_md_path = Path(validator.file_paths[validator.selected_file_index]).with_suffix('.md')
  18. pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
  19. comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json"
  20. verify_md_path = validator.find_verify_md_path(validator.selected_file_index)
  21. # 检查验证结果是否与当前数据源匹配
  22. result_is_valid = False
  23. comparison_result = None
  24. if comparison_result_path.exists():
  25. try:
  26. with open(comparison_result_path, "r", encoding="utf-8") as f:
  27. comparison_result = json.load(f)
  28. # 检查文件路径是否匹配(验证结果是否为当前页面生成)
  29. if (comparison_result.get('file1_path') == str(current_md_path) and
  30. comparison_result.get('file2_path') == str(verify_md_path)):
  31. result_is_valid = True
  32. except Exception as e:
  33. st.error(f"读取验证结果失败: {e}")
  34. if result_is_valid:
  35. _display_valid_cross_validation_result(
  36. validator, config, current_md_path, verify_md_path, comparison_result
  37. )
  38. else:
  39. _display_no_validation_result_prompt(validator)
  40. def _display_valid_cross_validation_result(validator, config, current_md_path, verify_md_path, comparison_result):
  41. """显示有效的交叉验证结果
  42. Args:
  43. validator: OCR验证器实例
  44. config: 配置字典
  45. current_md_path: 当前OCR文件路径
  46. verify_md_path: 验证文件路径
  47. comparison_result: 对比结果字典
  48. """
  49. col1, col2 = st.columns([1, 1])
  50. # 左侧:原OCR识别结果
  51. with col1:
  52. st.subheader("🤖 原OCR识别结果")
  53. if current_md_path.exists():
  54. with open(current_md_path, "r", encoding="utf-8") as f:
  55. original_md_content = f.read()
  56. original_md_content = process_all_images_in_content(original_md_content, current_md_path)
  57. font_size = config['styles'].get('font_size', 10)
  58. height = config['styles']['layout'].get('default_height', 800)
  59. validator.layout_manager.render_content_by_mode(
  60. original_md_content, "HTML渲染", font_size, height, "compact"
  61. )
  62. else:
  63. st.error("原OCR文件不存在")
  64. # 右侧:验证识别结果
  65. with col2:
  66. st.subheader("🤖 验证识别结果")
  67. if verify_md_path and verify_md_path.exists():
  68. with open(str(verify_md_path), "r", encoding="utf-8") as f:
  69. verify_md_content = f.read()
  70. verify_md_content = process_all_images_in_content(verify_md_content, verify_md_path)
  71. font_size = config['styles'].get('font_size', 10)
  72. height = config['styles']['layout'].get('default_height', 800)
  73. validator.layout_manager.render_content_by_mode(
  74. verify_md_content, "HTML渲染", font_size, height, "compact"
  75. )
  76. else:
  77. st.warning("验证文件不存在")
  78. st.markdown("---")
  79. # 显示详细的对比结果
  80. display_comparison_results(comparison_result, detailed=True)
  81. def _display_no_validation_result_prompt(validator):
  82. """显示无验证结果的提示信息
  83. Args:
  84. validator: OCR验证器实例
  85. """
  86. st.info("💡 暂无当前页面的交叉验证结果,请点击上方「交叉验证」按钮运行验证")
  87. # 显示当前数据源信息
  88. col1, col2 = st.columns(2)
  89. with col1:
  90. st.write("**当前OCR数据源:**")
  91. from ocr_validator_utils import get_data_source_display_name
  92. if validator.current_source_config and validator.file_info:
  93. current_source_name = get_data_source_display_name(validator.current_source_config)
  94. current_page = validator.file_info[validator.selected_file_index]['page']
  95. st.code(f"{current_source_name}\n第 {current_page} 页")
  96. else:
  97. st.warning("未选择OCR数据源")
  98. with col2:
  99. st.write("**当前验证数据源:**")
  100. if validator.verify_source_config:
  101. from ocr_validator_utils import get_data_source_display_name
  102. verify_source_name = get_data_source_display_name(validator.verify_source_config)
  103. st.code(verify_source_name)
  104. else:
  105. st.warning("未选择验证数据源")
  106. # 添加操作提示
  107. st.markdown("---")
  108. st.markdown("""
  109. ### 📝 操作步骤:
  110. 1. **选择数据源**: 在页面顶部选择不同的OCR数据源和验证数据源
  111. 2. **运行验证**: 点击「交叉验证」按钮开始批量验证
  112. 3. **查看结果**: 验证完成后,在此处查看详细对比结果
  113. 💡 **提示**:
  114. - 确保两个数据源包含相同页码的文件
  115. - 建议选择不同OCR工具的结果进行交叉验证
  116. - 验证结果会自动保存,可随时查看
  117. """)
  118. def display_comparison_results(comparison_result: dict, detailed: bool = True):
  119. """显示对比结果
  120. Args:
  121. comparison_result: 对比结果字典
  122. detailed: 是否显示详细信息
  123. """
  124. st.header("📊 交叉验证结果")
  125. stats = comparison_result['statistics']
  126. # 显示主要指标
  127. col1, col2, col3, col4 = st.columns(4)
  128. with col1:
  129. st.metric("总差异数", stats['total_differences'])
  130. with col2:
  131. st.metric("表格差异", stats['table_differences'])
  132. with col3:
  133. st.metric("金额差异", stats.get('amount_differences', 0))
  134. with col4:
  135. st.metric("段落差异", stats['paragraph_differences'])
  136. # 根据差异数量显示不同的提示
  137. if stats['total_differences'] == 0:
  138. st.success("🎉 完美匹配!两个数据源结果完全一致")
  139. else:
  140. st.warning(f"⚠️ 发现 {stats['total_differences']} 个差异,建议人工检查")
  141. if comparison_result['differences'] and detailed:
  142. _display_differences_dataframe(comparison_result)
  143. _display_difference_details(comparison_result)
  144. _display_difference_charts(comparison_result)
  145. _provide_download_options(comparison_result)
  146. def _display_differences_dataframe(comparison_result: dict):
  147. """显示差异DataFrame"""
  148. st.subheader("🔍 差异详情对比")
  149. diff_data = []
  150. for i, diff in enumerate(comparison_result['differences'], 1):
  151. diff_data.append({
  152. '序号': i,
  153. '位置': diff['position'],
  154. '类型': diff['type'],
  155. '原OCR结果': diff['file1_value'][:100] + ('...' if len(diff['file1_value']) > 100 else ''),
  156. '验证结果': diff['file2_value'][:100] + ('...' if len(diff['file2_value']) > 100 else ''),
  157. '描述': diff['description'][:80] + ('...' if len(diff['description']) > 80 else ''),
  158. '严重程度': _get_severity_level(diff)
  159. })
  160. df_differences = pd.DataFrame(diff_data)
  161. def highlight_severity(val):
  162. if val == '高':
  163. return 'background-color: #ffebee; color: #c62828'
  164. elif val == '中':
  165. return 'background-color: #fff3e0; color: #ef6c00'
  166. elif val == '低':
  167. return 'background-color: #e8f5e8; color: #2e7d32'
  168. return ''
  169. styled_df = df_differences.style.applymap(
  170. highlight_severity,
  171. subset=['严重程度']
  172. ).format({'序号': '{:d}'})
  173. st.dataframe(styled_df, width='stretch', height=400, hide_index=True)
  174. def _display_difference_details(comparison_result: dict):
  175. """显示详细差异"""
  176. st.subheader("🔍 详细差异查看")
  177. selected_diff_index = st.selectbox(
  178. "选择要查看的差异:",
  179. options=range(len(comparison_result['differences'])),
  180. format_func=lambda x: f"差异 {x+1}: {comparison_result['differences'][x]['position']} - {comparison_result['differences'][x]['type']}",
  181. key="selected_diff"
  182. )
  183. if selected_diff_index is not None:
  184. diff = comparison_result['differences'][selected_diff_index]
  185. col1, col2 = st.columns(2)
  186. with col1:
  187. st.write("**原OCR结果:**")
  188. st.text_area("原OCR结果详情", value=diff['file1_value'], height=200,
  189. key=f"original_{selected_diff_index}", label_visibility="collapsed")
  190. with col2:
  191. st.write("**验证结果:**")
  192. st.text_area("验证结果详情", value=diff['file2_value'], height=200,
  193. key=f"verify_{selected_diff_index}", label_visibility="collapsed")
  194. st.info(f"**位置:** {diff['position']}")
  195. st.info(f"**类型:** {diff['type']}")
  196. st.info(f"**描述:** {diff['description']}")
  197. st.info(f"**严重程度:** {_get_severity_level(diff)}")
  198. def _display_difference_charts(comparison_result: dict):
  199. """显示差异统计图表"""
  200. st.subheader("📈 差异类型分布")
  201. type_counts = {}
  202. severity_counts = {'高': 0, '中': 0, '低': 0}
  203. for diff in comparison_result['differences']:
  204. diff_type = diff['type']
  205. type_counts[diff_type] = type_counts.get(diff_type, 0) + 1
  206. severity = _get_severity_level(diff)
  207. severity_counts[severity] += 1
  208. col1, col2 = st.columns(2)
  209. with col1:
  210. if type_counts:
  211. fig_type = px.pie(
  212. values=list(type_counts.values()),
  213. names=list(type_counts.keys()),
  214. title="差异类型分布"
  215. )
  216. st.plotly_chart(fig_type, width='stretch')
  217. with col2:
  218. fig_severity = px.bar(
  219. x=list(severity_counts.keys()),
  220. y=list(severity_counts.values()),
  221. title="差异严重程度分布",
  222. color=list(severity_counts.keys()),
  223. color_discrete_map={'高': '#f44336', '中': '#ff9800', '低': '#4caf50'}
  224. )
  225. st.plotly_chart(fig_severity, width='stretch')
  226. def _provide_download_options(comparison_result: dict):
  227. """提供下载选项"""
  228. st.subheader("📥 导出验证结果")
  229. col1, col2, col3 = st.columns(3)
  230. with col1:
  231. if comparison_result['differences']:
  232. diff_data = []
  233. for i, diff in enumerate(comparison_result['differences'], 1):
  234. diff_data.append({
  235. '序号': i,
  236. '位置': diff['position'],
  237. '类型': diff['type'],
  238. '原OCR结果': diff['file1_value'],
  239. '验证结果': diff['file2_value'],
  240. '描述': diff['description'],
  241. '严重程度': _get_severity_level(diff)
  242. })
  243. df_export = pd.DataFrame(diff_data)
  244. excel_buffer = BytesIO()
  245. df_export.to_excel(excel_buffer, index=False, sheet_name='差异详情')
  246. st.download_button(
  247. label="📊 下载差异详情(Excel)",
  248. data=excel_buffer.getvalue(),
  249. file_name=f"comparison_differences_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
  250. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  251. key="download_differences_excel"
  252. )
  253. with col2:
  254. stats_data = {
  255. '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'],
  256. '数量': [
  257. comparison_result['statistics']['total_differences'],
  258. comparison_result['statistics']['table_differences'],
  259. comparison_result['statistics'].get('amount_differences', 0),
  260. comparison_result['statistics']['paragraph_differences']
  261. ]
  262. }
  263. df_stats = pd.DataFrame(stats_data)
  264. csv_stats = df_stats.to_csv(index=False)
  265. st.download_button(
  266. label="📈 下载统计报告(CSV)",
  267. data=csv_stats,
  268. file_name=f"comparison_stats_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
  269. mime="text/csv",
  270. key="download_stats_csv"
  271. )
  272. with col3:
  273. report_json = json.dumps(comparison_result, ensure_ascii=False, indent=2)
  274. st.download_button(
  275. label="📄 下载完整报告(JSON)",
  276. data=report_json,
  277. file_name=f"comparison_full_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
  278. mime="application/json",
  279. key="download_full_json"
  280. )
  281. def _get_severity_level(diff: dict) -> str:
  282. """判断严重程度
  283. Args:
  284. diff: 差异字典
  285. Returns:
  286. 严重程度: '高', '中', '低'
  287. """
  288. if 'severity' in diff:
  289. severity_map = {'critical': '高', 'high': '高', 'medium': '中', 'low': '低'}
  290. return severity_map.get(diff['severity'], '中')
  291. diff_type = diff['type'].lower()
  292. # 金额和数字类差异为高严重度
  293. if 'amount' in diff_type or 'number' in diff_type:
  294. return '高'
  295. # 表格和结构类差异为中严重度
  296. if 'table' in diff_type or 'structure' in diff_type:
  297. return '中'
  298. # 根据相似度判断
  299. if 'similarity' in diff:
  300. similarity = diff['similarity']
  301. if similarity < 50:
  302. return '高'
  303. elif similarity < 85:
  304. return '中'
  305. else:
  306. return '低'
  307. # 根据长度差异判断
  308. len_diff = abs(len(diff['file1_value']) - len(diff['file2_value']))
  309. if len_diff > 50:
  310. return '高'
  311. elif len_diff > 10:
  312. return '中'
  313. else:
  314. return '低'