|
|
@@ -532,133 +532,456 @@ class StreamlitOCRValidator:
|
|
|
|
|
|
@st.dialog("交叉验证", width="large", dismissible=True, on_dismiss="rerun")
def cross_validation(self):
    """Render the batch cross-validation dialog.

    Shows both data sources with their file counts, lets the user choose the
    table comparison mode and the similarity algorithm, runs the batch
    comparison when the start button is pressed, and finally renders the
    batch result kept in ``st.session_state`` when one is available.
    """
    # Comparing a data source against itself is rejected up front.
    if self.current_source_key == self.verify_source_key:
        st.error("❌ OCR数据源和验证数据源不能相同")
        return

    # Make sure the session slot that holds the batch result exists.
    if 'cross_validation_batch_result' not in st.session_state:
        st.session_state.cross_validation_batch_result = None

    st.header("🔄 批量交叉验证")

    # Data source overview: OCR source on the left, verification source on the right.
    ocr_column, verify_column = st.columns(2)
    with ocr_column:
        ocr_source_name = get_data_source_display_name(self.current_source_config)
        st.info(f"**OCR数据源:** {ocr_source_name}")
        st.write(f"📁 文件数量: {len(self.file_info)}")
    with verify_column:
        verify_source_name = get_data_source_display_name(self.verify_source_config)
        st.info(f"**验证数据源:** {verify_source_name}")
        st.write(f"📁 文件数量: {len(self.verify_file_info)}")

    # Display label per table mode; every mode not listed falls back to the
    # "standard" label.
    table_mode_labels = {'flow_list': '流水表格模式'}

    # Batch validation options.
    with st.expander("⚙️ 验证选项", expanded=True):
        mode_column, algorithm_column = st.columns(2)
        with mode_column:
            table_mode = st.selectbox(
                "表格比对模式",
                options=['standard', 'flow_list'],
                index=1,  # flow_list is the default
                format_func=lambda mode: table_mode_labels.get(mode, '标准模式'),
                help="选择表格比对算法",
            )
        with algorithm_column:
            similarity_algorithm = st.selectbox(
                "相似度算法",
                options=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
                index=0,
                help="选择文本相似度计算算法",
            )

    # Start button: runs the whole batch within this script run.
    start_clicked = st.button("🚀 开始批量验证", type="primary", use_container_width=True)
    if start_clicked:
        self._run_batch_cross_validation(table_mode, similarity_algorithm)

    # Show the stored batch result (from this run or an earlier one).
    stored_result = st.session_state.get('cross_validation_batch_result')
    if stored_result:
        st.markdown("---")
        self._display_batch_validation_results(stored_result)
|
|
|
+
|
|
|
def _generate_batch_validation_markdown(self, batch_results: dict, output_path: str,
                                        *, high_diff_threshold: int = 10):
    """Write the batch cross-validation report to a Markdown file.

    The report has four parts: basic information, summary statistics, a
    per-page table, and an issue digest (pages above the difference
    threshold, pages with amount differences, pages with critical header
    errors, and failed pages).

    Args:
        batch_results: Batch result dictionary. Reads the top-level keys
            ``ocr_source``, ``verify_source``, ``table_mode``,
            ``similarity_algorithm``, ``timestamp``, ``summary`` and
            ``pages``. Each page needs ``page_num`` and ``status``;
            successful pages also need ``total_differences``,
            ``table_differences`` and ``paragraph_differences``. The
            remaining counters default to 0 when absent.
        output_path: Destination file. Missing parent directories are created.
        high_diff_threshold: A successful page is listed as "high
            difference" when its total exceeds this value (default 10).

    Raises:
        KeyError: If a required key is missing. All keys are read before the
            file is opened, so no partial report is left behind.
    """
    from pathlib import Path

    summary = batch_results['summary']
    pages = batch_results['pages']

    # (label, summary key) for the detail bullets; absent keys count as 0.
    table_detail_items = (
        ("金额差异", 'total_amount_differences'),
        ("日期差异", 'total_datetime_differences'),
        ("文本差异", 'total_text_differences'),
        ("表头前差异", 'total_table_pre_header'),
        ("表头位置差异", 'total_table_header_position'),
        ("表头严重错误", 'total_table_header_critical'),
        ("行缺失", 'total_table_row_missing'),
    )
    severity_items = (
        ("高严重度", 'total_high_severity'),
        ("中严重度", 'total_medium_severity'),
        ("低严重度", 'total_low_severity'),
    )

    # Basic information.
    lines = [
        "# 批量交叉验证报告\n\n",
        "## 📋 基本信息\n\n",
        f"- **OCR数据源:** {batch_results['ocr_source']}\n",
        f"- **验证数据源:** {batch_results['verify_source']}\n",
        f"- **表格模式:** {batch_results['table_mode']}\n",
        f"- **相似度算法:** {batch_results['similarity_algorithm']}\n",
        f"- **验证时间:** {batch_results['timestamp']}\n\n",
    ]

    # Summary statistics. Detail bullets are indented two spaces so that
    # Markdown nests them under their parent bullet.
    lines += [
        "## 📊 汇总统计\n\n",
        f"- **总页数:** {summary['total_pages']}\n",
        f"- **成功页数:** {summary['successful_pages']}\n",
        f"- **失败页数:** {summary['failed_pages']}\n",
        f"- **总差异数:** {summary['total_differences']}\n",
        f"- **表格差异:** {summary['total_table_differences']}\n",
    ]
    lines += [f"  - {label}: {summary.get(key, 0)}\n" for label, key in table_detail_items]
    lines.append(f"- **段落差异:** {summary['total_paragraph_differences']}\n")
    lines.append("- **严重程度统计:**\n")
    lines += [f"  - {label}: {summary.get(key, 0)}\n" for label, key in severity_items]
    lines.append("\n")

    # Per-page table.
    lines += [
        "## 📄 各页差异统计\n\n",
        "| 页码 | 状态 | 总差异 | 表格差异 | 金额 | 日期 | 文本 | 段落 | 表头前 | 表头位置 | 表头错误 | 行缺失 | 高 | 中 | 低 |\n",
        "|------|------|--------|----------|------|------|------|------|--------|----------|----------|--------|----|----|----|\n",
    ]
    for page in pages:
        if page['status'] == 'success':
            status_icon = "✅" if page['total_differences'] == 0 else "⚠️"
            cells = [
                page['total_differences'],
                page['table_differences'],
                page.get('amount_differences', 0),
                page.get('datetime_differences', 0),
                page.get('text_differences', 0),
                page['paragraph_differences'],
                page.get('table_pre_header', 0),
                page.get('table_header_position', 0),
                page.get('table_header_critical', 0),
                page.get('table_row_missing', 0),
                page.get('high_severity', 0),
                page.get('medium_severity', 0),
                page.get('low_severity', 0),
            ]
        else:
            # Pages that were not compared get a dash in all 13 counter columns.
            status_icon = "❌"
            cells = ["-"] * 13
        row = " | ".join(str(cell) for cell in cells)
        lines.append(f"| {page['page_num']} | {status_icon} | {row} |\n")
    lines.append("\n")

    # Issue digest.
    lines.append("## 🔍 问题汇总\n\n")
    successful = [p for p in pages if p['status'] == 'success']

    high_diff_pages = [p for p in successful if p['total_differences'] > high_diff_threshold]
    if high_diff_pages:
        lines.append(f"### ⚠️ 高差异页面(差异>{high_diff_threshold})\n\n")
        lines += [f"- 第 {p['page_num']} 页:{p['total_differences']} 个差异\n"
                  for p in high_diff_pages]
        lines.append("\n")

    amount_error_pages = [p for p in successful if p.get('amount_differences', 0) > 0]
    if amount_error_pages:
        lines.append("### 💰 金额差异页面\n\n")
        lines += [f"- 第 {p['page_num']} 页:{p.get('amount_differences', 0)} 个金额差异\n"
                  for p in amount_error_pages]
        lines.append("\n")

    header_error_pages = [p for p in successful if p.get('table_header_critical', 0) > 0]
    if header_error_pages:
        lines.append("### ❌ 表头严重错误页面\n\n")
        lines += [f"- 第 {p['page_num']} 页:{p['table_header_critical']} 个表头错误\n"
                  for p in header_error_pages]
        lines.append("\n")

    failed_pages = [p for p in pages if p['status'] == 'failed']
    if failed_pages:
        lines.append("### 💥 验证失败页面\n\n")
        for p in failed_pages:
            # Keep the message on one line; an embedded newline would end
            # the Markdown bullet early.
            error_text = " ".join(str(p.get('error', '未知错误')).splitlines())
            lines.append(f"- 第 {p['page_num']} 页:{error_text}\n")
        lines.append("\n")

    # Write everything in one go, creating the target directory if needed.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
|
|
|
+
|
|
|
def _run_batch_cross_validation(self, table_mode: str, similarity_algorithm: str):
    """Compare every page present in both data sources and store the batch result.

    Pages are matched by the ``page`` value of ``self.file_info`` and
    ``self.verify_file_info``. For each common page the two ``.md`` files are
    compared with ``compare_ocr_results``; per-page statistics are collected
    and summed into a summary. The batch result is written as JSON and
    Markdown into the pre-validation output directory and stored in
    ``st.session_state.cross_validation_batch_result``.

    A page that cannot be compared (missing file, or an exception during the
    comparison) is recorded in ``pages`` with ``status == 'failed'`` and an
    ``error`` message, so ``failed_pages`` always matches the failed entries.

    Args:
        table_mode: Table comparison mode passed to ``compare_ocr_results``.
        similarity_algorithm: Similarity algorithm passed to
            ``compare_ocr_results``.
    """
    import contextlib
    import io

    # Prepare the output directory.
    pre_validation_dir = Path(self.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
    pre_validation_dir.mkdir(parents=True, exist_ok=True)

    # Map page number -> index into the corresponding file list.
    ocr_page_map = {info['page']: i for i, info in enumerate(self.file_info)}
    verify_page_map = {info['page']: i for i, info in enumerate(self.verify_file_info)}

    # Only pages that exist in both sources can be compared. Bail out before
    # any progress widget is created so no empty progress bar is left behind.
    common_pages = sorted(set(ocr_page_map) & set(verify_page_map))
    if not common_pages:
        st.error("❌ 两个数据源没有共同的页码,无法进行对比")
        return

    # (per-page statistic, summary total it feeds, required in the statistics).
    # A missing required statistic fails the page; optional ones default to 0.
    stat_fields = (
        ('total_differences', 'total_differences', True),
        ('table_differences', 'total_table_differences', True),
        ('amount_differences', 'total_amount_differences', False),
        ('datetime_differences', 'total_datetime_differences', False),
        ('text_differences', 'total_text_differences', False),
        ('paragraph_differences', 'total_paragraph_differences', True),
        ('table_pre_header', 'total_table_pre_header', False),
        ('table_header_position', 'total_table_header_position', False),
        ('table_header_critical', 'total_table_header_critical', False),
        ('table_row_missing', 'total_table_row_missing', False),
        ('high_severity', 'total_high_severity', False),
        ('medium_severity', 'total_medium_severity', False),
        ('low_severity', 'total_low_severity', False),
    )

    summary = {
        'total_pages': len(common_pages),
        'successful_pages': 0,
        'failed_pages': 0,
    }
    summary.update({summary_key: 0 for _, summary_key, _ in stat_fields})

    batch_results = {
        'ocr_source': get_data_source_display_name(self.current_source_config),
        'verify_source': get_data_source_display_name(self.verify_source_config),
        'table_mode': table_mode,
        'similarity_algorithm': similarity_algorithm,
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        'pages': [],
        'summary': summary,
    }

    def record_failure(failed_page_num, message):
        """Add a failed page entry and count it in the summary."""
        batch_results['pages'].append({
            'page_num': failed_page_num,
            'status': 'failed',
            'error': message,
        })
        summary['failed_pages'] += 1

    # Progress widgets.
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Container for the per-page log lines.
    with st.expander("📋 详细对比日志", expanded=True):
        log_container = st.container()

    # Compare page by page.
    for idx, page_num in enumerate(common_pages):
        try:
            # Update progress.
            progress_bar.progress((idx + 1) / len(common_pages))
            status_text.text(f"正在对比第 {page_num} 页... ({idx + 1}/{len(common_pages)})")

            # Resolve the Markdown file of this page in both sources.
            ocr_md_path = Path(self.file_paths[ocr_page_map[page_num]]).with_suffix('.md')
            verify_md_path = Path(self.verify_file_paths[verify_page_map[page_num]]).with_suffix('.md')

            missing_files = [path.name for path in (ocr_md_path, verify_md_path) if not path.exists()]
            if missing_files:
                with log_container:
                    st.warning(f"⚠️ 第 {page_num} 页:文件不存在,跳过")
                # Record the skipped page so the report and the result table
                # account for every page counted in failed_pages.
                record_failure(page_num, f"文件不存在: {', '.join(missing_files)}")
                continue

            comparison_result_path = pre_validation_dir / f"{ocr_md_path.stem}_cross_validation"

            # Capture what the comparison prints to stdout instead of letting
            # it reach the real stdout; the captured text is not used.
            output_buffer = io.StringIO()
            with contextlib.redirect_stdout(output_buffer):
                comparison_result = compare_ocr_results(
                    file1_path=str(ocr_md_path),
                    file2_path=str(verify_md_path),
                    output_file=str(comparison_result_path),
                    output_format='both',
                    ignore_images=True,
                    table_mode=table_mode,
                    similarity_algorithm=similarity_algorithm
                )

            # Extract the statistics of this page.
            stats = comparison_result['statistics']

            page_result = {
                'page_num': page_num,
                'ocr_file': ocr_md_path.name,
                'verify_file': verify_md_path.name,
            }
            for page_key, _, required in stat_fields:
                page_result[page_key] = stats[page_key] if required else stats.get(page_key, 0)
            page_result['status'] = 'success'
            page_result['comparison_json'] = f"{comparison_result_path}.json"
            page_result['comparison_md'] = f"{comparison_result_path}.md"

            batch_results['pages'].append(page_result)
            summary['successful_pages'] += 1
            for page_key, summary_key, _ in stat_fields:
                summary[summary_key] += page_result[page_key]

            # Log the outcome of this page.
            with log_container:
                if stats['total_differences'] == 0:
                    st.success(f"✅ 第 {page_num} 页:完全匹配")
                else:
                    st.warning(f"⚠️ 第 {page_num} 页:发现 {stats['total_differences']} 个差异")

        except Exception as e:
            # One failing page must not abort the batch: log it, record it,
            # and continue with the next page.
            with log_container:
                st.error(f"❌ 第 {page_num} 页:对比失败 - {str(e)}")
            record_failure(page_num, str(e))

    # Save the batch result.
    batch_result_path = pre_validation_dir / f"{self.current_source_config['name']}_{self.current_source_config['ocr_tool']}_vs_{self.verify_source_config['ocr_tool']}_batch_cross_validation"

    # JSON file.
    with open(f"{batch_result_path}.json", "w", encoding="utf-8") as f:
        json.dump(batch_results, f, ensure_ascii=False, indent=2)

    # Markdown report.
    self._generate_batch_validation_markdown(batch_results, f"{batch_result_path}.md")

    # Keep the result for later display.
    st.session_state.cross_validation_batch_result = batch_results

    # Completion feedback.
    progress_bar.progress(1.0)
    status_text.text("✅ 批量验证完成!")

    st.success(f"🎉 批量验证完成!成功: {summary['successful_pages']}, 失败: {summary['failed_pages']}")
|
|
|
|
|
|
- @st.dialog("查看交叉验证结果", width="large", dismissible=True, on_dismiss="rerun")
|
|
|
- def show_cross_validation_results_dialog(self):
|
|
|
- """显示交叉验证结果的对话框"""
|
|
|
- current_md_path = Path(self.file_paths[self.selected_file_index]).with_suffix('.md')
|
|
|
- pre_validation_dir = Path(self.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
|
|
|
- comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json"
|
|
|
def _display_batch_validation_results(self, batch_results: dict):
    """Render a batch cross-validation result with Streamlit widgets.

    Shows the headline metrics, the difference-type breakdown, the table
    structure breakdown, a per-page table, and download buttons for an Excel
    and a JSON export.

    Args:
        batch_results: Batch result dictionary; reads ``summary`` and
            ``pages``. Successful pages need ``total_differences``,
            ``table_differences`` and ``paragraph_differences``; other
            counters default to 0 when absent.
    """
    st.header("📊 批量验证结果")

    # Summary metrics.
    summary = batch_results['summary']
    total_pages = summary['total_pages']
    # Guard the ratio so a result with no pages does not divide by zero.
    success_rate = summary['successful_pages'] / total_pages * 100 if total_pages else 0.0

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("总页数", total_pages)
    with col2:
        st.metric("成功页数", summary['successful_pages'],
                  delta=f"{success_rate:.1f}%")
    with col3:
        st.metric("失败页数", summary['failed_pages'],
                  delta=f"-{summary['failed_pages']}" if summary['failed_pages'] > 0 else "0")
    with col4:
        st.metric("总差异数", summary['total_differences'])

    # Breakdown by difference type.
    st.subheader("📈 差异类型统计")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("表格差异", summary['total_table_differences'])
        st.caption(f"金额: {summary.get('total_amount_differences', 0)} | 日期: {summary.get('total_datetime_differences', 0)} | 文本: {summary.get('total_text_differences', 0)}")
    with col2:
        st.metric("段落差异", summary['total_paragraph_differences'])
    with col3:
        st.metric("严重度", f"高:{summary.get('total_high_severity', 0)} 中:{summary.get('total_medium_severity', 0)} 低:{summary.get('total_low_severity', 0)}")

    # Table structure differences.
    with st.expander("📋 表格结构差异详情", expanded=False):
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("表头前", summary.get('total_table_pre_header', 0))
        with col2:
            st.metric("表头位置", summary.get('total_table_header_position', 0))
        with col3:
            st.metric("表头错误", summary.get('total_table_header_critical', 0))
        with col4:
            st.metric("行缺失", summary.get('total_table_row_missing', 0))

    # Per-page results.
    st.subheader("📄 各页详细结果")

    # (column label, page key, required on successful pages).
    count_columns = (
        ('总差异', 'total_differences', True),
        ('表格差异', 'table_differences', True),
        ('金额', 'amount_differences', False),
        ('日期', 'datetime_differences', False),
        ('文本', 'text_differences', False),
        ('段落', 'paragraph_differences', True),
        ('表头前', 'table_pre_header', False),
        ('表头位置', 'table_header_position', False),
        ('表头错误', 'table_header_critical', False),
        ('行缺失', 'table_row_missing', False),
        ('高', 'high_severity', False),
        ('中', 'medium_severity', False),
        ('低', 'low_severity', False),
    )
    count_labels = [label for label, _, _ in count_columns]

    page_data = []
    for page in batch_results['pages']:
        succeeded = page['status'] == 'success'
        if succeeded:
            status_label = '✅ 成功' if page['total_differences'] == 0 else '⚠️ 有差异'
        else:
            status_label = '❌ 失败'
        row = {'页码': page['page_num'], '状态': status_label}
        for label, key, required in count_columns:
            if succeeded:
                row[label] = page[key] if required else page.get(key, 0)
            else:
                # Failed pages have no counts. A missing value (not a '-'
                # string) keeps the column numeric, which the NumberColumn
                # configuration below expects.
                row[label] = None
        page_data.append(row)

    # Explicit columns keep the frame well-formed when there are no pages.
    df_pages = pd.DataFrame(page_data, columns=['页码', '状态'] + count_labels)
    # Nullable integers: counts stay integers, failed rows stay empty.
    # NOTE(review): assumes every count is integer-valued — confirm against
    # the statistics produced by the comparison.
    df_pages[count_labels] = df_pages[count_labels].astype('Int64')

    # Result table.
    st.dataframe(
        df_pages,
        use_container_width=True,
        hide_index=True,
        column_config={
            "页码": st.column_config.NumberColumn("页码", width="small"),
            "状态": st.column_config.TextColumn("状态", width="small"),
            "总差异": st.column_config.NumberColumn("总差异", width="small"),
            "表格差异": st.column_config.NumberColumn("表格", width="small"),
            "金额": st.column_config.NumberColumn("金额", width="small"),
            "日期": st.column_config.NumberColumn("日期", width="small"),
            "文本": st.column_config.NumberColumn("文本", width="small"),
            "段落": st.column_config.NumberColumn("段落", width="small"),
        }
    )

    # Export options.
    st.subheader("📥 导出报告")

    # One timestamp so both export files carry the same suffix.
    export_stamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')

    col1, col2 = st.columns(2)

    with col1:
        # Excel export of the per-page table.
        excel_buffer = BytesIO()
        df_pages.to_excel(excel_buffer, index=False, sheet_name='验证结果')

        st.download_button(
            label="📊 下载Excel报告",
            data=excel_buffer.getvalue(),
            file_name=f"batch_validation_{export_stamp}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    with col2:
        # JSON export of the complete batch result.
        json_data = json.dumps(batch_results, ensure_ascii=False, indent=2)

        st.download_button(
            label="📄 下载JSON报告",
            data=json_data,
            file_name=f"batch_validation_{export_stamp}.json",
            mime="application/json"
        )
|
|
|
+
|
|
|
@st.dialog("查看交叉验证结果", width="large", dismissible=True, on_dismiss="rerun")
def show_batch_cross_validation_results_dialog(self):
    """Show the stored batch cross-validation result in a dialog.

    Renders the result kept in ``st.session_state.cross_validation_batch_result``;
    when none is stored (or it is empty) an informational hint is shown instead.
    """
    stored_result = st.session_state.get('cross_validation_batch_result')
    if not stored_result:
        st.info("暂无交叉验证结果,请先运行交叉验证")
        return

    self._display_batch_validation_results(stored_result)
|
|
|
|
|
|
@@ -677,7 +1000,7 @@ class StreamlitOCRValidator:
|
|
|
with col2:
|
|
|
st.metric("表格差异", stats['table_differences'])
|
|
|
with col3:
|
|
|
- st.metric("金额差异", stats['amount_differences'])
|
|
|
+ st.metric("其中表格金额差异", stats['amount_differences'])
|
|
|
with col4:
|
|
|
st.metric("段落差异", stats['paragraph_differences'])
|
|
|
|
|
|
@@ -933,7 +1256,7 @@ class StreamlitOCRValidator:
|
|
|
with col2:
|
|
|
# 导出统计报告
|
|
|
stats_data = {
|
|
|
- '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'],
|
|
|
+ '统计项目': ['总差异数', '表格差异', '其中表格金额差异', '段落差异'],
|
|
|
'数量': [
|
|
|
comparison_result['statistics']['total_differences'],
|
|
|
comparison_result['statistics']['table_differences'],
|
|
|
@@ -1089,7 +1412,7 @@ def main():
|
|
|
|
|
|
# 查看预校验结果按钮
|
|
|
if st.button("查看验证结果", type="secondary", icon=":material/quick_reference_all:"):
|
|
|
- validator.show_cross_validation_results_dialog()
|
|
|
+ validator.show_batch_cross_validation_results_dialog()
|
|
|
|
|
|
# 显示当前数据源统计信息
|
|
|
with st.expander("🔧 OCR工具统计信息", expanded=False):
|
|
|
@@ -1144,7 +1467,7 @@ def main():
|
|
|
layout_type = "compact"
|
|
|
validator.layout_manager.render_content_by_mode(original_md_content, "HTML渲染", font_size, height, layout_type)
|
|
|
with col2:
|
|
|
- st.subheader("🤖 VLM识别结果")
|
|
|
+ st.subheader("🤖 验证识别结果")
|
|
|
with open(str(verify_md_path), "r", encoding="utf-8") as f:
|
|
|
verify_md_content = f.read()
|
|
|
font_size = config['styles'].get('font_size', 10)
|