| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179 |
- #!/usr/bin/env python3
- """
- 表格验证功能快速验证脚本
- 验证升级后的表格详细分析功能
- """
- import json
- import os
- from typing import Dict, Any
- from ocr_verification import verify_ocr_with_vlm, analyze_differences
def quick_table_verification_demo(
    ocr_file: str = "demo_54fa7ad0_page_1.json",
    image_file: str = "至远彩色印刷工业有限公司-2022年母公司_2.png",
) -> None:
    """Run the table-verification demo once per precision preset.

    Confirms that both input files exist, then calls ``verify_ocr_with_vlm``
    with three parameter presets (high precision, balanced, fast). Each run
    is given its own output path ``verification_result_mode_<i>.json`` and a
    short summary of the returned result is printed. Afterwards the result
    files that exist on disk are listed and the first (high-precision) one
    is handed to ``analyze_differences``.

    Args:
        ocr_file: Path of the OCR result file passed to the verifier.
        image_file: Path of the image file passed to the verifier.

    Returns:
        None. Everything is reported on stdout; the function returns early
        when either input file is missing.
    """
    print("🚀 表格详细验证功能演示")
    print("=" * 50)

    # Bail out early when either input is missing.
    if not os.path.exists(ocr_file):
        print(f"❌ 找不到OCR文件: {ocr_file}")
        return

    if not os.path.exists(image_file):
        print(f"❌ 找不到图片文件: {image_file}")
        return

    print("✅ 文件检查完成")
    print(f"📄 OCR文件: {ocr_file}")
    print(f"🖼️ 图片文件: {image_file}")
    print()

    # Presets ordered from most to least precise; the first entry is the one
    # analysed in detail at the end of the demo.
    verification_modes = [
        {
            "name": "🎯 高精度模式(推荐用于财务报表)",
            "config": {
                "temperature": 0.05,
                "max_tokens": 8192,
                "timeout": 400,
            },
        },
        {
            "name": "⚖️ 平衡模式(一般表格)",
            "config": {
                "temperature": 0.15,
                "max_tokens": 4096,
                "timeout": 240,
            },
        },
        {
            "name": "⚡ 快速模式(初步扫描)",
            "config": {
                "temperature": 0.25,
                "max_tokens": 2048,
                "timeout": 120,
            },
        },
    ]

    # Derive the count and output paths from the preset list so the progress
    # label and the final file listing stay correct if presets change.
    total = len(verification_modes)
    output_files = [
        f"verification_result_mode_{i}.json" for i in range(1, total + 1)
    ]

    for i, (mode, output_file) in enumerate(
        zip(verification_modes, output_files), 1
    ):
        config = mode["config"]
        print(f"\n📊 测试 {i}/{total}: {mode['name']}")
        print("-" * 40)

        try:
            print("🔍 开始验证...")
            print(f" 温度参数: {config['temperature']}")
            print(f" 最大Token: {config['max_tokens']}")
            print(f" 超时时间: {config['timeout']}秒")

            result = verify_ocr_with_vlm(
                image_file,
                ocr_file,
                output_file,
                **config,
            )

            if result:
                print(f"✅ 验证完成,结果保存到: {output_file}")
                if isinstance(result, dict):
                    _print_key_metrics(result)
            else:
                print("❌ 验证失败")
        except Exception as e:
            # Demo boundary: report the failure and move on to the next
            # preset instead of aborting the whole run.
            print(f"❌ 验证过程出错: {e}")

    print("\n🎯 演示完成!")
    print("📁 查看生成的结果文件:")
    for result_file in output_files:
        if os.path.exists(result_file):
            print(f" 📄 {result_file}")

    print("\n💡 使用建议:")
    print(" 🎯 财务报表 → 使用高精度模式")
    print(" ⚖️ 一般表格 → 使用平衡模式")
    print(" ⚡ 快速检查 → 使用快速模式")

    # Analyse the high-precision result (the first preset), if present.
    high_precision_file = output_files[0]
    if os.path.exists(high_precision_file):
        print("\n🔍 分析高精度模式结果:")
        try:
            analyze_differences(high_precision_file)
        except Exception as e:
            print(f"❌ 分析出错: {e}")


def _print_key_metrics(result: Dict[str, Any]) -> None:
    """Print table statistics and issue counts from one verification result."""
    print("📈 关键指标:")

    # Table-level statistics are optional; skip them when absent or when the
    # value is not a mapping.
    table_info = result.get('table_verification')
    if isinstance(table_info, dict) and table_info:
        print(f" 检查项目: {table_info.get('total_items_checked', 'N/A')}")
        print(f" 准确率: {table_info.get('accuracy_rate', 'N/A')}")
        print(f" 结构正确: {'✅' if table_info.get('table_structure_correct') else '❌'}")

    # `or []` also covers keys that are present but set to None, which a
    # plain .get(key, []) default would pass straight to len().
    errors = result.get('errors') or []
    format_issues = result.get('format_issues') or []
    missing_items = result.get('missing_items') or []

    print(f" 识别错误: {len(errors)} 项")
    print(f" 格式问题: {len(format_issues)} 项")
    print(f" 遗漏项目: {len(missing_items)} 项")

    # Only mapping entries can carry a severity; ignore anything else rather
    # than failing the whole summary.
    critical_errors = [
        error for error in errors
        if isinstance(error, dict) and error.get('severity') == '高'
    ]
    if critical_errors:
        print(f" 🔴 高严重程度错误: {len(critical_errors)} 项")
def show_table_analysis_capabilities():
    """Print the list of table-analysis features and the supported table types."""
    feature_lines = (
        "🔍 逐项验证: 对表格中每个数据项进行单独验证",
        "📊 格式检查: 检测千分符、小数点、标点符号错误",
        "🏗️ 结构验证: 验证表格行列对应关系",
        "📈 完整性检查: 确保重要项目和数据不遗漏",
        "🎯 错误分级: 高/中/低严重程度分类",
        "📍 精确定位: 提供行X列Y的具体位置信息",
        "💡 修正建议: 给出具体的错误修正方案",
        "📊 统计报告: 准确率、错误分布等指标",
        "🔧 参数优化: 支持多种精度模式配置",
    )
    supported_tables = (
        "💰 财务报表 (利润表、资产负债表、现金流量表)",
        "📊 数据统计表 (业务数据、运营指标)",
        "📋 清单表格 (物料清单、人员名单)",
        "📈 对比分析表 (同比、环比数据)",
        "🗓️ 时间序列表 (月度、季度、年度数据)",
    )

    # Section 1: feature overview, one entry per line under a ruled heading.
    print("\n📋 新增表格分析能力:")
    print("=" * 50)
    print(*(" " + entry for entry in feature_lines), sep="\n")

    # Section 2: table types the analysis is advertised for.
    print("\n📄 支持的表格类型:")
    print(*(" " + entry for entry in supported_tables), sep="\n")
if __name__ == "__main__":
    # Script entry point: print the feature overview first, then run the
    # verification demo. The order of the steps is significant.
    for step in (show_table_analysis_capabilities, quick_table_verification_demo):
        step()
|