| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
- """
- 分析功能控件
- """
- import streamlit as st
- from typing import Dict, Optional
- import json
def _reset_editing_state() -> None:
    """Clear undo/redo history and the cached table image after a new structure is stored."""
    st.session_state.undo_stack = []
    st.session_state.redo_stack = []

    # Imported lazily, exactly where the original code imported it.
    from .drawing import clear_table_image_cache
    clear_table_image_cache()


def _run_with_feedback(action) -> Optional[Dict]:
    """
    Run ``action`` under a spinner and surface any failure in the UI.

    Args:
        action: zero-argument callable performing the analysis; its return
            value is passed straight through.

    Returns:
        Whatever ``action`` returns, or None if it raised.
    """
    with st.spinner("正在分析表格结构..."):
        try:
            return action()
        except Exception as exc:
            # UI boundary: show a short message plus the full traceback
            # inside a collapsible panel instead of propagating the error.
            st.error(f"❌ 分析失败: {exc}")
            import traceback
            with st.expander("🔍 详细错误"):
                st.code(traceback.format_exc())
    return None


def _mineru_controls(tool: str) -> Optional[Dict]:
    """Render the MinerU sidebar controls and rebuild the structure from the raw JSON on click."""
    st.sidebar.info("📋 MinerU 格式:直接使用 table_cells 生成结构")

    if not st.sidebar.button("🚀 生成表格结构", type="primary"):
        return None

    def rebuild_from_json() -> Optional[Dict]:
        # Locate the catalog entry currently selected in the session.
        catalog = st.session_state.get('current_catalog', [])
        position = st.session_state.get('current_catalog_index', 0)

        if not catalog or position >= len(catalog):
            st.error("❌ 未找到当前文件")
            return None

        selected = catalog[position]

        # Load the raw OCR JSON referenced by the entry.
        with open(selected["json"], "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        # Re-parse the raw data to obtain the table bounding box and structure.
        from .data_processor import get_structure_from_ocr

        table_bbox, structure = get_structure_from_ocr(payload, tool)

        # Persist the result in the session.
        st.session_state.structure = structure
        st.session_state.table_bbox = table_bbox
        _reset_editing_state()

        st.success(
            f"✅ 表格结构生成成功!\n\n"
            f"检测到 {structure['total_rows']} 行,{structure['total_cols']} 列"
        )
        return structure

    return _run_with_feedback(rebuild_from_json)


def _ppstructure_controls(generator) -> Optional[Dict]:
    """Render the tunable clustering sliders and run the generator's analysis on click."""
    row_tolerance = st.sidebar.slider(
        "Y轴聚类容差(行检测)",
        min_value=1,
        max_value=20,
        value=5,
        help="相邻文本框Y坐标差小于此值时合并为同一行"
    )

    col_tolerance = st.sidebar.slider(
        "X轴聚类容差(列检测)",
        min_value=5,
        max_value=30,
        value=10,
        help="相邻文本框X坐标差小于此值时合并为同一列"
    )

    row_height_floor = st.sidebar.slider(
        "最小行高",
        min_value=10,
        max_value=50,
        value=20,
        help="行高小于此值的将被过滤"
    )

    if not st.sidebar.button("🚀 分析表格结构", type="primary"):
        return None

    def analyze() -> Dict:
        structure = generator.analyze_table_structure(
            y_tolerance=row_tolerance,
            x_tolerance=col_tolerance,
            min_row_height=row_height_floor
        )

        st.session_state.structure = structure
        _reset_editing_state()

        st.success(
            f"✅ 分析完成!\n\n"
            f"检测到 {len(structure['rows'])} 行,{len(structure['columns'])} 列"
        )
        return structure

    return _run_with_feedback(analyze)


def create_analysis_section(generator, tool: str = "ppstructv3") -> Optional[Dict]:
    """
    Build the sidebar controls for table structure analysis.

    When ``tool`` is "mineru" (case-insensitive) a single button re-parses the
    raw JSON of the current catalog entry. For any other tool, three sliders
    (Y tolerance, X tolerance, minimum row height) are shown and their values
    are passed to ``generator.analyze_table_structure``.

    On success the structure is stored in ``st.session_state``, the undo/redo
    stacks are emptied and the cached table image is cleared.

    Args:
        generator: TableLineGenerator instance; only used on the non-MinerU path.
        tool: tool type identifier.

    Returns:
        The analysed table structure if the button was clicked and the
        analysis succeeded; otherwise None.
    """
    st.sidebar.subheader("🔍 表格结构分析")

    # The parameters offered depend on the tool type.
    if tool.lower() == "mineru":
        return _mineru_controls(tool)
    return _ppstructure_controls(generator)
|