|
|
@@ -1,60 +1,125 @@
|
|
|
"""
|
|
|
-表格结构分析控件
|
|
|
+分析功能控件
|
|
|
"""
|
|
|
import streamlit as st
|
|
|
-from .drawing import clear_table_image_cache
|
|
|
+from typing import Dict, Optional
|
|
|
+import json
|
|
|
|
|
|
|
|
|
-def create_analysis_section(y_tolerance: int, x_tolerance: int, min_row_height: int):
|
|
|
+def create_analysis_section(generator, tool: str = "ppstructv3") -> Optional[Dict]:
|
|
|
"""
|
|
|
- 创建分析区域
|
|
|
+ 创建分析控件
|
|
|
|
|
|
Args:
|
|
|
- y_tolerance: Y轴聚类容差
|
|
|
- x_tolerance: X轴聚类容差
|
|
|
- min_row_height: 最小行高
|
|
|
+ generator: TableLineGenerator 实例
|
|
|
+ tool: 工具类型
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 分析后的表格结构(如果点击了分析按钮)
|
|
|
"""
|
|
|
- if st.button("🔍 分析表格结构"):
|
|
|
- with st.spinner("分析中..."):
|
|
|
- try:
|
|
|
- generator = st.session_state.generator
|
|
|
- structure = generator.analyze_table_structure(
|
|
|
- y_tolerance=y_tolerance,
|
|
|
- x_tolerance=x_tolerance,
|
|
|
- min_row_height=min_row_height
|
|
|
- )
|
|
|
-
|
|
|
- if not structure:
|
|
|
- st.warning("⚠️ 未检测到表格结构")
|
|
|
- st.stop()
|
|
|
-
|
|
|
- structure['modified_h_lines'] = set()
|
|
|
- structure['modified_v_lines'] = set()
|
|
|
-
|
|
|
- st.session_state.structure = structure
|
|
|
- st.session_state.undo_stack = []
|
|
|
- st.session_state.redo_stack = []
|
|
|
- clear_table_image_cache()
|
|
|
-
|
|
|
- st.success(
|
|
|
- f"✅ 检测到 {len(structure['rows'])} 行"
|
|
|
- f"({len(structure['horizontal_lines'])} 条横线),"
|
|
|
- f"{len(structure['columns'])} 列"
|
|
|
- f"({len(structure['vertical_lines'])} 条竖线)"
|
|
|
- )
|
|
|
-
|
|
|
- col1, col2, col3, col4 = st.columns(4)
|
|
|
- with col1:
|
|
|
- st.metric("行数", len(structure['rows']))
|
|
|
- with col2:
|
|
|
- st.metric("横线数", len(structure['horizontal_lines']))
|
|
|
- with col3:
|
|
|
- st.metric("列数", len(structure['columns']))
|
|
|
- with col4:
|
|
|
- st.metric("竖线数", len(structure['vertical_lines']))
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- st.error(f"❌ 分析失败: {e}")
|
|
|
- import traceback
|
|
|
- st.code(traceback.format_exc())
|
|
|
- st.stop()
|
|
|
+ st.sidebar.subheader("🔍 表格结构分析")
|
|
|
+
|
|
|
+ # 🔑 根据工具类型显示不同的参数
|
|
|
+ if tool.lower() == "mineru":
|
|
|
+ st.sidebar.info("📋 MinerU 格式:直接使用 table_cells 生成结构")
|
|
|
+
|
|
|
+ if st.sidebar.button("🚀 生成表格结构", type="primary"):
|
|
|
+ with st.spinner("正在分析表格结构..."):
|
|
|
+ try:
|
|
|
+ # 🔑 MinerU 格式:从原始 JSON 重新解析
|
|
|
+ current_catalog = st.session_state.get('current_catalog', [])
|
|
|
+ current_index = st.session_state.get('current_catalog_index', 0)
|
|
|
+
|
|
|
+ if not current_catalog or current_index >= len(current_catalog):
|
|
|
+ st.error("❌ 未找到当前文件")
|
|
|
+ return None
|
|
|
+
|
|
|
+ entry = current_catalog[current_index]
|
|
|
+
|
|
|
+ # 加载原始 JSON
|
|
|
+ with open(entry["json"], "r", encoding="utf-8") as fp:
|
|
|
+ raw = json.load(fp)
|
|
|
+
|
|
|
+ # 重新解析获取完整结构
|
|
|
+ from .data_processor import get_structure_from_ocr
|
|
|
+
|
|
|
+ table_bbox, structure = get_structure_from_ocr(raw, tool)
|
|
|
+
|
|
|
+ # 保存到 session_state
|
|
|
+ st.session_state.structure = structure
|
|
|
+ st.session_state.table_bbox = table_bbox
|
|
|
+ st.session_state.undo_stack = []
|
|
|
+ st.session_state.redo_stack = []
|
|
|
+
|
|
|
+ # 清除缓存的图片
|
|
|
+ from .drawing import clear_table_image_cache
|
|
|
+ clear_table_image_cache()
|
|
|
+
|
|
|
+ st.success(
|
|
|
+ f"✅ 表格结构生成成功!\n\n"
|
|
|
+ f"检测到 {structure['total_rows']} 行,{structure['total_cols']} 列"
|
|
|
+ )
|
|
|
+ return structure
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ st.error(f"❌ 分析失败: {e}")
|
|
|
+ import traceback
|
|
|
+ with st.expander("🔍 详细错误"):
|
|
|
+ st.code(traceback.format_exc())
|
|
|
+
|
|
|
+ else:
|
|
|
+ # 🔑 PPStructure V3 格式:使用参数调整
|
|
|
+ y_tolerance = st.sidebar.slider(
|
|
|
+ "Y轴聚类容差(行检测)",
|
|
|
+ min_value=1,
|
|
|
+ max_value=20,
|
|
|
+ value=5,
|
|
|
+ help="相邻文本框Y坐标差小于此值时合并为同一行"
|
|
|
+ )
|
|
|
+
|
|
|
+ x_tolerance = st.sidebar.slider(
|
|
|
+ "X轴聚类容差(列检测)",
|
|
|
+ min_value=5,
|
|
|
+ max_value=30,
|
|
|
+ value=10,
|
|
|
+ help="相邻文本框X坐标差小于此值时合并为同一列"
|
|
|
+ )
|
|
|
+
|
|
|
+ min_row_height = st.sidebar.slider(
|
|
|
+ "最小行高",
|
|
|
+ min_value=10,
|
|
|
+ max_value=50,
|
|
|
+ value=20,
|
|
|
+ help="行高小于此值的将被过滤"
|
|
|
+ )
|
|
|
+
|
|
|
+ if st.sidebar.button("🚀 分析表格结构", type="primary"):
|
|
|
+ with st.spinner("正在分析表格结构..."):
|
|
|
+ try:
|
|
|
+ structure = generator.analyze_table_structure(
|
|
|
+ y_tolerance=y_tolerance,
|
|
|
+ x_tolerance=x_tolerance,
|
|
|
+ min_row_height=min_row_height
|
|
|
+ )
|
|
|
+
|
|
|
+ st.session_state.structure = structure
|
|
|
+ st.session_state.undo_stack = []
|
|
|
+ st.session_state.redo_stack = []
|
|
|
+
|
|
|
+ # 清除缓存的图片
|
|
|
+ from .drawing import clear_table_image_cache
|
|
|
+ clear_table_image_cache()
|
|
|
+
|
|
|
+ st.success(
|
|
|
+ f"✅ 分析完成!\n\n"
|
|
|
+ f"检测到 {len(structure['rows'])} 行,{len(structure['columns'])} 列"
|
|
|
+ )
|
|
|
+ return structure
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ st.error(f"❌ 分析失败: {e}")
|
|
|
+ import traceback
|
|
|
+ with st.expander("🔍 详细错误"):
|
|
|
+ st.code(traceback.format_exc())
|
|
|
+
|
|
|
+ return None
|