analysis_controls.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. """
  2. 分析功能控件
  3. """
  4. import streamlit as st
  5. from typing import Dict, Optional
  6. import json
  7. def create_analysis_section(generator, tool: str = "ppstructv3") -> Optional[Dict]:
  8. """
  9. 创建分析控件
  10. Args:
  11. generator: TableLineGenerator 实例
  12. tool: 工具类型
  13. Returns:
  14. 分析后的表格结构(如果点击了分析按钮)
  15. """
  16. st.sidebar.subheader("🔍 表格结构分析")
  17. # 🔑 根据工具类型显示不同的参数
  18. if tool.lower() == "mineru":
  19. st.sidebar.info("📋 MinerU 格式:直接使用 table_cells 生成结构")
  20. if st.sidebar.button("🚀 生成表格结构", type="primary"):
  21. with st.spinner("正在分析表格结构..."):
  22. try:
  23. # 🔑 MinerU 格式:从原始 JSON 重新解析
  24. current_catalog = st.session_state.get('current_catalog', [])
  25. current_index = st.session_state.get('current_catalog_index', 0)
  26. if not current_catalog or current_index >= len(current_catalog):
  27. st.error("❌ 未找到当前文件")
  28. return None
  29. entry = current_catalog[current_index]
  30. # 加载原始 JSON
  31. with open(entry["json"], "r", encoding="utf-8") as fp:
  32. raw = json.load(fp)
  33. # 重新解析获取完整结构
  34. from .data_processor import get_structure_from_ocr
  35. table_bbox, structure = get_structure_from_ocr(raw, tool)
  36. # 保存到 session_state
  37. st.session_state.structure = structure
  38. st.session_state.table_bbox = table_bbox
  39. st.session_state.undo_stack = []
  40. st.session_state.redo_stack = []
  41. # 清除缓存的图片
  42. from .drawing import clear_table_image_cache
  43. clear_table_image_cache()
  44. st.success(
  45. f"✅ 表格结构生成成功!\n\n"
  46. f"检测到 {structure['total_rows']} 行,{structure['total_cols']} 列"
  47. )
  48. return structure
  49. except Exception as e:
  50. st.error(f"❌ 分析失败: {e}")
  51. import traceback
  52. with st.expander("🔍 详细错误"):
  53. st.code(traceback.format_exc())
  54. else:
  55. # 🔑 PPStructure V3 格式:使用参数调整
  56. y_tolerance = st.sidebar.slider(
  57. "Y轴聚类容差(行检测)",
  58. min_value=1,
  59. max_value=20,
  60. value=5,
  61. help="相邻文本框Y坐标差小于此值时合并为同一行"
  62. )
  63. x_tolerance = st.sidebar.slider(
  64. "X轴聚类容差(列检测)",
  65. min_value=5,
  66. max_value=30,
  67. value=10,
  68. help="相邻文本框X坐标差小于此值时合并为同一列"
  69. )
  70. min_row_height = st.sidebar.slider(
  71. "最小行高",
  72. min_value=10,
  73. max_value=50,
  74. value=20,
  75. help="行高小于此值的将被过滤"
  76. )
  77. if st.sidebar.button("🚀 分析表格结构", type="primary"):
  78. with st.spinner("正在分析表格结构..."):
  79. try:
  80. structure = generator.analyze_table_structure(
  81. y_tolerance=y_tolerance,
  82. x_tolerance=x_tolerance,
  83. min_row_height=min_row_height
  84. )
  85. st.session_state.structure = structure
  86. st.session_state.undo_stack = []
  87. st.session_state.redo_stack = []
  88. # 清除缓存的图片
  89. from .drawing import clear_table_image_cache
  90. clear_table_image_cache()
  91. st.success(
  92. f"✅ 分析完成!\n\n"
  93. f"检测到 {len(structure['rows'])} 行,{len(structure['columns'])} 列"
  94. )
  95. return structure
  96. except Exception as e:
  97. st.error(f"❌ 分析失败: {e}")
  98. import traceback
  99. with st.expander("🔍 详细错误"):
  100. st.code(traceback.format_exc())
  101. return None