Browse Source

增强表格数据分析功能,支持数据过滤、排序、导出及统计分析,优化用户交互体验

zhch158_admin 2 tháng trước
mục cha
commit
faa33c7919
1 tập tin đã thay đổi với 315 bổ sung và 138 xóa
  1. 315 138
      streamlit_ocr_validator.py

+ 315 - 138
streamlit_ocr_validator.py

@@ -369,20 +369,130 @@ class StreamlitOCRValidator:
         
         return wrapped_content
     
-    def display_html_table_as_dataframe(self, html_content: str):
-        """将HTML表格解析为DataFrame显示"""
+    def display_html_table_as_dataframe(self, html_content: str, enable_editing: bool = False,
+                                        key_prefix: str = ""):
+        """Render each HTML table in ``html_content`` as an interactive DataFrame.
+
+        Each table gets info/statistics panels, a literal-substring filter, a sort
+        control and CSV/Excel export. On any exception the error is shown and the
+        raw HTML is rendered instead.
+
+        Args:
+            html_content: Text containing the HTML ``<table>`` markup to parse.
+            enable_editing: Use ``st.data_editor`` instead of ``st.dataframe``.
+            key_prefix: Prefix for every widget key; pass a distinct value per
+                call site, since keys otherwise depend only on the table index.
+        """
         import pandas as pd
-        from io import StringIO
+        from io import StringIO, BytesIO
         
         try:
-            # 使用pandas直接读取HTML表格
+            # Let pandas parse the HTML tables directly
             tables = pd.read_html(StringIO(html_content))
             if tables:
                 for i, table in enumerate(tables):
-                    st.subheader(f"表格 {i+1}")
-                    st.dataframe(table, use_container_width=True)
+                    st.subheader(f"📊 表格 {i+1}")
+                    
+                    # Per-table feature toggles
+                    col1, col2, col3, col4 = st.columns(4)
+                    with col1:
+                        show_info = st.checkbox(f"显示表格信息", key=f"{key_prefix}info_{i}")
+                    with col2:
+                        show_stats = st.checkbox(f"显示统计信息", key=f"{key_prefix}stats_{i}")
+                    with col3:
+                        enable_filter = st.checkbox(f"启用过滤", key=f"{key_prefix}filter_{i}")
+                    with col4:
+                        enable_sort = st.checkbox(f"启用排序", key=f"{key_prefix}sort_{i}")
+                    
+                    # Filtering
+                    filtered_table = table.copy()
+                    if enable_filter and not table.empty:
+                        filter_col = st.selectbox(
+                            f"选择过滤列 (表格 {i+1})", options=['无'] + list(table.columns),
+                            key=f"{key_prefix}filter_col_{i}")
+                        if filter_col != '无':
+                            filter_value = st.text_input(
+                                f"过滤值 (表格 {i+1})", key=f"{key_prefix}filter_value_{i}")
+                            if filter_value:
+                                # regex=False: match the input literally so text like "(" cannot raise re.error
+                                mask = table[filter_col].astype(str).str.contains(filter_value, na=False, regex=False)
+                                filtered_table = table[mask]
+                    
+                    # Sorting
+                    if enable_sort and not filtered_table.empty:
+                        sort_col = st.selectbox(
+                            f"选择排序列 (表格 {i+1})", options=['无'] + list(filtered_table.columns),
+                            key=f"{key_prefix}sort_col_{i}")
+                        if sort_col != '无':
+                            sort_order = st.radio(
+                                f"排序方式 (表格 {i+1})",
+                                options=['升序', '降序'],
+                                horizontal=True,
+                                key=f"{key_prefix}sort_order_{i}"
+                            )
+                            ascending = (sort_order == '升序')
+                            filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
+                    
+                    # Display the table
+                    if enable_editing:
+                        # Editable grid
+                        edited_table = st.data_editor(
+                            filtered_table, use_container_width=True, key=f"{key_prefix}editor_{i}")
+                        # Tell the user when the grid differs from its input
+                        if not edited_table.equals(filtered_table):
+                            st.success("✏️ 表格已编辑,可以导出修改后的数据")
+                        # Info, statistics and export below must use the edited data
+                        filtered_table = edited_table
+                    else:
+                        # Read-only grid
+                        st.dataframe(filtered_table, use_container_width=True)
+                    
+                    # Table info; labels are stringified because they are not always str
+                    if show_info:
+                        st.write(f"**表格信息:**")
+                        st.write(f"- 原始行数: {len(table)}")
+                        st.write(f"- 过滤后行数: {len(filtered_table)}")
+                        st.write(f"- 列数: {len(table.columns)}")
+                        st.write(f"- 列名: {', '.join(map(str, table.columns))}")
+                    
+                    # Statistics; 'number' is the string alias for all numeric dtypes
+                    if show_stats:
+                        st.write(f"**统计信息:**")
+                        numeric_cols = filtered_table.select_dtypes(include='number').columns
+                        if len(numeric_cols) > 0:
+                            st.dataframe(filtered_table[numeric_cols].describe())
+                        else:
+                            st.info("表格中没有数值列")
+                    
+                    # Export
+                    if st.button(f"📥 导出表格 {i+1}", key=f"{key_prefix}export_{i}"):
+                        # Build the CSV payload
+                        csv_data = filtered_table.to_csv(index=False)
+                        st.download_button(
+                            label=f"下载CSV (表格 {i+1})",
+                            data=csv_data,
+                            file_name=f"table_{i+1}.csv",
+                            mime="text/csv",
+                            key=f"{key_prefix}download_csv_{i}"
+                        )
+                        
+                        # Build the Excel payload in memory
+                        excel_buffer = BytesIO()
+                        filtered_table.to_excel(excel_buffer, index=False)
+                        st.download_button(
+                            label=f"下载Excel (表格 {i+1})",
+                            data=excel_buffer.getvalue(),
+                            file_name=f"table_{i+1}.xlsx",
+                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                            key=f"{key_prefix}download_excel_{i}"
+                        )
+                    
+                    st.markdown("---")
+            else:
+                st.warning("未找到可解析的表格")
         except Exception as e:
             st.error(f"表格解析失败: {e}")
-            # 回退到HTML渲染
+            st.info("尝试使用HTML渲染模式查看表格")
+            # Fall back to HTML rendering
             st.markdown(html_content, unsafe_allow_html=True)
 
@@ -486,154 +596,221 @@ def main():
         st.error(f"❌ 统计信息计算失败: {e}")
         return
     
-    # 主要布局 - 左右分栏
-    left_col, right_col = st.columns([1, 1])
+    # 创建标签页
+    tab1, tab2, tab3 = st.tabs(["📄 文本校验", "📊 表格分析", "📈 数据统计"])
     
-    # 左侧 - OCR文本内容
-    with left_col:
-        st.header("📄 OCR识别内容")
-        
-        # 文本选择器
-        if st.session_state.validator.text_bbox_mapping:
-            text_options = ["请选择文本..."] + list(st.session_state.validator.text_bbox_mapping.keys())
-            selected_index = st.selectbox(
-                "选择要校验的文本",
-                range(len(text_options)),
-                format_func=lambda x: text_options[x],
-                key="text_selector"
-            )
-            
-            if selected_index > 0:
-                st.session_state.selected_text = text_options[selected_index]
-        else:
-            st.warning("没有找到可点击的文本")
+    with tab1:
+        # 原有的左右分栏内容
+        left_col, right_col = st.columns([1, 1])
         
-        # 显示MD内容(可搜索和过滤)
-        if st.session_state.validator.md_content:
-            search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")
+        # 左侧 - OCR文本内容
+        with left_col:
+            st.header("📄 OCR识别内容")
             
-            display_content = st.session_state.validator.md_content
-            if search_term:
-                lines = display_content.split('\n')
-                filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
-                display_content = '\n'.join(filtered_lines)
-                if filtered_lines:
-                    st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
-                else:
-                    st.warning(f"未找到包含 '{search_term}' 的内容")
-            
-            # 渲染方式选择
-            render_mode = st.radio(
-                "选择渲染方式",
-                ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],  # 添加DataFrame选项
-                horizontal=True
-            )
-
-            if render_mode == "HTML渲染":
-                # 使用unsafe_allow_html=True来渲染HTML表格
-                st.markdown(display_content, unsafe_allow_html=True)
-            elif render_mode == "Markdown渲染":
-                # 转换HTML表格为Markdown格式
-                converted_content = st.session_state.validator.convert_html_table_to_markdown(display_content)
-                st.markdown(converted_content)
-            elif render_mode == "DataFrame表格":
-                # 新增:使用DataFrame显示表格
-                if '<table>' in display_content.lower():
-                    st.session_state.validator.display_html_table_as_dataframe(display_content)
-                else:
-                    st.info("当前内容中没有检测到HTML表格")
-                    st.markdown(display_content)
-            else:
-                # 原始文本显示
-                st.text_area(
-                    "MD内容预览",
-                    display_content,
-                    height=300,
-                    help="OCR识别的文本内容"
+            # 文本选择器
+            if st.session_state.validator.text_bbox_mapping:
+                text_options = ["请选择文本..."] + list(st.session_state.validator.text_bbox_mapping.keys())
+                selected_index = st.selectbox(
+                    "选择要校验的文本",
+                    range(len(text_options)),
+                    format_func=lambda x: text_options[x],
+                    key="text_selector"
                 )
-        
-        # 可点击文本列表
-        st.subheader("🎯 可点击文本列表")
-        
-        if st.session_state.validator.text_bbox_mapping:
-            for text, info_list in st.session_state.validator.text_bbox_mapping.items():
-                info = info_list[0]  # 使用第一个bbox信息
-                
-                # 确定显示样式
-                is_selected = (text == st.session_state.selected_text)
-                is_error = (text in st.session_state.marked_errors)
                 
-                # 创建按钮行
-                button_col, error_col = st.columns([4, 1])
+                if selected_index > 0:
+                    st.session_state.selected_text = text_options[selected_index]
+            else:
+                st.warning("没有找到可点击的文本")
+            
+            # 显示MD内容(可搜索和过滤)
+            if st.session_state.validator.md_content:
+                search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")
                 
-                with button_col:
-                    button_type = "primary" if is_selected else "secondary"
-                    if st.button(f"📍 {text}", key=f"btn_{text}", type=button_type):
-                        st.session_state.selected_text = text
-                        st.rerun()
+                display_content = st.session_state.validator.md_content
+                if search_term:
+                    lines = display_content.split('\n')
+                    filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
+                    display_content = '\n'.join(filtered_lines)
+                    if filtered_lines:
+                        st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
+                    else:
+                        st.warning(f"未找到包含 '{search_term}' 的内容")
                 
-                with error_col:
-                    if is_error:
-                        if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
-                            st.session_state.marked_errors.discard(text)
-                            st.rerun()
+                # 渲染方式选择
+                render_mode = st.radio(
+                    "选择渲染方式",
+                    ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],  # 添加DataFrame选项
+                    horizontal=True
+                )
+
+                if render_mode == "HTML渲染":
+                    # 使用unsafe_allow_html=True来渲染HTML表格
+                    st.markdown(display_content, unsafe_allow_html=True)
+                elif render_mode == "Markdown渲染":
+                    # 转换HTML表格为Markdown格式
+                    converted_content = st.session_state.validator.convert_html_table_to_markdown(display_content)
+                    st.markdown(converted_content)
+                elif render_mode == "DataFrame表格":
+                    # 新增:使用DataFrame显示表格
+                    if '<table>' in display_content.lower():
+                        st.session_state.validator.display_html_table_as_dataframe(display_content)
                     else:
-                        if st.button("❌", key=f"error_{text}", help="标记为错误"):
-                            st.session_state.marked_errors.add(text)
+                        st.info("当前内容中没有检测到HTML表格")
+                        st.markdown(display_content)
+                else:
+                    # 原始文本显示
+                    st.text_area(
+                        "MD内容预览",
+                        display_content,
+                        height=300,
+                        help="OCR识别的文本内容"
+                    )
+            
+            # 可点击文本列表
+            st.subheader("🎯 可点击文本列表")
+            
+            if st.session_state.validator.text_bbox_mapping:
+                for text, info_list in st.session_state.validator.text_bbox_mapping.items():
+                    info = info_list[0] # 使用第一个bbox信息
+                    
+                    # 确定显示样式
+                    is_selected = (text == st.session_state.selected_text)
+                    is_error = (text in st.session_state.marked_errors)
+                    
+                    # 创建按钮行
+                    button_col, error_col = st.columns([4, 1])
+                    
+                    with button_col:
+                        button_type = "primary" if is_selected else "secondary"
+                        if st.button(f"📍 {text}", key=f"btn_{text}", type=button_type):
+                            st.session_state.selected_text = text
                             st.rerun()
-        else:
-            st.info("没有可点击的文本项目")
+                    
+                    with error_col:
+                        if is_error:
+                            if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
+                                st.session_state.marked_errors.discard(text)
+                                st.rerun()
+                        else:
+                            if st.button("❌", key=f"error_{text}", help="标记为错误"):
+                                st.session_state.marked_errors.add(text)
+                                st.rerun()
+            else:
+                st.info("没有可点击的文本项目")
+        
+        # 右侧 - 图像显示
+        with right_col:
+            st.header("🖼️ 原图标注")
+            
+            if st.session_state.validator.image_path and Path(st.session_state.validator.image_path).exists():
+                try:
+                    # 加载图片
+                    image = Image.open(st.session_state.validator.image_path)
+                    
+                    # 创建交互式图片
+                    selected_bbox = None
+                    if st.session_state.selected_text and st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
+                        info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
+                        selected_bbox = info['bbox']
+                    
+                    fig = st.session_state.validator.create_interactive_plot(image, selected_bbox)
+                    st.plotly_chart(fig, use_container_width=True)
+                    
+                    # 显示选中文本的详细信息
+                    if st.session_state.selected_text:
+                        st.subheader("📍 选中文本详情")
+                        
+                        if st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
+                            info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
+                            bbox = info['bbox']
+                            
+                            info_col1, info_col2 = st.columns(2)
+                            with info_col1:
+                                st.write(f"**文本内容:** {st.session_state.selected_text}")
+                                st.write(f"**类别:** {info['category']}")
+                                st.write(f"**置信度:** {info.get('confidence', 'N/A')}")
+                            
+                            with info_col2:
+                                st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
+                                if len(bbox) >= 4:
+                                    st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
+                                    st.write(f"**高度:** {bbox[3] - bbox[1]} px")
+                            
+                            # 标记状态
+                            is_error = st.session_state.selected_text in st.session_state.marked_errors
+                            if is_error:
+                                st.error("⚠️ 此文本已标记为错误")
+                            else:
+                                st.success("✅ 此文本未标记错误")
+                except Exception as e:
+                    st.error(f"❌ 图片处理失败: {e}")
+            else:
+                st.error("未找到对应的图片文件")
+                if st.session_state.validator.image_path:
+                    st.write(f"期望路径: {st.session_state.validator.image_path}")
     
-    # 右侧 - 图像显示
-    with right_col:
-        st.header("🖼️ 原图标注")
-        
-        if st.session_state.validator.image_path and Path(st.session_state.validator.image_path).exists():
-            try:
-                # 加载图片
-                image = Image.open(st.session_state.validator.image_path)
-                
-                # 创建交互式图片
-                selected_bbox = None
-                if st.session_state.selected_text and st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
-                    info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
-                    selected_bbox = info['bbox']
+    with tab2:
+        # 新增:专门的表格分析页面
+        st.header("📊 表格数据分析")
+        
+        if st.session_state.validator.md_content:
+            # 检查是否包含表格
+            if '<table' in st.session_state.validator.md_content.lower():
+                col1, col2 = st.columns([2, 1])
                 
-                fig = st.session_state.validator.create_interactive_plot(image, selected_bbox)
-                st.plotly_chart(fig, use_container_width=True)
+                with col1:
+                    st.subheader("🔍 表格数据预览")
+                    st.session_state.validator.display_html_table_as_dataframe(
+                        st.session_state.validator.md_content
+                    )
                 
-                # 显示选中文本的详细信息
-                if st.session_state.selected_text:
-                    st.subheader("📍 选中文本详情")
+                with col2:
+                    st.subheader("⚙️ 表格操作")
                     
-                    if st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
-                        info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
-                        bbox = info['bbox']
-                        
-                        info_col1, info_col2 = st.columns(2)
-                        with info_col1:
-                            st.write(f"**文本内容:** {st.session_state.selected_text}")
-                            st.write(f"**类别:** {info['category']}")
-                            st.write(f"**置信度:** {info.get('confidence', 'N/A')}")
-                        
-                        with info_col2:
-                            st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
-                            if len(bbox) >= 4:
-                                st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
-                                st.write(f"**高度:** {bbox[3] - bbox[1]} px")
-                        
-                        # 标记状态
-                        is_error = st.session_state.selected_text in st.session_state.marked_errors
-                        if is_error:
-                            st.error("⚠️ 此文本已标记为错误")
-                        else:
-                            st.success("✅ 此文本未标记错误")
-            except Exception as e:
-                st.error(f"❌ 图片处理失败: {e}")
+                    if st.button("📥 导出表格数据", type="primary"):
+                        try:
+                            import pandas as pd
+                            from io import StringIO
+                            tables = pd.read_html(StringIO(st.session_state.validator.md_content))
+                            if tables:
+                                # 创建Excel文件
+                                output = BytesIO()
+                                with pd.ExcelWriter(output, engine='openpyxl') as writer:
+                                    for i, table in enumerate(tables):
+                                        table.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
+                                
+                                st.download_button(
+                                    label="📥 下载Excel文件",
+                                    data=output.getvalue(),
+                                    file_name="ocr_tables.xlsx",
+                                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                                )
+                        except Exception as e:
+                            st.error(f"导出失败: {e}")
+                    
+                    if st.button("🔍 表格统计分析"):
+                        try:
+                            import pandas as pd
+                            from io import StringIO
+                            tables = pd.read_html(StringIO(st.session_state.validator.md_content))
+                            if tables:
+                                st.write("**表格统计信息:**")
+                                for i, table in enumerate(tables):
+                                    st.write(f"表格 {i+1}:")
+                                    st.write(f"- 行数: {len(table)}")
+                                    st.write(f"- 列数: {len(table.columns)}")
+                                    st.write(f"- 数值列数: {len(table.select_dtypes(include=[np.number]).columns)}")
+                        except Exception as e:
+                            st.error(f"统计分析失败: {e}")
+            else:
+                st.info("当前OCR结果中没有检测到表格数据")
         else:
-            st.error("未找到对应的图片文件")
-            if st.session_state.validator.image_path:
-                st.write(f"期望路径: {st.session_state.validator.image_path}")
+            st.warning("请先加载OCR数据")
+    
+    with tab3:
+        # 数据统计页面
+        st.header("📈 OCR数据统计")
+        # ...现有的统计代码...
 
 if __name__ == "__main__":
     main()