"""
表格处理和分析功能
"""
import streamlit as st
import pandas as pd
import numpy as np
from io import BytesIO
from ocr_validator_file_utils import parse_html_tables
def display_html_table_as_dataframe(html_content: str, enable_editing: bool = False):
    """Parse the tables found in an HTML string and render each one in Streamlit.

    For every parsed table this draws a header, four metrics (row count,
    column count, table type, display-mode selector), four option
    checkboxes, the table itself and, optionally, extra info/statistics,
    followed by a horizontal rule.

    Args:
        html_content: HTML text handed to ``parse_html_tables``.
        enable_editing: Forwarded to ``_render_table_with_style``; when true
            the table is shown in an editable widget instead of a read-only one.

    Returns:
        None. All output is produced through Streamlit calls.
    """
    # parse_html_tables is a project helper; the code below treats each item
    # it yields as a pandas DataFrame (uses len() and .columns).
    tables = parse_html_tables(html_content)
    # Tables with more columns than this are labelled as "wide".
    wide_table_threshold = 15

    if not tables:
        st.warning("未找到可解析的表格")
        # NOTE(review): this literal only contains a newline; it presumably
        # held style markup at some point - confirm against the original file.
        st.markdown("\n", unsafe_allow_html=True)
        # Fall back to showing the raw HTML. The content is injected
        # unescaped (unsafe_allow_html=True), so it must come from a trusted
        # source. The literal is written with explicit "\n" escapes because
        # a single-quoted string cannot span several physical lines.
        st.markdown(f"\n{html_content}\n", unsafe_allow_html=True)
        return

    for i, table in enumerate(tables):
        st.subheader(f"📊 表格 {i+1}")

        # Row 1: summary metrics plus the display-mode selector.
        col_info1, col_info2, col_info3, col_info4 = st.columns(4)
        with col_info1:
            st.metric("行数", len(table))
        with col_info2:
            st.metric("列数", len(table.columns))
        with col_info3:
            is_wide_table = len(table.columns) > wide_table_threshold
            st.metric("表格类型", "超宽表格" if is_wide_table else "普通表格")
        with col_info4:
            display_mode = st.selectbox(
                f"显示模式 (表格{i+1})",
                ["完整显示", "分页显示", "筛选列显示"],
                key=f"display_mode_{i}"
            )

        # Row 2: feature toggles. Widget keys carry the table index so that
        # several tables on one page do not share widget state.
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            show_info = st.checkbox("显示详细信息", key=f"info_{i}")
        with col2:
            show_stats = st.checkbox("显示统计信息", key=f"stats_{i}")
        with col3:
            enable_filter = st.checkbox("启用过滤", key=f"filter_{i}")
        with col4:
            enable_sort = st.checkbox("启用排序", key=f"sort_{i}")

        # Pipeline: display mode -> filter/sort -> render -> info/stats.
        display_table = _process_table_display_mode(table, i, display_mode)
        filtered_table = _apply_table_filters_and_sorts(display_table, i, enable_filter, enable_sort)
        _render_table_with_style(filtered_table, table, i, enable_editing, wide_table_threshold)
        _display_table_info_and_stats(table, filtered_table, show_info, show_stats, i)
        st.markdown("---")
def _process_table_display_mode(table: pd.DataFrame, table_index: int, display_mode: str) -> pd.DataFrame:
"""根据显示模式处理表格"""
if display_mode == "分页显示":
page_size = st.selectbox(
f"每页显示行数 (表格 {table_index+1})",
[10, 20, 50, 100],
key=f"page_size_{table_index}"
)
total_pages = (len(table) - 1) // page_size + 1
if total_pages > 1:
page_number = st.selectbox(
f"页码 (表格 {table_index+1})",
range(1, total_pages + 1),
key=f"page_number_{table_index}"
)
start_idx = (page_number - 1) * page_size
end_idx = start_idx + page_size
return table.iloc[start_idx:end_idx]
return table
elif display_mode == "筛选列显示":
if len(table.columns) > 5:
selected_columns = st.multiselect(
f"选择要显示的列 (表格 {table_index+1})",
table.columns.tolist(),
default=table.columns.tolist()[:5],
key=f"selected_columns_{table_index}"
)
if selected_columns:
return table[selected_columns]
return table
else:
return table
def _apply_table_filters_and_sorts(table: pd.DataFrame, table_index: int,
enable_filter: bool, enable_sort: bool) -> pd.DataFrame:
"""应用表格过滤和排序"""
filtered_table = table.copy()
if enable_filter and not table.empty:
filter_col = st.selectbox(
f"选择过滤列 (表格 {table_index+1})",
options=['无'] + list(table.columns),
key=f"filter_col_{table_index}"
)
if filter_col != '无':
filter_value = st.text_input(f"过滤值 (表格 {table_index+1})", key=f"filter_value_{table_index}")
if filter_value:
filtered_table = table[table[filter_col].astype(str).str.contains(filter_value, na=False)]
if enable_sort and not filtered_table.empty:
sort_col = st.selectbox(
f"选择排序列 (表格 {table_index+1})",
options=['无'] + list(filtered_table.columns),
key=f"sort_col_{table_index}"
)
if sort_col != '无':
sort_order = st.radio(
f"排序方式 (表格 {table_index+1})",
options=['升序', '降序'],
horizontal=True,
key=f"sort_order_{table_index}"
)
ascending = (sort_order == '升序')
filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
return filtered_table
def _render_table_with_style(filtered_table: pd.DataFrame, original_table: pd.DataFrame,
                             table_index: int, enable_editing: bool, wide_table_threshold: int):
    """Render ``filtered_table`` as an editable or a read-only Streamlit widget.

    Args:
        filtered_table: The rows/columns to show.
        original_table: The unfiltered table; only its column count is used,
            to choose sizing.
        table_index: Zero-based table position, used in the editor widget key.
        enable_editing: Use ``st.data_editor`` when true, ``st.dataframe`` otherwise.
        wide_table_threshold: Column count above which the table counts as wide.

    Returns:
        None.
    """
    # NOTE(review): the markdown literals in this function contain only
    # whitespace or nothing; they presumably carried style/wrapper markup
    # at some point - confirm against the original file.
    st.markdown("\n", unsafe_allow_html=True)

    column_count = len(original_table.columns)
    is_wide = column_count > wide_table_threshold
    # NOTE(review): container_class is not consumed by any visible statement;
    # presumably it was interpolated into the (now empty) wrapper markup below.
    container_class = "wide-table-container" if is_wide else "dataframe-container"

    if enable_editing:
        st.markdown('', unsafe_allow_html=True)
        edited_table = st.data_editor(
            filtered_table,
            width='stretch',
            key=f"editor_{table_index}",
            # Fixed height only for tables with more than 8 columns.
            height=400 if column_count > 8 else None
        )
        # Written as "\n": a single-quoted literal cannot span two lines.
        st.markdown("\n", unsafe_allow_html=True)

        # Tell the user when the editor content differs from what was passed in.
        if not edited_table.equals(filtered_table):
            st.success("✏️ 表格已编辑,可以导出修改后的数据")
    else:
        st.markdown('', unsafe_allow_html=True)
        st.dataframe(
            filtered_table,
            width=400 if is_wide else "stretch"
        )
        st.markdown("\n", unsafe_allow_html=True)
def _display_table_info_and_stats(original_table: pd.DataFrame, filtered_table: pd.DataFrame,
                                  show_info: bool, show_stats: bool, table_index: int):
    """Show optional table details, numeric statistics and an export button.

    Args:
        original_table: The unfiltered table (source of row/column counts and names).
        filtered_table: The table after filtering; used for the filtered row
            count, the statistics and the export.
        show_info: When true, write row counts, column count and column names.
        show_stats: When true, show ``describe()`` for the numeric columns, or
            an info message when there are none.
        table_index: Zero-based table position, used in the button label and key.

    Returns:
        None.
    """
    if show_info:
        st.write("**表格信息:**")
        st.write(f"- 原始行数: {len(original_table)}")
        st.write(f"- 过滤后行数: {len(filtered_table)}")
        st.write(f"- 列数: {len(original_table.columns)}")
        # Column labels are not guaranteed to be strings (e.g. integer labels
        # on header-less tables); str.join would raise TypeError on those.
        column_names = ', '.join(map(str, original_table.columns))
        st.write(f"- 列名: {column_names}")

    if show_stats:
        st.write("**统计信息:**")
        numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            st.dataframe(filtered_table[numeric_cols].describe())
        else:
            st.info("表格中没有数值列")

    # The download buttons are only created on the run in which this button
    # was clicked.
    if st.button(f"📥 导出表格 {table_index+1}", key=f"export_{table_index}"):
        _create_export_buttons(filtered_table, table_index)
def _create_export_buttons(table: pd.DataFrame, table_index: int):
    """Render CSV and Excel download buttons for ``table``.

    Both exports omit the DataFrame index. ``table_index`` is zero-based;
    labels and file names use the one-based number.
    """
    table_no = table_index + 1

    # CSV: serialised to text and handed to the button as-is.
    csv_text = table.to_csv(index=False)
    st.download_button(
        label=f"下载CSV (表格 {table_no})",
        data=csv_text,
        file_name=f"table_{table_no}.csv",
        mime="text/csv",
        key=f"download_csv_{table_index}"
    )

    # Excel: written into an in-memory buffer, then exposed as raw bytes.
    workbook_buffer = BytesIO()
    table.to_excel(workbook_buffer, index=False)
    workbook_bytes = workbook_buffer.getvalue()
    st.download_button(
        label=f"下载Excel (表格 {table_no})",
        data=workbook_bytes,
        file_name=f"table_{table_no}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        key=f"download_excel_{table_index}"
    )