streamlit_validator_table.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. """
  2. 表格处理和分析功能
  3. """
  4. import streamlit as st
  5. import pandas as pd
  6. import numpy as np
  7. from io import BytesIO
  8. from ocr_validator_file_utils import parse_html_tables
  9. def display_html_table_as_dataframe(html_content: str, enable_editing: bool = False):
  10. """将HTML表格解析为DataFrame显示"""
  11. tables = parse_html_tables(html_content)
  12. wide_table_threshold = 15
  13. if not tables:
  14. st.warning("未找到可解析的表格")
  15. st.markdown("""
  16. <style>
  17. .scrollable-table {
  18. overflow-x: auto;
  19. white-space: nowrap;
  20. border: 1px solid #ddd;
  21. border-radius: 5px;
  22. margin: 10px 0;
  23. }
  24. .scrollable-table table {
  25. width: 100%;
  26. border-collapse: collapse;
  27. }
  28. .scrollable-table th, .scrollable-table td {
  29. border: 1px solid #ddd;
  30. padding: 8px;
  31. text-align: left;
  32. min-width: 100px;
  33. }
  34. .scrollable-table th {
  35. background-color: #f5f5f5;
  36. font-weight: bold;
  37. }
  38. </style>
  39. """, unsafe_allow_html=True)
  40. st.markdown(f'<div class="scrollable-table">{html_content}</div>', unsafe_allow_html=True)
  41. return
  42. for i, table in enumerate(tables):
  43. st.subheader(f"📊 表格 {i+1}")
  44. col_info1, col_info2, col_info3, col_info4 = st.columns(4)
  45. with col_info1:
  46. st.metric("行数", len(table))
  47. with col_info2:
  48. st.metric("列数", len(table.columns))
  49. with col_info3:
  50. is_wide_table = len(table.columns) > wide_table_threshold
  51. st.metric("表格类型", "超宽表格" if is_wide_table else "普通表格")
  52. with col_info4:
  53. display_mode = st.selectbox(
  54. f"显示模式 (表格{i+1})",
  55. ["完整显示", "分页显示", "筛选列显示"],
  56. key=f"display_mode_{i}"
  57. )
  58. col1, col2, col3, col4 = st.columns(4)
  59. with col1:
  60. show_info = st.checkbox(f"显示详细信息", key=f"info_{i}")
  61. with col2:
  62. show_stats = st.checkbox(f"显示统计信息", key=f"stats_{i}")
  63. with col3:
  64. enable_filter = st.checkbox(f"启用过滤", key=f"filter_{i}")
  65. with col4:
  66. enable_sort = st.checkbox(f"启用排序", key=f"sort_{i}")
  67. display_table = _process_table_display_mode(table, i, display_mode)
  68. filtered_table = _apply_table_filters_and_sorts(display_table, i, enable_filter, enable_sort)
  69. _render_table_with_style(filtered_table, table, i, enable_editing, wide_table_threshold)
  70. _display_table_info_and_stats(table, filtered_table, show_info, show_stats, i)
  71. st.markdown("---")
  72. def _process_table_display_mode(table: pd.DataFrame, table_index: int, display_mode: str) -> pd.DataFrame:
  73. """根据显示模式处理表格"""
  74. if display_mode == "分页显示":
  75. page_size = st.selectbox(
  76. f"每页显示行数 (表格 {table_index+1})",
  77. [10, 20, 50, 100],
  78. key=f"page_size_{table_index}"
  79. )
  80. total_pages = (len(table) - 1) // page_size + 1
  81. if total_pages > 1:
  82. page_number = st.selectbox(
  83. f"页码 (表格 {table_index+1})",
  84. range(1, total_pages + 1),
  85. key=f"page_number_{table_index}"
  86. )
  87. start_idx = (page_number - 1) * page_size
  88. end_idx = start_idx + page_size
  89. return table.iloc[start_idx:end_idx]
  90. return table
  91. elif display_mode == "筛选列显示":
  92. if len(table.columns) > 5:
  93. selected_columns = st.multiselect(
  94. f"选择要显示的列 (表格 {table_index+1})",
  95. table.columns.tolist(),
  96. default=table.columns.tolist()[:5],
  97. key=f"selected_columns_{table_index}"
  98. )
  99. if selected_columns:
  100. return table[selected_columns]
  101. return table
  102. else:
  103. return table
  104. def _apply_table_filters_and_sorts(table: pd.DataFrame, table_index: int,
  105. enable_filter: bool, enable_sort: bool) -> pd.DataFrame:
  106. """应用表格过滤和排序"""
  107. filtered_table = table.copy()
  108. if enable_filter and not table.empty:
  109. filter_col = st.selectbox(
  110. f"选择过滤列 (表格 {table_index+1})",
  111. options=['无'] + list(table.columns),
  112. key=f"filter_col_{table_index}"
  113. )
  114. if filter_col != '无':
  115. filter_value = st.text_input(f"过滤值 (表格 {table_index+1})", key=f"filter_value_{table_index}")
  116. if filter_value:
  117. filtered_table = table[table[filter_col].astype(str).str.contains(filter_value, na=False)]
  118. if enable_sort and not filtered_table.empty:
  119. sort_col = st.selectbox(
  120. f"选择排序列 (表格 {table_index+1})",
  121. options=['无'] + list(filtered_table.columns),
  122. key=f"sort_col_{table_index}"
  123. )
  124. if sort_col != '无':
  125. sort_order = st.radio(
  126. f"排序方式 (表格 {table_index+1})",
  127. options=['升序', '降序'],
  128. horizontal=True,
  129. key=f"sort_order_{table_index}"
  130. )
  131. ascending = (sort_order == '升序')
  132. filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
  133. return filtered_table
  134. def _render_table_with_style(filtered_table: pd.DataFrame, original_table: pd.DataFrame,
  135. table_index: int, enable_editing: bool, wide_table_threshold: int):
  136. """渲染表格并应用样式"""
  137. st.markdown("""
  138. <style>
  139. .dataframe-container {
  140. overflow-x: auto;
  141. border: 1px solid #ddd;
  142. border-radius: 5px;
  143. margin: 10px 0;
  144. }
  145. .wide-table-container {
  146. overflow-x: auto;
  147. max-height: 500px;
  148. overflow-y: auto;
  149. border: 2px solid #0288d1;
  150. border-radius: 8px;
  151. background: linear-gradient(90deg, #f8f9fa 0%, #ffffff 100%);
  152. }
  153. .dataframe thead th {
  154. position: sticky;
  155. top: 0;
  156. background-color: #f5f5f5 !important;
  157. z-index: 10;
  158. border-bottom: 2px solid #0288d1;
  159. }
  160. .dataframe tbody td {
  161. white-space: nowrap;
  162. min-width: 100px;
  163. max-width: 300px;
  164. overflow: hidden;
  165. text-overflow: ellipsis;
  166. }
  167. </style>
  168. """, unsafe_allow_html=True)
  169. container_class = "wide-table-container" if len(original_table.columns) > wide_table_threshold else "dataframe-container"
  170. if enable_editing:
  171. st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
  172. edited_table = st.data_editor(
  173. filtered_table,
  174. width='stretch',
  175. key=f"editor_{table_index}",
  176. height=400 if len(original_table.columns) > 8 else None
  177. )
  178. st.markdown('</div>', unsafe_allow_html=True)
  179. if not edited_table.equals(filtered_table):
  180. st.success("✏️ 表格已编辑,可以导出修改后的数据")
  181. else:
  182. st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
  183. st.dataframe(
  184. filtered_table,
  185. width=400 if len(original_table.columns) > wide_table_threshold else "stretch"
  186. )
  187. st.markdown('</div>', unsafe_allow_html=True)
  188. def _display_table_info_and_stats(original_table: pd.DataFrame, filtered_table: pd.DataFrame,
  189. show_info: bool, show_stats: bool, table_index: int):
  190. """显示表格信息和统计数据"""
  191. if show_info:
  192. st.write("**表格信息:**")
  193. st.write(f"- 原始行数: {len(original_table)}")
  194. st.write(f"- 过滤后行数: {len(filtered_table)}")
  195. st.write(f"- 列数: {len(original_table.columns)}")
  196. st.write(f"- 列名: {', '.join(original_table.columns)}")
  197. if show_stats:
  198. st.write("**统计信息:**")
  199. numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
  200. if len(numeric_cols) > 0:
  201. st.dataframe(filtered_table[numeric_cols].describe())
  202. else:
  203. st.info("表格中没有数值列")
  204. if st.button(f"📥 导出表格 {table_index+1}", key=f"export_{table_index}"):
  205. _create_export_buttons(filtered_table, table_index)
  206. def _create_export_buttons(table: pd.DataFrame, table_index: int):
  207. """创建导出按钮"""
  208. csv_data = table.to_csv(index=False)
  209. st.download_button(
  210. label=f"下载CSV (表格 {table_index+1})",
  211. data=csv_data,
  212. file_name=f"table_{table_index+1}.csv",
  213. mime="text/csv",
  214. key=f"download_csv_{table_index}"
  215. )
  216. excel_buffer = BytesIO()
  217. table.to_excel(excel_buffer, index=False)
  218. st.download_button(
  219. label=f"下载Excel (表格 {table_index+1})",
  220. data=excel_buffer.getvalue(),
  221. file_name=f"table_{table_index+1}.xlsx",
  222. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  223. key=f"download_excel_{table_index}"
  224. )