streamlit_validator_table.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. """
  2. 表格处理和分析功能
  3. """
  4. import streamlit as st
  5. import pandas as pd
  6. import numpy as np
  7. from io import BytesIO
  8. import sys
  9. from pathlib import Path
# Add the ocr_platform root directory to the Python path (needed to import ocr_utils).
# resolve() makes the path absolute; a relative path could otherwise be too
# short for parents[1] and raise IndexError.
_file_path = Path(__file__).resolve()
ocr_platform_root = _file_path.parents[1]  # streamlit_validator_table.py -> ocr_validator -> ocr_platform
if str(ocr_platform_root) not in sys.path:
    sys.path.insert(0, str(ocr_platform_root))
  16. # 从 ocr_utils 导入通用工具
  17. from ocr_utils.html_utils import parse_html_tables
  18. def display_html_table_as_dataframe(html_content: str, enable_editing: bool = False):
  19. """将HTML表格解析为DataFrame显示"""
  20. tables = parse_html_tables(html_content)
  21. wide_table_threshold = 15
  22. if not tables:
  23. st.warning("未找到可解析的表格")
  24. st.markdown("""
  25. <style>
  26. .scrollable-table {
  27. overflow-x: auto;
  28. white-space: nowrap;
  29. border: 1px solid #ddd;
  30. border-radius: 5px;
  31. margin: 10px 0;
  32. }
  33. .scrollable-table table {
  34. width: 100%;
  35. border-collapse: collapse;
  36. }
  37. .scrollable-table th, .scrollable-table td {
  38. border: 1px solid #ddd;
  39. padding: 8px;
  40. text-align: left;
  41. min-width: 100px;
  42. }
  43. .scrollable-table th {
  44. background-color: #f5f5f5;
  45. font-weight: bold;
  46. }
  47. </style>
  48. """, unsafe_allow_html=True)
  49. st.markdown(f'<div class="scrollable-table">{html_content}</div>', unsafe_allow_html=True)
  50. return
  51. for i, table in enumerate(tables):
  52. st.subheader(f"📊 表格 {i+1}")
  53. col_info1, col_info2, col_info3, col_info4 = st.columns(4)
  54. with col_info1:
  55. st.metric("行数", len(table))
  56. with col_info2:
  57. st.metric("列数", len(table.columns))
  58. with col_info3:
  59. is_wide_table = len(table.columns) > wide_table_threshold
  60. st.metric("表格类型", "超宽表格" if is_wide_table else "普通表格")
  61. with col_info4:
  62. display_mode = st.selectbox(
  63. f"显示模式 (表格{i+1})",
  64. ["完整显示", "分页显示", "筛选列显示"],
  65. key=f"display_mode_{i}"
  66. )
  67. col1, col2, col3, col4 = st.columns(4)
  68. with col1:
  69. show_info = st.checkbox(f"显示详细信息", key=f"info_{i}")
  70. with col2:
  71. show_stats = st.checkbox(f"显示统计信息", key=f"stats_{i}")
  72. with col3:
  73. enable_filter = st.checkbox(f"启用过滤", key=f"filter_{i}")
  74. with col4:
  75. enable_sort = st.checkbox(f"启用排序", key=f"sort_{i}")
  76. display_table = _process_table_display_mode(table, i, display_mode)
  77. filtered_table = _apply_table_filters_and_sorts(display_table, i, enable_filter, enable_sort)
  78. _render_table_with_style(filtered_table, table, i, enable_editing, wide_table_threshold)
  79. _display_table_info_and_stats(table, filtered_table, show_info, show_stats, i)
  80. st.markdown("---")
  81. def _process_table_display_mode(table: pd.DataFrame, table_index: int, display_mode: str) -> pd.DataFrame:
  82. """根据显示模式处理表格"""
  83. if display_mode == "分页显示":
  84. page_size = st.selectbox(
  85. f"每页显示行数 (表格 {table_index+1})",
  86. [10, 20, 50, 100],
  87. key=f"page_size_{table_index}"
  88. )
  89. total_pages = (len(table) - 1) // page_size + 1
  90. if total_pages > 1:
  91. page_number = st.selectbox(
  92. f"页码 (表格 {table_index+1})",
  93. range(1, total_pages + 1),
  94. key=f"page_number_{table_index}"
  95. )
  96. start_idx = (page_number - 1) * page_size
  97. end_idx = start_idx + page_size
  98. return table.iloc[start_idx:end_idx]
  99. return table
  100. elif display_mode == "筛选列显示":
  101. if len(table.columns) > 5:
  102. selected_columns = st.multiselect(
  103. f"选择要显示的列 (表格 {table_index+1})",
  104. table.columns.tolist(),
  105. default=table.columns.tolist()[:5],
  106. key=f"selected_columns_{table_index}"
  107. )
  108. if selected_columns:
  109. return table[selected_columns]
  110. return table
  111. else:
  112. return table
  113. def _apply_table_filters_and_sorts(table: pd.DataFrame, table_index: int,
  114. enable_filter: bool, enable_sort: bool) -> pd.DataFrame:
  115. """应用表格过滤和排序"""
  116. filtered_table = table.copy()
  117. if enable_filter and not table.empty:
  118. filter_col = st.selectbox(
  119. f"选择过滤列 (表格 {table_index+1})",
  120. options=['无'] + list(table.columns),
  121. key=f"filter_col_{table_index}"
  122. )
  123. if filter_col != '无':
  124. filter_value = st.text_input(f"过滤值 (表格 {table_index+1})", key=f"filter_value_{table_index}")
  125. if filter_value:
  126. filtered_table = table[table[filter_col].astype(str).str.contains(filter_value, na=False)]
  127. if enable_sort and not filtered_table.empty:
  128. sort_col = st.selectbox(
  129. f"选择排序列 (表格 {table_index+1})",
  130. options=['无'] + list(filtered_table.columns),
  131. key=f"sort_col_{table_index}"
  132. )
  133. if sort_col != '无':
  134. sort_order = st.radio(
  135. f"排序方式 (表格 {table_index+1})",
  136. options=['升序', '降序'],
  137. horizontal=True,
  138. key=f"sort_order_{table_index}"
  139. )
  140. ascending = (sort_order == '升序')
  141. filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
  142. return filtered_table
  143. def _render_table_with_style(filtered_table: pd.DataFrame, original_table: pd.DataFrame,
  144. table_index: int, enable_editing: bool, wide_table_threshold: int):
  145. """渲染表格并应用样式"""
  146. st.markdown("""
  147. <style>
  148. .dataframe-container {
  149. overflow-x: auto;
  150. border: 1px solid #ddd;
  151. border-radius: 5px;
  152. margin: 10px 0;
  153. }
  154. .wide-table-container {
  155. overflow-x: auto;
  156. max-height: 500px;
  157. overflow-y: auto;
  158. border: 2px solid #0288d1;
  159. border-radius: 8px;
  160. background: linear-gradient(90deg, #f8f9fa 0%, #ffffff 100%);
  161. }
  162. .dataframe thead th {
  163. position: sticky;
  164. top: 0;
  165. background-color: #f5f5f5 !important;
  166. z-index: 10;
  167. border-bottom: 2px solid #0288d1;
  168. }
  169. .dataframe tbody td {
  170. white-space: nowrap;
  171. min-width: 100px;
  172. max-width: 300px;
  173. overflow: hidden;
  174. text-overflow: ellipsis;
  175. }
  176. </style>
  177. """, unsafe_allow_html=True)
  178. container_class = "wide-table-container" if len(original_table.columns) > wide_table_threshold else "dataframe-container"
  179. if enable_editing:
  180. st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
  181. edited_table = st.data_editor(
  182. filtered_table,
  183. width='stretch',
  184. key=f"editor_{table_index}",
  185. height=400 if len(original_table.columns) > 8 else None
  186. )
  187. st.markdown('</div>', unsafe_allow_html=True)
  188. if not edited_table.equals(filtered_table):
  189. st.success("✏️ 表格已编辑,可以导出修改后的数据")
  190. else:
  191. st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
  192. st.dataframe(
  193. filtered_table,
  194. width=400 if len(original_table.columns) > wide_table_threshold else "stretch"
  195. )
  196. st.markdown('</div>', unsafe_allow_html=True)
  197. def _display_table_info_and_stats(original_table: pd.DataFrame, filtered_table: pd.DataFrame,
  198. show_info: bool, show_stats: bool, table_index: int):
  199. """显示表格信息和统计数据"""
  200. if show_info:
  201. st.write("**表格信息:**")
  202. st.write(f"- 原始行数: {len(original_table)}")
  203. st.write(f"- 过滤后行数: {len(filtered_table)}")
  204. st.write(f"- 列数: {len(original_table.columns)}")
  205. st.write(f"- 列名: {', '.join(original_table.columns)}")
  206. if show_stats:
  207. st.write("**统计信息:**")
  208. numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
  209. if len(numeric_cols) > 0:
  210. st.dataframe(filtered_table[numeric_cols].describe())
  211. else:
  212. st.info("表格中没有数值列")
  213. if st.button(f"📥 导出表格 {table_index+1}", key=f"export_{table_index}"):
  214. _create_export_buttons(filtered_table, table_index)
  215. def _create_export_buttons(table: pd.DataFrame, table_index: int):
  216. """创建导出按钮"""
  217. csv_data = table.to_csv(index=False)
  218. st.download_button(
  219. label=f"下载CSV (表格 {table_index+1})",
  220. data=csv_data,
  221. file_name=f"table_{table_index+1}.csv",
  222. mime="text/csv",
  223. key=f"download_csv_{table_index}"
  224. )
  225. excel_buffer = BytesIO()
  226. table.to_excel(excel_buffer, index=False)
  227. st.download_button(
  228. label=f"下载Excel (表格 {table_index+1})",
  229. data=excel_buffer.getvalue(),
  230. file_name=f"table_{table_index+1}.xlsx",
  231. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  232. key=f"download_excel_{table_index}"
  233. )