dataframe_diff_v0.1.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. import streamlit as st
  2. import pandas as pd
  3. import numpy as np
  4. from typing import Dict, List, Tuple, Optional
  5. import plotly.graph_objects as go
  6. from plotly.subplots import make_subplots
  7. import plotly.express as px
  8. def create_dataframe_diff_visualizer():
  9. st.title("📊 DataFrame可视化比对工具")
  10. st.markdown("---")
  11. # 初始化数据
  12. if 'original_df' not in st.session_state:
  13. st.session_state.original_df = create_sample_data()
  14. if 'edited_df' not in st.session_state:
  15. st.session_state.edited_df = st.session_state.original_df.copy()
  16. # 控制面板
  17. with st.expander("🎛️ 控制面板", expanded=True):
  18. col1, col2, col3, col4 = st.columns(4)
  19. with col1:
  20. if st.button("🔄 重置数据", type="secondary"):
  21. st.session_state.original_df = create_sample_data()
  22. st.session_state.edited_df = st.session_state.original_df.copy()
  23. st.rerun()
  24. with col2:
  25. if st.button("🎲 生成随机差异", type="secondary"):
  26. st.session_state.edited_df = create_random_differences(st.session_state.original_df)
  27. st.rerun()
  28. with col3:
  29. sync_mode = st.checkbox("🔗 同步滚动", value=True)
  30. with col4:
  31. show_stats = st.checkbox("📈 显示统计", value=True)
  32. # 分析差异
  33. diff_analysis = analyze_dataframe_differences(
  34. st.session_state.original_df,
  35. st.session_state.edited_df
  36. )
  37. # 显示差异统计
  38. if show_stats:
  39. display_diff_statistics(diff_analysis)
  40. # 主要比对区域
  41. st.subheader("📝 数据比对")
  42. # 使用两列布局
  43. left_col, right_col = st.columns(2)
  44. with left_col:
  45. st.markdown("### 📝 可编辑版本 (左侧)")
  46. # 可编辑的数据编辑器
  47. edited_df = st.data_editor(
  48. st.session_state.edited_df,
  49. height=500,
  50. use_container_width=True,
  51. num_rows="dynamic",
  52. key="left_editor",
  53. column_config=create_column_config(st.session_state.edited_df)
  54. )
  55. # 更新编辑后的数据
  56. if not edited_df.equals(st.session_state.edited_df):
  57. st.session_state.edited_df = edited_df.copy()
  58. st.rerun()
  59. with right_col:
  60. st.markdown("### 📊 原始版本 (右侧)")
  61. # 显示带差异高亮的原始数据
  62. display_dataframe_with_diff_highlighting(
  63. st.session_state.original_df,
  64. diff_analysis,
  65. "original"
  66. )
  67. # 详细差异视图
  68. st.markdown("---")
  69. create_detailed_diff_view(diff_analysis)
  70. def create_sample_data() -> pd.DataFrame:
  71. """创建示例数据"""
  72. np.random.seed(42)
  73. data = {
  74. 'ID': range(1, 21),
  75. 'Name': [f'Product_{i}' for i in range(1, 21)],
  76. 'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], 20),
  77. 'Price': np.round(np.random.uniform(10, 100, 20), 2),
  78. 'Stock': np.random.randint(0, 200, 20),
  79. 'Rating': np.round(np.random.uniform(1, 5, 20), 1),
  80. 'Active': np.random.choice([True, False], 20)
  81. }
  82. return pd.DataFrame(data)
  83. def create_random_differences(df: pd.DataFrame) -> pd.DataFrame:
  84. """创建随机差异用于演示"""
  85. modified_df = df.copy()
  86. # 随机修改一些单元格
  87. num_changes = np.random.randint(5, 15)
  88. for _ in range(num_changes):
  89. row_idx = np.random.randint(0, len(modified_df))
  90. col_idx = np.random.randint(1, len(modified_df.columns)) # 跳过ID列
  91. col_name = modified_df.columns[col_idx]
  92. if col_name == 'Name':
  93. modified_df.loc[row_idx, col_name] = f'Modified_{row_idx}'
  94. elif col_name == 'Category':
  95. modified_df.loc[row_idx, col_name] = np.random.choice(['Modified_Cat', 'New_Category'])
  96. elif col_name == 'Price':
  97. modified_df.loc[row_idx, col_name] = np.round(np.random.uniform(10, 150), 2)
  98. elif col_name == 'Stock':
  99. modified_df.loc[row_idx, col_name] = np.random.randint(0, 300)
  100. elif col_name == 'Rating':
  101. modified_df.loc[row_idx, col_name] = np.round(np.random.uniform(1, 5), 1)
  102. elif col_name == 'Active':
  103. modified_df.loc[row_idx, col_name] = not modified_df.loc[row_idx, col_name]
  104. return modified_df
  105. def analyze_dataframe_differences(df1: pd.DataFrame, df2: pd.DataFrame) -> Dict:
  106. """分析两个DataFrame之间的差异"""
  107. # 确保两个DataFrame具有相同的形状和列
  108. if df1.shape != df2.shape:
  109. st.warning("⚠️ 两个DataFrame的形状不匹配!")
  110. common_columns = list(set(df1.columns) & set(df2.columns))
  111. differences = {
  112. 'cell_differences': [],
  113. 'added_rows': [],
  114. 'removed_rows': [],
  115. 'column_differences': {
  116. 'added_columns': list(set(df2.columns) - set(df1.columns)),
  117. 'removed_columns': list(set(df1.columns) - set(df2.columns))
  118. },
  119. 'summary': {
  120. 'total_differences': 0,
  121. 'modified_cells': 0,
  122. 'modified_rows': set(),
  123. 'modified_columns': set()
  124. }
  125. }
  126. # 比较相同大小的DataFrame
  127. min_rows = min(len(df1), len(df2))
  128. for row_idx in range(min_rows):
  129. for col in common_columns:
  130. try:
  131. val1 = df1.iloc[row_idx][col]
  132. val2 = df2.iloc[row_idx][col]
  133. # 处理NaN值
  134. if pd.isna(val1) and pd.isna(val2):
  135. continue
  136. if pd.isna(val1) or pd.isna(val2) or val1 != val2:
  137. differences['cell_differences'].append({
  138. 'row': row_idx,
  139. 'column': col,
  140. 'original_value': val1,
  141. 'new_value': val2,
  142. 'change_type': determine_change_type(val1, val2)
  143. })
  144. differences['summary']['modified_cells'] += 1
  145. differences['summary']['modified_rows'].add(row_idx)
  146. differences['summary']['modified_columns'].add(col)
  147. except Exception as e:
  148. st.warning(f"比较时出错 (行{row_idx}, 列{col}): {e}")
  149. # 检查行数差异
  150. if len(df1) > len(df2):
  151. differences['removed_rows'] = list(range(len(df2), len(df1)))
  152. elif len(df2) > len(df1):
  153. differences['added_rows'] = list(range(len(df1), len(df2)))
  154. differences['summary']['total_differences'] = (
  155. differences['summary']['modified_cells'] +
  156. len(differences['added_rows']) +
  157. len(differences['removed_rows']) +
  158. len(differences['column_differences']['added_columns']) +
  159. len(differences['column_differences']['removed_columns'])
  160. )
  161. return differences
  162. def determine_change_type(val1, val2) -> str:
  163. """确定变更类型"""
  164. if pd.isna(val1):
  165. return "added"
  166. elif pd.isna(val2):
  167. return "removed"
  168. else:
  169. return "modified"
  170. def create_column_config(df: pd.DataFrame) -> Dict:
  171. """为data_editor创建列配置"""
  172. config = {}
  173. for col in df.columns:
  174. if df[col].dtype in ['int64', 'float64']:
  175. config[col] = st.column_config.NumberColumn(
  176. col,
  177. help=f"数值列: {col}",
  178. format="%.2f" if df[col].dtype == 'float64' else "%d"
  179. )
  180. elif df[col].dtype == 'bool':
  181. config[col] = st.column_config.CheckboxColumn(
  182. col,
  183. help=f"布尔列: {col}"
  184. )
  185. else:
  186. config[col] = st.column_config.TextColumn(
  187. col,
  188. help=f"文本列: {col}",
  189. max_chars=100
  190. )
  191. return config
  192. def display_dataframe_with_diff_highlighting(df: pd.DataFrame, diff_analysis: Dict, view_type: str):
  193. """显示带差异高亮的DataFrame"""
  194. # 创建样式化的HTML表格
  195. html_table = create_styled_diff_table(df, diff_analysis, view_type)
  196. # 自定义CSS样式
  197. st.markdown("""
  198. <style>
  199. .diff-table {
  200. height: 500px;
  201. overflow: auto;
  202. border: 1px solid #ddd;
  203. border-radius: 5px;
  204. font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  205. font-size: 12px;
  206. }
  207. .diff-table table {
  208. width: 100%;
  209. border-collapse: collapse;
  210. margin: 0;
  211. }
  212. .diff-table th {
  213. background-color: #f5f5f5;
  214. border: 1px solid #ddd;
  215. padding: 8px;
  216. text-align: left;
  217. position: sticky;
  218. top: 0;
  219. z-index: 10;
  220. }
  221. .diff-table td {
  222. border: 1px solid #ddd;
  223. padding: 8px;
  224. white-space: nowrap;
  225. }
  226. /* 差异高亮样式 */
  227. .cell-modified {
  228. background-color: #fff3cd !important;
  229. border: 2px solid #ffc107 !important;
  230. position: relative;
  231. }
  232. .cell-added {
  233. background-color: #d4edda !important;
  234. border: 2px solid #28a745 !important;
  235. }
  236. .cell-removed {
  237. background-color: #f8d7da !important;
  238. border: 2px solid #dc3545 !important;
  239. }
  240. .row-highlight {
  241. background-color: #f8f9fa !important;
  242. }
  243. /* 悬停效果 */
  244. .diff-table td:hover {
  245. background-color: #e3f2fd !important;
  246. cursor: pointer;
  247. }
  248. /* 差异标记 */
  249. .diff-marker {
  250. position: absolute;
  251. top: 2px;
  252. right: 2px;
  253. width: 8px;
  254. height: 8px;
  255. border-radius: 50%;
  256. }
  257. .marker-modified { background-color: #ffc107; }
  258. .marker-added { background-color: #28a745; }
  259. .marker-removed { background-color: #dc3545; }
  260. </style>
  261. """, unsafe_allow_html=True)
  262. # 显示表格
  263. st.markdown(f'<div class="diff-table">{html_table}</div>', unsafe_allow_html=True)
  264. def create_styled_diff_table(df: pd.DataFrame, diff_analysis: Dict, view_type: str) -> str:
  265. """创建带样式的差异表格HTML"""
  266. # 创建差异映射
  267. diff_map = {}
  268. for diff in diff_analysis['cell_differences']:
  269. key = (diff['row'], diff['column'])
  270. diff_map[key] = diff
  271. # 开始构建HTML
  272. html = '<table>'
  273. # 表头
  274. html += '<tr>'
  275. for col in df.columns:
  276. html += f'<th>{col}</th>'
  277. html += '</tr>'
  278. # 表格行
  279. for row_idx in range(len(df)):
  280. row_class = "row-highlight" if row_idx in diff_analysis['summary']['modified_rows'] else ""
  281. html += f'<tr class="{row_class}">'
  282. for col in df.columns:
  283. value = df.iloc[row_idx][col]
  284. cell_key = (row_idx, col)
  285. # 确定单元格样式
  286. cell_class = ""
  287. marker_class = ""
  288. if cell_key in diff_map:
  289. diff_info = diff_map[cell_key]
  290. change_type = diff_info['change_type']
  291. if change_type == "modified":
  292. cell_class = "cell-modified"
  293. marker_class = "marker-modified"
  294. elif change_type == "added":
  295. cell_class = "cell-added"
  296. marker_class = "marker-added"
  297. elif change_type == "removed":
  298. cell_class = "cell-removed"
  299. marker_class = "marker-removed"
  300. # 处理值显示
  301. display_value = str(value) if not pd.isna(value) else ""
  302. # 构建单元格HTML
  303. cell_html = f'<td class="{cell_class}" title="行{row_idx}, 列{col}: {display_value}">'
  304. if marker_class:
  305. cell_html += f'<div class="diff-marker {marker_class}"></div>'
  306. cell_html += display_value
  307. cell_html += '</td>'
  308. html += cell_html
  309. html += '</tr>'
  310. html += '</table>'
  311. return html
  312. def display_diff_statistics(diff_analysis: Dict):
  313. """显示差异统计信息"""
  314. st.subheader("📈 差异统计")
  315. col1, col2, col3, col4, col5 = st.columns(5)
  316. with col1:
  317. st.metric(
  318. "总差异数",
  319. diff_analysis['summary']['total_differences'],
  320. help="所有类型的差异总数"
  321. )
  322. with col2:
  323. st.metric(
  324. "修改的单元格",
  325. diff_analysis['summary']['modified_cells'],
  326. help="被修改的单元格数量"
  327. )
  328. with col3:
  329. st.metric(
  330. "影响的行数",
  331. len(diff_analysis['summary']['modified_rows']),
  332. help="包含差异的行数"
  333. )
  334. with col4:
  335. st.metric(
  336. "影响的列数",
  337. len(diff_analysis['summary']['modified_columns']),
  338. help="包含差异的列数"
  339. )
  340. with col5:
  341. added_rows = len(diff_analysis['added_rows'])
  342. removed_rows = len(diff_analysis['removed_rows'])
  343. row_diff = added_rows - removed_rows
  344. st.metric(
  345. "行数变化",
  346. f"+{added_rows}/-{removed_rows}",
  347. delta=row_diff if row_diff != 0 else None
  348. )
  349. def create_detailed_diff_view(diff_analysis: Dict):
  350. """创建详细的差异视图"""
  351. st.subheader("🔍 详细差异分析")
  352. if diff_analysis['summary']['total_differences'] == 0:
  353. st.success("✅ 没有发现任何差异!")
  354. return
  355. # 差异类型选择器
  356. diff_types = []
  357. if diff_analysis['cell_differences']:
  358. diff_types.append("单元格差异")
  359. if diff_analysis['added_rows']:
  360. diff_types.append("新增行")
  361. if diff_analysis['removed_rows']:
  362. diff_types.append("删除行")
  363. if diff_analysis['column_differences']['added_columns']:
  364. diff_types.append("新增列")
  365. if diff_analysis['column_differences']['removed_columns']:
  366. diff_types.append("删除列")
  367. selected_diff_type = st.selectbox("选择要查看的差异类型", diff_types)
  368. # 显示相应的差异详情
  369. if selected_diff_type == "单元格差异":
  370. display_cell_differences(diff_analysis['cell_differences'])
  371. elif selected_diff_type == "新增行":
  372. st.info(f"新增了 {len(diff_analysis['added_rows'])} 行: {diff_analysis['added_rows']}")
  373. elif selected_diff_type == "删除行":
  374. st.warning(f"删除了 {len(diff_analysis['removed_rows'])} 行: {diff_analysis['removed_rows']}")
  375. elif selected_diff_type == "新增列":
  376. st.info(f"新增了列: {diff_analysis['column_differences']['added_columns']}")
  377. elif selected_diff_type == "删除列":
  378. st.warning(f"删除了列: {diff_analysis['column_differences']['removed_columns']}")
  379. def display_cell_differences(cell_differences: List[Dict]):
  380. """显示单元格差异详情"""
  381. if not cell_differences:
  382. return
  383. st.write(f"共发现 {len(cell_differences)} 个单元格差异:")
  384. # 创建差异DataFrame用于显示
  385. diff_data = []
  386. for diff in cell_differences:
  387. diff_data.append({
  388. '位置': f"行{diff['row']}, 列{diff['column']}",
  389. '列名': diff['column'],
  390. '原始值': diff['original_value'],
  391. '新值': diff['new_value'],
  392. '变更类型': diff['change_type']
  393. })
  394. diff_df = pd.DataFrame(diff_data)
  395. # 使用颜色编码的表格
  396. st.dataframe(
  397. diff_df,
  398. use_container_width=True,
  399. height=300,
  400. column_config={
  401. '位置': st.column_config.TextColumn('位置', help='差异的具体位置'),
  402. '列名': st.column_config.TextColumn('列名'),
  403. '原始值': st.column_config.TextColumn('原始值'),
  404. '新值': st.column_config.TextColumn('新值'),
  405. '变更类型': st.column_config.TextColumn('变更类型')
  406. }
  407. )
  408. # 导出差异报告
  409. if st.button("📥 导出差异报告"):
  410. csv_data = diff_df.to_csv(index=False)
  411. st.download_button(
  412. label="下载CSV格式差异报告",
  413. data=csv_data,
  414. file_name="dataframe_diff_report.csv",
  415. mime="text/csv"
  416. )
  417. def create_plotly_diff_heatmap(diff_analysis: Dict, df_shape: Tuple[int, int]):
  418. """创建差异热力图"""
  419. if not diff_analysis['cell_differences']:
  420. return None
  421. # 创建差异矩阵
  422. diff_matrix = np.zeros(df_shape)
  423. for diff in diff_analysis['cell_differences']:
  424. row, col = diff['row'], df_shape[1] - 1 # 简化处理
  425. if diff['change_type'] == 'modified':
  426. diff_matrix[row, col] = 1
  427. elif diff['change_type'] == 'added':
  428. diff_matrix[row, col] = 2
  429. elif diff['change_type'] == 'removed':
  430. diff_matrix[row, col] = 3
  431. fig = go.Figure(data=go.Heatmap(
  432. z=diff_matrix,
  433. colorscale=[[0, 'white'], [0.33, 'yellow'], [0.66, 'green'], [1, 'red']],
  434. showscale=True,
  435. colorbar=dict(
  436. title="差异类型",
  437. tickmode="array",
  438. tickvals=[0, 1, 2, 3],
  439. ticktext=["无差异", "修改", "新增", "删除"]
  440. )
  441. ))
  442. fig.update_layout(
  443. title="DataFrame差异热力图",
  444. xaxis_title="列",
  445. yaxis_title="行",
  446. height=400
  447. )
  448. return fig
  449. # 主函数
  450. def main():
  451. create_dataframe_diff_visualizer()
  452. # 可选:添加热力图视图
  453. if st.checkbox("🔥 显示差异热力图"):
  454. if 'original_df' in st.session_state and 'edited_df' in st.session_state:
  455. diff_analysis = analyze_dataframe_differences(
  456. st.session_state.original_df,
  457. st.session_state.edited_df
  458. )
  459. heatmap_fig = create_plotly_diff_heatmap(
  460. diff_analysis,
  461. st.session_state.original_df.shape
  462. )
  463. if heatmap_fig:
  464. st.plotly_chart(heatmap_fig, use_container_width=True)
  465. if __name__ == "__main__":
  466. main()