| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566 |
- import streamlit as st
- import pandas as pd
- import numpy as np
- from typing import Dict, List, Tuple, Optional
- import plotly.graph_objects as go
- from plotly.subplots import make_subplots
- import plotly.express as px
def create_dataframe_diff_visualizer():
    """Render the whole DataFrame comparison page.

    The page consists of a control panel (reset / randomize / toggles), an
    optional statistics strip, an editable copy of the data on the left, the
    highlighted original on the right, and a detailed difference view below.
    Both frames are kept in ``st.session_state`` under ``original_df`` and
    ``edited_df``.
    """
    st.title("📊 DataFrame可视化比对工具")
    st.markdown("---")

    state = st.session_state

    # Seed the session with demo data the first time the page is rendered.
    if 'original_df' not in state:
        state.original_df = create_sample_data()
    if 'edited_df' not in state:
        state.edited_df = state.original_df.copy()

    # Control panel.
    with st.expander("🎛️ 控制面板", expanded=True):
        reset_col, random_col, sync_col, stats_col = st.columns(4)

        with reset_col:
            reset_clicked = st.button("🔄 重置数据", type="secondary")
            if reset_clicked:
                state.original_df = create_sample_data()
                state.edited_df = state.original_df.copy()
                st.rerun()

        with random_col:
            randomize_clicked = st.button("🎲 生成随机差异", type="secondary")
            if randomize_clicked:
                state.edited_df = create_random_differences(state.original_df)
                st.rerun()

        with sync_col:
            # The checkbox is rendered, but its value is not consumed here.
            st.checkbox("🔗 同步滚动", value=True)

        with stats_col:
            show_stats = st.checkbox("📈 显示统计", value=True)

    # Compare the original against the current edited frame.
    diff_analysis = analyze_dataframe_differences(
        state.original_df,
        state.edited_df,
    )

    if show_stats:
        display_diff_statistics(diff_analysis)

    # Side-by-side comparison area.
    st.subheader("📝 数据比对")
    left_col, right_col = st.columns(2)

    with left_col:
        st.markdown("### 📝 可编辑版本 (左侧)")

        current_frame = state.edited_df
        editor_result = st.data_editor(
            current_frame,
            height=500,
            use_container_width=True,
            num_rows="dynamic",
            key="left_editor",
            column_config=create_column_config(current_frame),
        )

        # Persist user edits and rerun so the diff reflects them.
        if not editor_result.equals(state.edited_df):
            state.edited_df = editor_result.copy()
            st.rerun()

    with right_col:
        st.markdown("### 📊 原始版本 (右侧)")

        # Original data with changed cells highlighted.
        display_dataframe_with_diff_highlighting(
            state.original_df,
            diff_analysis,
            "original",
        )

    # Detailed difference view.
    st.markdown("---")
    create_detailed_diff_view(diff_analysis)
def create_sample_data() -> pd.DataFrame:
    """Build the deterministic 20-row demo product table.

    The values are drawn from a private ``RandomState(42)``, which yields the
    same numbers as seeding NumPy's global legacy generator with 42, but does
    not reseed the process-wide RNG. Reseeding it would make every later
    "random" call in the app repeat the same sequence after each reset.

    Returns:
        A DataFrame with the columns ``ID``, ``Name``, ``Category``,
        ``Price``, ``Stock``, ``Rating`` and ``Active``.
    """
    n_rows = 20
    rng = np.random.RandomState(42)

    # The dict entries are evaluated in order, so the draw order is fixed.
    data = {
        'ID': range(1, n_rows + 1),
        'Name': [f'Product_{i}' for i in range(1, n_rows + 1)],
        'Category': rng.choice(['Electronics', 'Clothing', 'Food', 'Books'], n_rows),
        'Price': np.round(rng.uniform(10, 100, n_rows), 2),
        'Stock': rng.randint(0, 200, n_rows),
        'Rating': np.round(rng.uniform(1, 5, n_rows), 1),
        'Active': rng.choice([True, False], n_rows),
    }
    return pd.DataFrame(data)
def create_random_differences(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a handful of randomly modified cells.

    Between 5 and 14 (row, column) positions are drawn; the first column is
    never touched. Only the demo columns ``Name``, ``Category``, ``Price``,
    ``Stock``, ``Rating`` and ``Active`` are rewritten; a draw that lands on
    any other column is a no-op.

    Cells are addressed by position, so frames whose index is not the default
    ``0..n-1`` range are modified in place rather than being enlarged with new
    labels. Frames with no rows or fewer than two columns are returned as an
    unmodified copy.

    Args:
        df: Source frame; it is not mutated.

    Returns:
        The modified copy.
    """
    modified_df = df.copy()
    n_rows, n_cols = modified_df.shape

    # Nothing can be modified without rows or without a non-ID column.
    if n_rows == 0 or n_cols < 2:
        return modified_df

    num_changes = np.random.randint(5, 15)

    for _ in range(num_changes):
        row_idx = np.random.randint(0, n_rows)
        col_idx = np.random.randint(1, n_cols)  # skip the first (ID) column
        col_name = modified_df.columns[col_idx]

        if col_name == 'Name':
            new_value = f'Modified_{row_idx}'
        elif col_name == 'Category':
            new_value = np.random.choice(['Modified_Cat', 'New_Category'])
        elif col_name == 'Price':
            new_value = np.round(np.random.uniform(10, 150), 2)
        elif col_name == 'Stock':
            new_value = np.random.randint(0, 300)
        elif col_name == 'Rating':
            new_value = np.round(np.random.uniform(1, 5), 1)
        elif col_name == 'Active':
            new_value = not modified_df.iat[row_idx, col_idx]
        else:
            continue

        # Positional write: row_idx is a position, not an index label.
        modified_df.iat[row_idx, col_idx] = new_value

    return modified_df
def analyze_dataframe_differences(df1: pd.DataFrame, df2: pd.DataFrame) -> Dict:
    """Compare two DataFrames position by position.

    Rows are matched by position (not by index label) and columns by label.
    Only the columns present in both frames and the first
    ``min(len(df1), len(df2))`` rows are compared cell by cell; surplus rows
    are reported as added or removed row positions.

    Column lists keep the order in which the columns appear in the frames, so
    the result is deterministic from run to run.

    Args:
        df1: The original frame.
        df2: The new / edited frame.

    Returns:
        A dict with the keys ``cell_differences`` (list of dicts with ``row``,
        ``column``, ``original_value``, ``new_value``, ``change_type``),
        ``added_rows``, ``removed_rows``, ``column_differences`` (with
        ``added_columns`` / ``removed_columns``) and ``summary`` (with
        ``total_differences``, ``modified_cells``, ``modified_rows`` and
        ``modified_columns``; the last two are sets).
    """
    if df1.shape != df2.shape:
        st.warning("⚠️ 两个DataFrame的形状不匹配!")

    labels1 = list(dict.fromkeys(df1.columns))
    labels2 = list(dict.fromkeys(df2.columns))
    in_df1, in_df2 = set(labels1), set(labels2)
    common_columns = [col for col in labels1 if col in in_df2]

    differences = {
        'cell_differences': [],
        'added_rows': [],
        'removed_rows': [],
        'column_differences': {
            'added_columns': [col for col in labels2 if col not in in_df1],
            'removed_columns': [col for col in labels1 if col not in in_df2],
        },
        'summary': {
            'total_differences': 0,
            'modified_cells': 0,
            'modified_rows': set(),
            'modified_columns': set(),
        },
    }
    summary = differences['summary']

    # Pull each shared column out once instead of building a row per cell.
    left = {col: df1[col] for col in common_columns}
    right = {col: df2[col] for col in common_columns}

    min_rows = min(len(df1), len(df2))

    for row_idx in range(min_rows):
        for col in common_columns:
            try:
                val1 = left[col].iat[row_idx]
                val2 = right[col].iat[row_idx]
                missing1, missing2 = pd.isna(val1), pd.isna(val2)

                # Two missing values count as equal.
                if missing1 and missing2:
                    continue

                changed = bool(missing1 or missing2 or val1 != val2)
            except (TypeError, ValueError) as e:
                # E.g. array-valued cells whose comparison has no single truth value.
                st.warning(f"比较时出错 (行{row_idx}, 列{col}): {e}")
                continue

            if not changed:
                continue

            differences['cell_differences'].append({
                'row': row_idx,
                'column': col,
                'original_value': val1,
                'new_value': val2,
                'change_type': determine_change_type(val1, val2),
            })
            summary['modified_cells'] += 1
            summary['modified_rows'].add(row_idx)
            summary['modified_columns'].add(col)

    # Rows beyond the shared range exist in only one of the frames.
    if len(df1) > len(df2):
        differences['removed_rows'] = list(range(len(df2), len(df1)))
    elif len(df2) > len(df1):
        differences['added_rows'] = list(range(len(df1), len(df2)))

    summary['total_differences'] = (
        summary['modified_cells']
        + len(differences['added_rows'])
        + len(differences['removed_rows'])
        + len(differences['column_differences']['added_columns'])
        + len(differences['column_differences']['removed_columns'])
    )

    return differences
def determine_change_type(val1, val2) -> str:
    """Classify a cell change from the old value ``val1`` to ``val2``.

    Returns ``"added"`` when the old value is missing, ``"removed"`` when only
    the new value is missing, and ``"modified"`` otherwise. The old value is
    checked first, so two missing values yield ``"added"``.
    """
    if pd.isna(val1):
        return "added"
    return "removed" if pd.isna(val2) else "modified"
def create_column_config(df: pd.DataFrame) -> Dict:
    """Build the ``column_config`` mapping passed to ``st.data_editor``.

    Boolean columns get a checkbox column, integer and floating-point columns
    get a number column (``%d`` / ``%.2f`` format respectively), and
    everything else gets a text column limited to 100 characters.

    Dtypes are classified with ``pandas.api.types`` rather than compared with
    the literal names ``int64`` / ``float64`` / ``bool``, so narrower and
    nullable dtypes (``int32``, ``float32``, ``Int64``, ``boolean``, ...) are
    recognised as well.

    Args:
        df: Frame whose columns are to be configured.

    Returns:
        A dict mapping each column label to its Streamlit column config.
    """
    dtype_checks = pd.api.types
    config = {}

    for col in df.columns:
        dtype = df[col].dtype

        if dtype_checks.is_bool_dtype(dtype):
            config[col] = st.column_config.CheckboxColumn(
                col,
                help=f"布尔列: {col}",
            )
        elif dtype_checks.is_float_dtype(dtype) or dtype_checks.is_integer_dtype(dtype):
            number_format = "%.2f" if dtype_checks.is_float_dtype(dtype) else "%d"
            config[col] = st.column_config.NumberColumn(
                col,
                help=f"数值列: {col}",
                format=number_format,
            )
        else:
            config[col] = st.column_config.TextColumn(
                col,
                help=f"文本列: {col}",
                max_chars=100,
            )

    return config
# Stylesheet for the highlighted diff table. Every highlighted cell class is
# `position: relative` so that the absolutely positioned `.diff-marker` dot is
# anchored to its own cell instead of to some outer page element.
_DIFF_TABLE_CSS = """
<style>
.diff-table {
    height: 500px;
    overflow: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 12px;
}

.diff-table table {
    width: 100%;
    border-collapse: collapse;
    margin: 0;
}

.diff-table th {
    background-color: #f5f5f5;
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
    position: sticky;
    top: 0;
    z-index: 10;
}

.diff-table td {
    border: 1px solid #ddd;
    padding: 8px;
    white-space: nowrap;
}

/* 差异高亮样式 */
.cell-modified {
    background-color: #fff3cd !important;
    border: 2px solid #ffc107 !important;
    position: relative;
}

.cell-added {
    background-color: #d4edda !important;
    border: 2px solid #28a745 !important;
    position: relative;
}

.cell-removed {
    background-color: #f8d7da !important;
    border: 2px solid #dc3545 !important;
    position: relative;
}

.row-highlight {
    background-color: #f8f9fa !important;
}

/* 悬停效果 */
.diff-table td:hover {
    background-color: #e3f2fd !important;
    cursor: pointer;
}

/* 差异标记 */
.diff-marker {
    position: absolute;
    top: 2px;
    right: 2px;
    width: 8px;
    height: 8px;
    border-radius: 50%;
}

.marker-modified { background-color: #ffc107; }
.marker-added { background-color: #28a745; }
.marker-removed { background-color: #dc3545; }
</style>
"""


def display_dataframe_with_diff_highlighting(df: pd.DataFrame, diff_analysis: Dict, view_type: str):
    """Render ``df`` as a scrollable HTML table with changed cells highlighted.

    Args:
        df: The frame to display.
        diff_analysis: Result of ``analyze_dataframe_differences``.
        view_type: Label of the view being rendered; passed through unchanged
            to ``create_styled_diff_table``.
    """
    html_table = create_styled_diff_table(df, diff_analysis, view_type)

    # Inject the stylesheet, then the table wrapped in its scroll container.
    st.markdown(_DIFF_TABLE_CSS, unsafe_allow_html=True)
    st.markdown(f'<div class="diff-table">{html_table}</div>', unsafe_allow_html=True)
def create_styled_diff_table(df: pd.DataFrame, diff_analysis: Dict, view_type: str) -> str:
    """Build the HTML ``<table>`` markup for ``df`` with diff highlighting.

    Cells listed in ``diff_analysis['cell_differences']`` get a CSS class and
    a marker dot according to their change type; rows listed in
    ``diff_analysis['summary']['modified_rows']`` get the ``row-highlight``
    class. Missing values are rendered as empty cells.

    Column labels and cell values are HTML-escaped, because the markup is
    rendered with ``unsafe_allow_html=True`` and the data is user-editable.

    Args:
        df: The frame to render.
        diff_analysis: Result of ``analyze_dataframe_differences``.
        view_type: Currently unused; kept for interface compatibility.

    Returns:
        The table markup as a single string.
    """
    from html import escape

    # change type -> (cell class, marker class)
    classes_by_change = {
        "modified": ("cell-modified", "marker-modified"),
        "added": ("cell-added", "marker-added"),
        "removed": ("cell-removed", "marker-removed"),
    }

    diff_map = {
        (diff['row'], diff['column']): diff
        for diff in diff_analysis['cell_differences']
    }
    modified_rows = diff_analysis['summary']['modified_rows']
    columns = list(df.columns)

    # Collect fragments and join once instead of growing a string.
    parts = ['<table>', '<tr>']
    parts.extend(f'<th>{escape(str(col))}</th>' for col in columns)
    parts.append('</tr>')

    for row_idx, row_values in enumerate(df.itertuples(index=False, name=None)):
        row_class = "row-highlight" if row_idx in modified_rows else ""
        parts.append(f'<tr class="{row_class}">')

        for col, value in zip(columns, row_values):
            cell_class, marker_class = "", ""
            diff_info = diff_map.get((row_idx, col))
            if diff_info is not None:
                cell_class, marker_class = classes_by_change.get(
                    diff_info['change_type'], ("", "")
                )

            display_value = "" if pd.isna(value) else escape(str(value))
            col_label = escape(str(col))

            parts.append(
                f'<td class="{cell_class}" title="行{row_idx}, 列{col_label}: {display_value}">'
            )
            if marker_class:
                parts.append(f'<div class="diff-marker {marker_class}"></div>')
            parts.append(display_value)
            parts.append('</td>')

        parts.append('</tr>')

    parts.append('</table>')
    return ''.join(parts)
def display_diff_statistics(diff_analysis: Dict):
    """Show the headline difference counts as a row of five metrics.

    Args:
        diff_analysis: Result of ``analyze_dataframe_differences``.
    """
    st.subheader("📈 差异统计")

    summary = diff_analysis['summary']
    *count_columns, row_change_column = st.columns(5)

    # (label, value, help text) for the four plain counters, left to right.
    counters = [
        ("总差异数", summary['total_differences'], "所有类型的差异总数"),
        ("修改的单元格", summary['modified_cells'], "被修改的单元格数量"),
        ("影响的行数", len(summary['modified_rows']), "包含差异的行数"),
        ("影响的列数", len(summary['modified_columns']), "包含差异的列数"),
    ]
    for column, (label, value, help_text) in zip(count_columns, counters):
        with column:
            st.metric(label, value, help=help_text)

    with row_change_column:
        n_added = len(diff_analysis['added_rows'])
        n_removed = len(diff_analysis['removed_rows'])
        net_change = n_added - n_removed
        # No delta is shown when the row count is unchanged.
        delta = net_change if net_change != 0 else None
        st.metric("行数变化", f"+{n_added}/-{n_removed}", delta=delta)
def create_detailed_diff_view(diff_analysis: Dict):
    """Render the drill-down section for one selected kind of difference.

    Shows a success message and stops when there are no differences at all.
    Otherwise a select box offers only the difference kinds that actually
    occurred, and the details of the chosen kind are displayed.

    Args:
        diff_analysis: Result of ``analyze_dataframe_differences``.
    """
    st.subheader("🔍 详细差异分析")

    if diff_analysis['summary']['total_differences'] == 0:
        st.success("✅ 没有发现任何差异!")
        return

    cell_diffs = diff_analysis['cell_differences']
    added_rows = diff_analysis['added_rows']
    removed_rows = diff_analysis['removed_rows']
    added_columns = diff_analysis['column_differences']['added_columns']
    removed_columns = diff_analysis['column_differences']['removed_columns']

    # Label -> (items, renderer), in the order the options are offered.
    views = {
        "单元格差异": (
            cell_diffs,
            lambda: display_cell_differences(cell_diffs),
        ),
        "新增行": (
            added_rows,
            lambda: st.info(f"新增了 {len(added_rows)} 行: {added_rows}"),
        ),
        "删除行": (
            removed_rows,
            lambda: st.warning(f"删除了 {len(removed_rows)} 行: {removed_rows}"),
        ),
        "新增列": (
            added_columns,
            lambda: st.info(f"新增了列: {added_columns}"),
        ),
        "删除列": (
            removed_columns,
            lambda: st.warning(f"删除了列: {removed_columns}"),
        ),
    }

    diff_types = [label for label, (items, _) in views.items() if items]
    selected_diff_type = st.selectbox("选择要查看的差异类型", diff_types)

    if selected_diff_type in views:
        _, render = views[selected_diff_type]
        render()
def display_cell_differences(cell_differences: List[Dict]):
    """Show the per-cell differences as a table with a CSV download.

    Does nothing when the list is empty.

    The download button is rendered directly rather than behind a second
    ``st.button``. A plain button is only true for the single rerun that
    follows the click, so a download button nested under it needs two clicks
    and disappears again on the next rerun.

    Args:
        cell_differences: Entries of ``diff_analysis['cell_differences']``,
            each with ``row``, ``column``, ``original_value``, ``new_value``
            and ``change_type``.
    """
    if not cell_differences:
        return

    st.write(f"共发现 {len(cell_differences)} 个单元格差异:")

    # Flatten the difference records into a display frame.
    diff_df = pd.DataFrame([
        {
            '位置': f"行{diff['row']}, 列{diff['column']}",
            '列名': diff['column'],
            '原始值': diff['original_value'],
            '新值': diff['new_value'],
            '变更类型': diff['change_type'],
        }
        for diff in cell_differences
    ])

    st.dataframe(
        diff_df,
        use_container_width=True,
        height=300,
        column_config={
            '位置': st.column_config.TextColumn('位置', help='差异的具体位置'),
            '列名': st.column_config.TextColumn('列名'),
            '原始值': st.column_config.TextColumn('原始值'),
            '新值': st.column_config.TextColumn('新值'),
            '变更类型': st.column_config.TextColumn('变更类型'),
        },
    )

    # Export the difference report.
    st.download_button(
        label="下载CSV格式差异报告",
        data=diff_df.to_csv(index=False),
        file_name="dataframe_diff_report.csv",
        mime="text/csv",
    )
# Numeric code used in the heatmap matrix for each change type (0 = unchanged).
_CHANGE_CODES = {'modified': 1, 'added': 2, 'removed': 3}


def _build_diff_matrix(diff_analysis: Dict, df_shape: Tuple[int, int],
                       columns: Optional[List] = None) -> np.ndarray:
    """Return a ``df_shape`` matrix holding the change code of every cell."""
    diff_matrix = np.zeros(df_shape)
    n_rows, n_cols = df_shape

    column_positions = {}
    if columns is not None:
        column_positions = {label: pos for pos, label in enumerate(columns)}

    for diff in diff_analysis['cell_differences']:
        code = _CHANGE_CODES.get(diff['change_type'])
        if code is None:
            continue

        row = diff['row']
        # Labels that cannot be resolved fall back to the last column.
        col = column_positions.get(diff['column'], n_cols - 1)

        if 0 <= row < n_rows and 0 <= col < n_cols:
            diff_matrix[row, col] = code

    return diff_matrix


def create_plotly_diff_heatmap(diff_analysis: Dict, df_shape: Tuple[int, int],
                               columns: Optional[List] = None):
    """Create a heatmap figure showing where the two frames differ.

    Args:
        diff_analysis: Result of ``analyze_dataframe_differences``.
        df_shape: ``(rows, columns)`` of the frame the heatmap represents.
        columns: Column labels of that frame, in order. They are needed to
            place each difference in its own column. When omitted, every
            difference is drawn in the last column, which is what this
            function did before the parameter existed.

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when there are no cell
        differences.
    """
    if not diff_analysis['cell_differences']:
        return None

    diff_matrix = _build_diff_matrix(diff_analysis, df_shape, columns)

    heatmap_kwargs = {}
    if columns is not None and len(columns) == df_shape[1]:
        heatmap_kwargs['x'] = [str(label) for label in columns]

    fig = go.Figure(data=go.Heatmap(
        z=diff_matrix,
        # Pin the colour range to the full code range. With auto-ranging the
        # largest code present would always be drawn in the "removed" colour.
        zmin=0,
        zmax=3,
        colorscale=[[0, 'white'], [1 / 3, 'yellow'], [2 / 3, 'green'], [1, 'red']],
        showscale=True,
        colorbar=dict(
            title="差异类型",
            tickmode="array",
            tickvals=[0, 1, 2, 3],
            ticktext=["无差异", "修改", "新增", "删除"]
        ),
        **heatmap_kwargs
    ))

    fig.update_layout(
        title="DataFrame差异热力图",
        xaxis_title="列",
        yaxis_title="行",
        height=400,
        # Draw row 0 at the top so the heatmap reads like the table.
        yaxis=dict(autorange="reversed")
    )

    return fig


# Main entry point
def main():
    """Render the comparison page and, on request, the difference heatmap."""
    create_dataframe_diff_visualizer()

    # Optional heatmap view.
    if st.checkbox("🔥 显示差异热力图"):
        if 'original_df' in st.session_state and 'edited_df' in st.session_state:
            original_df = st.session_state.original_df

            diff_analysis = analyze_dataframe_differences(
                original_df,
                st.session_state.edited_df
            )

            heatmap_fig = create_plotly_diff_heatmap(
                diff_analysis,
                original_df.shape,
                columns=list(original_df.columns)
            )

            if heatmap_fig:
                st.plotly_chart(heatmap_fig, use_container_width=True)


if __name__ == "__main__":
    main()
|