# dataframe_diff_v0.2.py
import difflib
import hashlib
import html
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple

import numpy as np
import pandas as pd
import streamlit as st
  9. class ChangeType(Enum):
  10. UNCHANGED = "unchanged"
  11. MODIFIED = "modified"
  12. ADDED = "added"
  13. REMOVED = "removed"
  14. @dataclass
  15. class CellDiff:
  16. row_left: Optional[int]
  17. row_right: Optional[int]
  18. column: str
  19. change_type: ChangeType
  20. old_value: any = None
  21. new_value: any = None
  22. similarity: float = 0.0
  23. @dataclass
  24. class RowDiff:
  25. row_left: Optional[int]
  26. row_right: Optional[int]
  27. change_type: ChangeType
  28. similarity: float = 0.0
  29. cell_diffs: List[CellDiff] = None
  30. class DataFrameDiffAlgorithm:
  31. """类似VSCode的DataFrame差异算法"""
  32. def __init__(self, similarity_threshold: float = 0.7):
  33. self.similarity_threshold = similarity_threshold
  34. def compute_diff(self, df_left: pd.DataFrame, df_right: pd.DataFrame) -> List[RowDiff]:
  35. """计算两个DataFrame的差异,从上到下寻找最匹配的行"""
  36. # 确保列对齐
  37. all_columns = list(set(df_left.columns) | set(df_right.columns))
  38. df_left_aligned = self._align_columns(df_left, all_columns)
  39. df_right_aligned = self._align_columns(df_right, all_columns)
  40. # 计算行相似度矩阵
  41. similarity_matrix = self._compute_similarity_matrix(df_left_aligned, df_right_aligned)
  42. # 从上到下匹配行
  43. row_mappings = self._match_rows_top_down(similarity_matrix)
  44. # 生成差异结果
  45. return self._generate_diff_result(df_left_aligned, df_right_aligned, row_mappings, all_columns)
  46. def _align_columns(self, df: pd.DataFrame, all_columns: List[str]) -> pd.DataFrame:
  47. """对齐DataFrame列"""
  48. aligned_df = df.copy()
  49. for col in all_columns:
  50. if col not in aligned_df.columns:
  51. aligned_df[col] = None
  52. return aligned_df[all_columns]
  53. def _compute_similarity_matrix(self, df_left: pd.DataFrame, df_right: pd.DataFrame) -> np.ndarray:
  54. """计算行之间的相似度矩阵"""
  55. matrix = np.zeros((len(df_left), len(df_right)))
  56. for i in range(len(df_left)):
  57. for j in range(len(df_right)):
  58. matrix[i, j] = self._compute_row_similarity(
  59. df_left.iloc[i], df_right.iloc[j]
  60. )
  61. return matrix
  62. def _compute_row_similarity(self, row1: pd.Series, row2: pd.Series) -> float:
  63. """计算两行的相似度"""
  64. total_cols = len(row1)
  65. if total_cols == 0:
  66. return 1.0
  67. matches = 0
  68. for col in row1.index:
  69. val1, val2 = row1[col], row2[col]
  70. # 处理NaN值
  71. if pd.isna(val1) and pd.isna(val2):
  72. matches += 1
  73. elif pd.isna(val1) or pd.isna(val2):
  74. continue
  75. else:
  76. # 字符串相似度计算
  77. str1, str2 = str(val1), str(val2)
  78. if str1 == str2:
  79. matches += 1
  80. else:
  81. # 使用difflib计算字符串相似度
  82. similarity = difflib.SequenceMatcher(None, str1, str2).ratio()
  83. matches += similarity * 0.5 # 部分匹配给予部分分数
  84. return matches / total_cols
  85. def _match_rows_top_down(self, similarity_matrix: np.ndarray) -> Dict[int, Optional[int]]:
  86. """从上到下匹配行,优先匹配相似度高的行"""
  87. left_rows, right_rows = similarity_matrix.shape
  88. matched_right = set()
  89. row_mappings = {}
  90. # 从上到下处理左侧每一行
  91. for left_idx in range(left_rows):
  92. best_right_idx = None
  93. best_similarity = 0.0
  94. # 在未匹配的右侧行中寻找最佳匹配
  95. for right_idx in range(right_rows):
  96. if right_idx not in matched_right:
  97. similarity = similarity_matrix[left_idx, right_idx]
  98. if similarity > best_similarity and similarity >= self.similarity_threshold:
  99. best_similarity = similarity
  100. best_right_idx = right_idx
  101. if best_right_idx is not None:
  102. row_mappings[left_idx] = best_right_idx
  103. matched_right.add(best_right_idx)
  104. else:
  105. row_mappings[left_idx] = None # 左侧行被删除
  106. return row_mappings
  107. def _generate_diff_result(self, df_left: pd.DataFrame, df_right: pd.DataFrame,
  108. row_mappings: Dict[int, Optional[int]], all_columns: List[str]) -> List[RowDiff]:
  109. """生成差异结果"""
  110. result = []
  111. matched_right_rows = set(row_mappings.values()) - {None}
  112. # 处理匹配的行和删除的行
  113. for left_idx, right_idx in row_mappings.items():
  114. if right_idx is None:
  115. # 删除的行
  116. result.append(RowDiff(
  117. row_left=left_idx,
  118. row_right=None,
  119. change_type=ChangeType.REMOVED,
  120. similarity=0.0
  121. ))
  122. else:
  123. # 匹配的行 - 检查单元格差异
  124. cell_diffs = self._compare_cells(
  125. df_left.iloc[left_idx], df_right.iloc[right_idx],
  126. left_idx, right_idx, all_columns
  127. )
  128. change_type = ChangeType.UNCHANGED
  129. if any(cell.change_type != ChangeType.UNCHANGED for cell in cell_diffs):
  130. change_type = ChangeType.MODIFIED
  131. similarity = self._compute_row_similarity(df_left.iloc[left_idx], df_right.iloc[right_idx])
  132. result.append(RowDiff(
  133. row_left=left_idx,
  134. row_right=right_idx,
  135. change_type=change_type,
  136. similarity=similarity,
  137. cell_diffs=cell_diffs
  138. ))
  139. # 处理新增的行(右侧未匹配的行)
  140. for right_idx in range(len(df_right)):
  141. if right_idx not in matched_right_rows:
  142. result.append(RowDiff(
  143. row_left=None,
  144. row_right=right_idx,
  145. change_type=ChangeType.ADDED,
  146. similarity=0.0
  147. ))
  148. return result
  149. def _compare_cells(self, row_left: pd.Series, row_right: pd.Series,
  150. left_idx: int, right_idx: int, columns: List[str]) -> List[CellDiff]:
  151. """比较单元格差异"""
  152. cell_diffs = []
  153. for col in columns:
  154. val_left = row_left[col] if col in row_left.index else None
  155. val_right = row_right[col] if col in row_right.index else None
  156. # 处理NaN值
  157. if pd.isna(val_left) and pd.isna(val_right):
  158. change_type = ChangeType.UNCHANGED
  159. elif pd.isna(val_left):
  160. change_type = ChangeType.ADDED
  161. elif pd.isna(val_right):
  162. change_type = ChangeType.REMOVED
  163. elif str(val_left) == str(val_right):
  164. change_type = ChangeType.UNCHANGED
  165. else:
  166. change_type = ChangeType.MODIFIED
  167. cell_diffs.append(CellDiff(
  168. row_left=left_idx,
  169. row_right=right_idx,
  170. column=col,
  171. change_type=change_type,
  172. old_value=val_left,
  173. new_value=val_right
  174. ))
  175. return cell_diffs
  176. class VSCodeStyleDataFrameDiff:
  177. """类似VSCode样式的DataFrame差异展示"""
  178. def __init__(self):
  179. self.diff_algorithm = DataFrameDiffAlgorithm()
  180. self._inject_css()
  181. def _inject_css(self):
  182. """注入VSCode风格的CSS样式"""
  183. st.markdown("""
  184. <style>
  185. /* VSCode风格的差异显示 */
  186. .vscode-diff-container {
  187. font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  188. font-size: 12px;
  189. border: 1px solid #3c3c3c;
  190. border-radius: 6px;
  191. overflow: hidden;
  192. }
  193. .diff-header {
  194. background-color: #2d2d30;
  195. color: #cccccc;
  196. padding: 8px 12px;
  197. font-weight: bold;
  198. border-bottom: 1px solid #3c3c3c;
  199. }
  200. .diff-content {
  201. height: 500px;
  202. overflow: auto;
  203. background-color: #1e1e1e;
  204. }
  205. .diff-table {
  206. width: 100%;
  207. border-collapse: collapse;
  208. color: #cccccc;
  209. }
  210. .diff-table th {
  211. background-color: #2d2d30;
  212. border: 1px solid #3c3c3c;
  213. padding: 6px 8px;
  214. text-align: left;
  215. position: sticky;
  216. top: 0;
  217. z-index: 10;
  218. }
  219. .diff-table td {
  220. border: 1px solid #3c3c3c;
  221. padding: 4px 8px;
  222. white-space: nowrap;
  223. position: relative;
  224. }
  225. /* 差异颜色 - VSCode风格 */
  226. .diff-added {
  227. background-color: rgba(22, 160, 133, 0.2) !important;
  228. border-left: 3px solid #16a085 !important;
  229. }
  230. .diff-removed {
  231. background-color: rgba(231, 76, 60, 0.2) !important;
  232. border-left: 3px solid #e74c3c !important;
  233. }
  234. .diff-modified {
  235. background-color: rgba(241, 196, 15, 0.2) !important;
  236. border-left: 3px solid #f1c40f !important;
  237. }
  238. .diff-unchanged {
  239. background-color: transparent;
  240. }
  241. /* 行号样式 */
  242. .line-number {
  243. background-color: #2d2d30;
  244. color: #858585;
  245. text-align: right;
  246. padding: 4px 8px;
  247. border-right: 1px solid #3c3c3c;
  248. user-select: none;
  249. min-width: 40px;
  250. }
  251. /* 悬停效果 */
  252. .diff-table tbody tr:hover {
  253. background-color: rgba(255, 255, 255, 0.05);
  254. }
  255. /* 差异指示器 */
  256. .diff-indicator {
  257. position: absolute;
  258. left: -3px;
  259. top: 0;
  260. bottom: 0;
  261. width: 3px;
  262. }
  263. .indicator-added { background-color: #16a085; }
  264. .indicator-removed { background-color: #e74c3c; }
  265. .indicator-modified { background-color: #f1c40f; }
  266. /* 滚动条样式 */
  267. .diff-content::-webkit-scrollbar {
  268. width: 12px;
  269. height: 12px;
  270. }
  271. .diff-content::-webkit-scrollbar-track {
  272. background: #2d2d30;
  273. }
  274. .diff-content::-webkit-scrollbar-thumb {
  275. background: #555555;
  276. border-radius: 6px;
  277. }
  278. .diff-content::-webkit-scrollbar-thumb:hover {
  279. background: #777777;
  280. }
  281. </style>
  282. """, unsafe_allow_html=True)
  283. def create_diff_view(self):
  284. """创建差异对比视图"""
  285. st.title("📊 VSCode风格 DataFrame 差异对比")
  286. st.markdown("---")
  287. # 初始化数据
  288. if 'df_original' not in st.session_state:
  289. st.session_state.df_original = self._create_sample_data()
  290. if 'df_edited' not in st.session_state:
  291. st.session_state.df_edited = st.session_state.df_original.copy()
  292. # 控制面板
  293. self._create_control_panel()
  294. # 计算差异
  295. row_diffs = self.diff_algorithm.compute_diff(
  296. st.session_state.df_original,
  297. st.session_state.df_edited
  298. )
  299. # 显示统计信息
  300. self._display_diff_statistics(row_diffs)
  301. # 主要对比区域
  302. self._create_main_diff_view(row_diffs)
  303. # 详细差异列表
  304. self._create_detailed_diff_view(row_diffs)
  305. def _create_sample_data(self) -> pd.DataFrame:
  306. """创建示例数据"""
  307. return pd.DataFrame({
  308. 'ID': [1, 2, 3, 4, 5],
  309. 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
  310. 'Age': [25, 30, 35, 40, 45],
  311. 'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
  312. 'Salary': [50000, 60000, 70000, 80000, 90000]
  313. })
  314. def _create_control_panel(self):
  315. """创建控制面板"""
  316. with st.expander("🎛️ 控制面板", expanded=True):
  317. col1, col2, col3, col4 = st.columns(4)
  318. with col1:
  319. if st.button("🔄 重置数据"):
  320. st.session_state.df_original = self._create_sample_data()
  321. st.session_state.df_edited = st.session_state.df_original.copy()
  322. st.rerun()
  323. with col2:
  324. if st.button("🎲 生成随机差异"):
  325. st.session_state.df_edited = self._create_random_diff()
  326. st.rerun()
  327. with col3:
  328. similarity_threshold = st.slider(
  329. "相似度阈值",
  330. min_value=0.1,
  331. max_value=1.0,
  332. value=0.7,
  333. step=0.1
  334. )
  335. self.diff_algorithm.similarity_threshold = similarity_threshold
  336. with col4:
  337. auto_scroll = st.checkbox("🔗 同步滚动", value=True)
  338. def _create_random_diff(self) -> pd.DataFrame:
  339. """创建随机差异用于演示"""
  340. df = st.session_state.df_original.copy()
  341. # 修改一些值
  342. df.loc[1, 'Name'] = 'Robert'
  343. df.loc[2, 'Age'] = 36
  344. df.loc[3, 'City'] = 'Berlin'
  345. # 删除一行
  346. df = df.drop(index=4)
  347. # 新增一行
  348. new_row = pd.DataFrame({
  349. 'ID': [6], 'Name': ['Frank'], 'Age': [28],
  350. 'City': ['Madrid'], 'Salary': [55000]
  351. })
  352. df = pd.concat([df, new_row], ignore_index=True)
  353. return df
  354. def _display_diff_statistics(self, row_diffs: List[RowDiff]):
  355. """显示差异统计"""
  356. stats = self._compute_diff_stats(row_diffs)
  357. col1, col2, col3, col4, col5 = st.columns(5)
  358. with col1:
  359. st.metric("总行数", stats['total_rows'])
  360. with col2:
  361. st.metric("🟢 新增", stats['added_rows'], delta=stats['added_rows'])
  362. with col3:
  363. st.metric("🔴 删除", stats['removed_rows'], delta=-stats['removed_rows'])
  364. with col4:
  365. st.metric("🟡 修改", stats['modified_rows'])
  366. with col5:
  367. st.metric("⚪ 未变", stats['unchanged_rows'])
  368. def _compute_diff_stats(self, row_diffs: List[RowDiff]) -> Dict:
  369. """计算差异统计"""
  370. stats = {
  371. 'total_rows': len(row_diffs),
  372. 'added_rows': 0,
  373. 'removed_rows': 0,
  374. 'modified_rows': 0,
  375. 'unchanged_rows': 0
  376. }
  377. for row_diff in row_diffs:
  378. if row_diff.change_type == ChangeType.ADDED:
  379. stats['added_rows'] += 1
  380. elif row_diff.change_type == ChangeType.REMOVED:
  381. stats['removed_rows'] += 1
  382. elif row_diff.change_type == ChangeType.MODIFIED:
  383. stats['modified_rows'] += 1
  384. else:
  385. stats['unchanged_rows'] += 1
  386. return stats
  387. def _create_main_diff_view(self, row_diffs: List[RowDiff]):
  388. """创建主要差异视图"""
  389. st.subheader("🔍 并排对比")
  390. left_col, right_col = st.columns(2)
  391. with left_col:
  392. st.markdown("### 📝 原始版本")
  393. self._create_diff_table(row_diffs, 'left')
  394. with right_col:
  395. st.markdown("### ✏️ 编辑版本")
  396. self._create_diff_table(row_diffs, 'right')
  397. def _create_diff_table(self, row_diffs: List[RowDiff], side: str):
  398. """创建带差异高亮的表格"""
  399. df = st.session_state.df_original if side == 'left' else st.session_state.df_edited
  400. # 构建HTML表格
  401. html_table = self._build_diff_html_table(row_diffs, side, df)
  402. # 显示表格
  403. st.markdown(f"""
  404. <div class="vscode-diff-container">
  405. <div class="diff-header">
  406. {"原始版本" if side == 'left' else "编辑版本"}
  407. ({len(df)} 行)
  408. </div>
  409. <div class="diff-content">
  410. {html_table}
  411. </div>
  412. </div>
  413. """, unsafe_allow_html=True)
  414. def _build_diff_html_table(self, row_diffs: List[RowDiff], side: str, df: pd.DataFrame) -> str:
  415. """构建差异HTML表格"""
  416. if df.empty:
  417. return "<p>数据为空</p>"
  418. html = '<table class="diff-table">'
  419. # 表头
  420. html += '<thead><tr><th class="line-number">#</th>'
  421. for col in df.columns:
  422. html += f'<th>{col}</th>'
  423. html += '</tr></thead><tbody>'
  424. # 根据差异类型构建行
  425. for i, row_diff in enumerate(row_diffs):
  426. row_idx = row_diff.row_left if side == 'left' else row_diff.row_right
  427. if row_idx is None:
  428. continue # 该侧没有对应行
  429. if row_idx >= len(df):
  430. continue # 超出范围
  431. # 确定行样式
  432. row_class = self._get_row_css_class(row_diff.change_type)
  433. html += f'<tr class="{row_class}">'
  434. html += f'<td class="line-number">{row_idx + 1}</td>'
  435. # 构建单元格
  436. for col in df.columns:
  437. cell_class = row_class
  438. cell_value = df.iloc[row_idx][col]
  439. # 如果是修改的行,检查单元格级别的差异
  440. if row_diff.change_type == ChangeType.MODIFIED and row_diff.cell_diffs:
  441. cell_diff = next((cd for cd in row_diff.cell_diffs if cd.column == col), None)
  442. if cell_diff:
  443. cell_class = self._get_cell_css_class(cell_diff.change_type)
  444. display_value = str(cell_value) if not pd.isna(cell_value) else ""
  445. html += f'<td class="{cell_class}">{display_value}</td>'
  446. html += '</tr>'
  447. html += '</tbody></table>'
  448. return html
  449. def _get_row_css_class(self, change_type: ChangeType) -> str:
  450. """获取行的CSS类"""
  451. mapping = {
  452. ChangeType.ADDED: "diff-added",
  453. ChangeType.REMOVED: "diff-removed",
  454. ChangeType.MODIFIED: "diff-modified",
  455. ChangeType.UNCHANGED: "diff-unchanged"
  456. }
  457. return mapping.get(change_type, "diff-unchanged")
  458. def _get_cell_css_class(self, change_type: ChangeType) -> str:
  459. """获取单元格的CSS类"""
  460. return self._get_row_css_class(change_type)
  461. def _create_detailed_diff_view(self, row_diffs: List[RowDiff]):
  462. """创建详细差异视图"""
  463. st.markdown("---")
  464. st.subheader("📋 详细差异列表")
  465. # 筛选选项
  466. change_types = st.multiselect(
  467. "显示的变更类型",
  468. [ct.value for ct in ChangeType],
  469. default=[ChangeType.ADDED.value, ChangeType.REMOVED.value, ChangeType.MODIFIED.value]
  470. )
  471. filtered_diffs = [
  472. rd for rd in row_diffs
  473. if rd.change_type.value in change_types
  474. ]
  475. if not filtered_diffs:
  476. st.info("没有符合条件的差异")
  477. return
  478. # 显示差异详情
  479. for i, row_diff in enumerate(filtered_diffs):
  480. with st.expander(f"差异 {i+1}: {row_diff.change_type.value.upper()}", expanded=False):
  481. self._display_row_diff_details(row_diff)
  482. def _display_row_diff_details(self, row_diff: RowDiff):
  483. """显示行差异详情"""
  484. col1, col2 = st.columns(2)
  485. with col1:
  486. st.write("**位置信息:**")
  487. st.write(f"- 左侧行: {row_diff.row_left}")
  488. st.write(f"- 右侧行: {row_diff.row_right}")
  489. st.write(f"- 变更类型: {row_diff.change_type.value}")
  490. st.write(f"- 相似度: {row_diff.similarity:.2%}")
  491. with col2:
  492. if row_diff.cell_diffs:
  493. st.write("**单元格差异:**")
  494. cell_diff_data = []
  495. for cell_diff in row_diff.cell_diffs:
  496. if cell_diff.change_type != ChangeType.UNCHANGED:
  497. cell_diff_data.append({
  498. '列': cell_diff.column,
  499. '变更类型': cell_diff.change_type.value,
  500. '原值': str(cell_diff.old_value),
  501. '新值': str(cell_diff.new_value)
  502. })
  503. if cell_diff_data:
  504. st.dataframe(pd.DataFrame(cell_diff_data), use_container_width=True)
  505. def main():
  506. """主函数"""
  507. diff_viewer = VSCodeStyleDataFrameDiff()
  508. diff_viewer.create_diff_view()
  509. if __name__ == "__main__":
  510. main()