#!/usr/bin/env python3
"""
streamlit_ocr_validator.py

Streamlit-based OCR visual validation tool (fixed version).
Provides rich interactive components and a better user experience.
"""
import base64
import json
import re
from html import unescape
from io import BytesIO, StringIO
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from plotly.subplots import make_subplots
  18. # 设置页面配置
  19. st.set_page_config(
  20. page_title="OCR可视化校验工具",
  21. page_icon="🔍",
  22. layout="wide",
  23. initial_sidebar_state="expanded"
  24. )
  25. # 自定义CSS样式
  26. st.markdown("""
  27. <style>
  28. .main > div {
  29. padding-top: 2rem;
  30. }
  31. .stSelectbox > div > div > div {
  32. background-color: #f0f2f6;
  33. }
  34. .clickable-text {
  35. background-color: #e1f5fe;
  36. padding: 2px 6px;
  37. border-radius: 4px;
  38. border: 1px solid #0288d1;
  39. cursor: pointer;
  40. margin: 2px;
  41. display: inline-block;
  42. }
  43. .selected-text {
  44. background-color: #fff3e0;
  45. border-color: #ff9800;
  46. font-weight: bold;
  47. }
  48. .error-text {
  49. background-color: #ffebee;
  50. border-color: #f44336;
  51. color: #d32f2f;
  52. }
  53. .stats-container {
  54. background-color: #f8f9fa;
  55. padding: 1rem;
  56. border-radius: 8px;
  57. border-left: 4px solid #28a745;
  58. }
  59. </style>
  60. """, unsafe_allow_html=True)
  61. class StreamlitOCRValidator:
  62. def __init__(self):
  63. self.ocr_data = []
  64. self.md_content = ""
  65. self.image_path = ""
  66. self.text_bbox_mapping = {}
  67. self.selected_text = None
  68. self.marked_errors = set()
  69. def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
  70. """加载OCR相关数据"""
  71. json_file = Path(json_path)
  72. # 加载JSON数据
  73. try:
  74. with open(json_file, 'r', encoding='utf-8') as f:
  75. data = json.load(f)
  76. # 确保数据是列表格式
  77. if isinstance(data, list):
  78. self.ocr_data = data
  79. elif isinstance(data, dict) and 'results' in data:
  80. self.ocr_data = data['results']
  81. else:
  82. st.error(f"❌ 不支持的JSON格式: {json_path}")
  83. return
  84. except Exception as e:
  85. st.error(f"❌ 加载JSON文件失败: {e}")
  86. return
  87. # 推断MD文件路径
  88. if md_path is None:
  89. md_file = json_file.with_suffix('.md')
  90. else:
  91. md_file = Path(md_path)
  92. if md_file.exists():
  93. with open(md_file, 'r', encoding='utf-8') as f:
  94. self.md_content = f.read()
  95. # 推断图片路径
  96. if image_path is None:
  97. image_name = json_file.stem
  98. sample_data_dir = Path("./sample_data")
  99. image_candidates = [
  100. sample_data_dir / f"{image_name}.png",
  101. sample_data_dir / f"{image_name}.jpg",
  102. json_file.parent / f"{image_name}.png",
  103. json_file.parent / f"{image_name}.jpg",
  104. ]
  105. for candidate in image_candidates:
  106. if candidate.exists():
  107. self.image_path = str(candidate)
  108. break
  109. else:
  110. self.image_path = image_path
  111. # 处理数据
  112. self.process_data()
  113. def process_data(self):
  114. """处理OCR数据,建立文本到bbox的映射"""
  115. self.text_bbox_mapping = {}
  116. # 确保 ocr_data 是列表
  117. if not isinstance(self.ocr_data, list):
  118. st.warning("⚠️ OCR数据格式不正确,期望列表格式")
  119. return
  120. for i, item in enumerate(self.ocr_data):
  121. # 确保 item 是字典类型
  122. if not isinstance(item, dict):
  123. continue
  124. if 'text' in item and 'bbox' in item:
  125. text = str(item['text']).strip()
  126. if text and text not in ['Picture', '']:
  127. bbox = item['bbox']
  128. # 确保bbox是4个数字的列表
  129. if isinstance(bbox, list) and len(bbox) == 4:
  130. if text not in self.text_bbox_mapping:
  131. self.text_bbox_mapping[text] = []
  132. self.text_bbox_mapping[text].append({
  133. 'bbox': bbox,
  134. 'category': item.get('category', 'Text'),
  135. 'index': i,
  136. 'confidence': item.get('confidence', 1.0)
  137. })
  138. def draw_bbox_on_image(self, image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
  139. """在图片上绘制bbox框"""
  140. img_copy = image.copy()
  141. draw = ImageDraw.Draw(img_copy)
  142. x1, y1, x2, y2 = bbox
  143. # 绘制矩形框
  144. draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
  145. # 添加半透明填充
  146. overlay = Image.new('RGBA', img_copy.size, (0, 0, 0, 0))
  147. overlay_draw = ImageDraw.Draw(overlay)
  148. if color == "red":
  149. fill_color = (255, 0, 0, 30)
  150. elif color == "blue":
  151. fill_color = (0, 0, 255, 30)
  152. elif color == "green":
  153. fill_color = (0, 255, 0, 30)
  154. else:
  155. fill_color = (255, 255, 0, 30)
  156. overlay_draw.rectangle([x1, y1, x2, y2], fill=fill_color)
  157. img_copy = Image.alpha_composite(img_copy.convert('RGBA'), overlay).convert('RGB')
  158. return img_copy
  159. def create_interactive_plot(self, image: Image.Image, selected_bbox: Optional[List[int]] = None) -> go.Figure:
  160. """创建交互式图片显示"""
  161. fig = go.Figure()
  162. # 添加图片
  163. fig.add_layout_image(
  164. dict(
  165. source=image,
  166. xref="x",
  167. yref="y",
  168. x=0,
  169. y=image.height,
  170. sizex=image.width,
  171. sizey=image.height,
  172. sizing="stretch",
  173. opacity=1.0,
  174. layer="below"
  175. )
  176. )
  177. # 添加所有bbox(浅色显示)
  178. for text, info_list in self.text_bbox_mapping.items():
  179. for info in info_list:
  180. bbox = info['bbox']
  181. if len(bbox) >= 4: # 确保bbox有足够的坐标
  182. x1, y1, x2, y2 = bbox[:4]
  183. color = "rgba(0, 100, 200, 0.2)" # 默认浅蓝色
  184. if text in self.marked_errors:
  185. color = "rgba(255, 0, 0, 0.3)" # 错误标记为红色
  186. fig.add_shape(
  187. type="rect",
  188. x0=x1, y0=image.height-y2,
  189. x1=x2, y1=image.height-y1,
  190. line=dict(color=color.replace('0.2', '0.8').replace('0.3', '1.0'), width=1),
  191. fillcolor=color,
  192. )
  193. # 高亮显示选中的bbox
  194. if selected_bbox and len(selected_bbox) >= 4:
  195. x1, y1, x2, y2 = selected_bbox[:4]
  196. fig.add_shape(
  197. type="rect",
  198. x0=x1, y0=image.height-y2,
  199. x1=x2, y1=image.height-y1,
  200. line=dict(color="red", width=3),
  201. fillcolor="rgba(255, 0, 0, 0.2)",
  202. )
  203. # 设置布局
  204. fig.update_xaxes(
  205. visible=False,
  206. range=[0, image.width]
  207. )
  208. fig.update_yaxes(
  209. visible=False,
  210. range=[0, image.height],
  211. scaleanchor="x"
  212. )
  213. fig.update_layout(
  214. width=800,
  215. height=600,
  216. margin=dict(l=0, r=0, t=0, b=0),
  217. xaxis_showgrid=False,
  218. yaxis_showgrid=False,
  219. plot_bgcolor='white'
  220. )
  221. return fig
  222. def get_statistics(self) -> Dict:
  223. """获取统计信息"""
  224. # 先确保 ocr_data 不为空且是列表
  225. if not isinstance(self.ocr_data, list) or not self.ocr_data:
  226. return {
  227. 'total_texts': 0,
  228. 'clickable_texts': 0,
  229. 'marked_errors': 0,
  230. 'categories': {},
  231. 'accuracy_rate': 0
  232. }
  233. total_texts = len(self.ocr_data)
  234. clickable_texts = len(self.text_bbox_mapping)
  235. marked_errors = len(self.marked_errors)
  236. # 按类别统计 - 添加类型检查
  237. categories = {}
  238. for item in self.ocr_data:
  239. # 确保 item 是字典类型
  240. if isinstance(item, dict):
  241. category = item.get('category', 'Unknown')
  242. elif isinstance(item, str):
  243. category = 'Text' # 字符串类型默认为 Text 类别
  244. else:
  245. category = 'Unknown'
  246. categories[category] = categories.get(category, 0) + 1
  247. return {
  248. 'total_texts': total_texts,
  249. 'clickable_texts': clickable_texts,
  250. 'marked_errors': marked_errors,
  251. 'categories': categories,
  252. 'accuracy_rate': (clickable_texts - marked_errors) / clickable_texts * 100 if clickable_texts > 0 else 0
  253. }
  254. def convert_html_table_to_markdown(self, content: str) -> str:
  255. """将HTML表格转换为Markdown表格格式"""
  256. import re
  257. from html import unescape
  258. # 简单的HTML表格到Markdown转换
  259. def replace_table(match):
  260. table_html = match.group(0)
  261. # 提取所有行
  262. rows = re.findall(r'<tr>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
  263. if not rows:
  264. return table_html # 如果没有找到行,返回原始内容
  265. markdown_rows = []
  266. for i, row in enumerate(rows):
  267. # 提取单元格
  268. cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.DOTALL | re.IGNORECASE)
  269. if cells:
  270. # 清理单元格内容
  271. clean_cells = []
  272. for cell in cells:
  273. # 移除HTML标签,保留文本
  274. cell_text = re.sub(r'<[^>]+>', '', cell).strip()
  275. cell_text = unescape(cell_text) # 解码HTML实体
  276. clean_cells.append(cell_text)
  277. # 构建Markdown行
  278. markdown_row = '| ' + ' | '.join(clean_cells) + ' |'
  279. markdown_rows.append(markdown_row)
  280. # 在第一行后添加分隔符
  281. if i == 0:
  282. separator = '| ' + ' | '.join(['---'] * len(clean_cells)) + ' |'
  283. markdown_rows.append(separator)
  284. return '\n'.join(markdown_rows) if markdown_rows else table_html
  285. # 替换所有HTML表格
  286. converted = re.sub(r'<table[^>]*>.*?</table>', replace_table, content, flags=re.DOTALL | re.IGNORECASE)
  287. return converted
  288. def render_markdown_with_options(self, markdown_content: str, table_format: str = "grid", escape_html: bool = True):
  289. """自定义Markdown渲染方法,支持多种选项"""
  290. import markdown
  291. # 处理HTML表格
  292. if escape_html:
  293. markdown_content = self.convert_html_table_to_markdown(markdown_content)
  294. # 渲染Markdown
  295. html_content = markdown.markdown(markdown_content)
  296. # 根据选项包裹在特定的HTML结构中
  297. if table_format == "grid":
  298. # 网格布局
  299. wrapped_content = f"""
  300. <div class="markdown-grid">
  301. {html_content}
  302. </div>
  303. """
  304. elif table_format == "list":
  305. # 列表布局
  306. wrapped_content = f"""
  307. <div class="markdown-list">
  308. {html_content}
  309. </div>
  310. """
  311. else:
  312. # 默认直接返回
  313. wrapped_content = html_content
  314. return wrapped_content
  315. def display_html_table_as_dataframe(self, html_content: str, enable_editing: bool = False):
  316. """将HTML表格解析为DataFrame显示"""
  317. import pandas as pd
  318. from io import StringIO, BytesIO
  319. try:
  320. # 使用pandas直接读取HTML表格
  321. tables = pd.read_html(StringIO(html_content))
  322. if tables:
  323. for i, table in enumerate(tables):
  324. st.subheader(f"📊 表格 {i+1}")
  325. # 创建表格操作按钮
  326. col1, col2, col3, col4 = st.columns(4)
  327. with col1:
  328. show_info = st.checkbox(f"显示表格信息", key=f"info_{i}")
  329. with col2:
  330. show_stats = st.checkbox(f"显示统计信息", key=f"stats_{i}")
  331. with col3:
  332. enable_filter = st.checkbox(f"启用过滤", key=f"filter_{i}")
  333. with col4:
  334. enable_sort = st.checkbox(f"启用排序", key=f"sort_{i}")
  335. # 数据过滤
  336. filtered_table = table.copy()
  337. if enable_filter and not table.empty:
  338. filter_col = st.selectbox(
  339. f"选择过滤列 (表格 {i+1})",
  340. options=['无'] + list(table.columns),
  341. key=f"filter_col_{i}"
  342. )
  343. if filter_col != '无':
  344. filter_value = st.text_input(
  345. f"过滤值 (表格 {i+1})",
  346. key=f"filter_value_{i}"
  347. )
  348. if filter_value:
  349. filtered_table = table[
  350. table[filter_col].astype(str).str.contains(filter_value, na=False)
  351. ]
  352. # 数据排序
  353. if enable_sort and not filtered_table.empty:
  354. sort_col = st.selectbox(
  355. f"选择排序列 (表格 {i+1})",
  356. options=['无'] + list(filtered_table.columns),
  357. key=f"sort_col_{i}"
  358. )
  359. if sort_col != '无':
  360. sort_order = st.radio(
  361. f"排序方式 (表格 {i+1})",
  362. options=['升序', '降序'],
  363. horizontal=True,
  364. key=f"sort_order_{i}"
  365. )
  366. ascending = (sort_order == '升序')
  367. filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
  368. # 显示表格
  369. if enable_editing:
  370. # 可编辑表格
  371. edited_table = st.data_editor(
  372. filtered_table,
  373. use_container_width=True,
  374. key=f"editor_{i}"
  375. )
  376. # 检查是否有编辑
  377. if not edited_table.equals(filtered_table):
  378. st.success("✏️ 表格已编辑,可以导出修改后的数据")
  379. else:
  380. # 只读表格
  381. st.dataframe(filtered_table, use_container_width=True)
  382. # 显示表格信息
  383. if show_info:
  384. st.write(f"**表格信息:**")
  385. st.write(f"- 原始行数: {len(table)}")
  386. st.write(f"- 过滤后行数: {len(filtered_table)}")
  387. st.write(f"- 列数: {len(table.columns)}")
  388. st.write(f"- 列名: {', '.join(table.columns)}")
  389. # 显示统计信息
  390. if show_stats:
  391. st.write(f"**统计信息:**")
  392. numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
  393. if len(numeric_cols) > 0:
  394. st.dataframe(filtered_table[numeric_cols].describe())
  395. else:
  396. st.info("表格中没有数值列")
  397. # 导出功能
  398. if st.button(f"📥 导出表格 {i+1}", key=f"export_{i}"):
  399. # 创建CSV数据
  400. csv_data = filtered_table.to_csv(index=False)
  401. st.download_button(
  402. label=f"下载CSV (表格 {i+1})",
  403. data=csv_data,
  404. file_name=f"table_{i+1}.csv",
  405. mime="text/csv",
  406. key=f"download_csv_{i}"
  407. )
  408. # 创建Excel数据
  409. excel_buffer = BytesIO()
  410. filtered_table.to_excel(excel_buffer, index=False)
  411. st.download_button(
  412. label=f"下载Excel (表格 {i+1})",
  413. data=excel_buffer.getvalue(),
  414. file_name=f"table_{i+1}.xlsx",
  415. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  416. key=f"download_excel_{i}"
  417. )
  418. st.markdown("---")
  419. else:
  420. st.warning("未找到可解析的表格")
  421. except Exception as e:
  422. st.error(f"表格解析失败: {e}")
  423. st.info("尝试使用HTML渲染模式查看表格")
  424. # 回退到HTML渲染
  425. st.markdown(html_content, unsafe_allow_html=True)
  426. def main():
  427. """主应用"""
  428. st.title("🔍 OCR可视化校验工具")
  429. st.markdown("---")
  430. # 初始化session state
  431. if 'validator' not in st.session_state:
  432. st.session_state.validator = StreamlitOCRValidator()
  433. if 'selected_text' not in st.session_state:
  434. st.session_state.selected_text = None
  435. if 'marked_errors' not in st.session_state:
  436. st.session_state.marked_errors = set()
  437. # 同步标记的错误到validator
  438. st.session_state.validator.marked_errors = st.session_state.marked_errors
  439. # 侧边栏 - 文件选择和控制
  440. with st.sidebar:
  441. st.header("📁 文件选择")
  442. # 查找可用的OCR文件
  443. output_dir = Path("output")
  444. available_files = []
  445. if output_dir.exists():
  446. for json_file in output_dir.rglob("*.json"):
  447. available_files.append(str(json_file))
  448. if available_files:
  449. selected_file = st.selectbox(
  450. "选择OCR结果文件",
  451. available_files,
  452. index=0
  453. )
  454. if st.button("🔄 加载文件", type="primary") and selected_file:
  455. try:
  456. st.session_state.validator.load_ocr_data(selected_file)
  457. st.success("✅ 文件加载成功!")
  458. st.rerun() # 重新运行应用以更新界面
  459. except Exception as e:
  460. st.error(f"❌ 加载失败: {e}")
  461. else:
  462. st.warning("未找到OCR结果文件")
  463. st.info("请确保output目录下有OCR结果文件")
  464. st.markdown("---")
  465. # 控制面板
  466. st.header("🎛️ 控制面板")
  467. if st.button("🧹 清除选择"):
  468. st.session_state.selected_text = None
  469. st.rerun()
  470. if st.button("❌ 清除错误标记"):
  471. st.session_state.marked_errors = set()
  472. st.rerun()
  473. # 显示调试信息
  474. if st.checkbox("🔧 调试信息"):
  475. st.write("**当前状态:**")
  476. st.write(f"- OCR数据项数: {len(st.session_state.validator.ocr_data)}")
  477. st.write(f"- 可点击文本: {len(st.session_state.validator.text_bbox_mapping)}")
  478. st.write(f"- 选中文本: {st.session_state.selected_text}")
  479. st.write(f"- 标记错误数: {len(st.session_state.marked_errors)}")
  480. if st.session_state.validator.ocr_data:
  481. st.write("**数据类型检查:**")
  482. sample_item = st.session_state.validator.ocr_data[0] if st.session_state.validator.ocr_data else None
  483. st.write(f"- 第一项类型: {type(sample_item)}")
  484. if isinstance(sample_item, dict):
  485. st.write(f"- 第一项键: {list(sample_item.keys())}")
  486. # 主内容区域
  487. if not st.session_state.validator.ocr_data:
  488. st.info("👈 请在左侧选择并加载OCR结果文件")
  489. return
  490. # 显示统计信息
  491. try:
  492. stats = st.session_state.validator.get_statistics()
  493. col1, col2, col3, col4 = st.columns(4)
  494. with col1:
  495. st.metric("📊 总文本块", stats['total_texts'])
  496. with col2:
  497. st.metric("🔗 可点击文本", stats['clickable_texts'])
  498. with col3:
  499. st.metric("❌ 标记错误", stats['marked_errors'])
  500. with col4:
  501. st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")
  502. st.markdown("---")
  503. except Exception as e:
  504. st.error(f"❌ 统计信息计算失败: {e}")
  505. return
  506. # 创建标签页
  507. tab1, tab2, tab3 = st.tabs(["📄 文本校验", "📊 表格分析", "📈 数据统计"])
  508. with tab1:
  509. # 原有的左右分栏内容
  510. left_col, right_col = st.columns([1, 1])
  511. # 左侧 - OCR文本内容
  512. with left_col:
  513. st.header("📄 OCR识别内容")
  514. # 文本选择器
  515. if st.session_state.validator.text_bbox_mapping:
  516. text_options = ["请选择文本..."] + list(st.session_state.validator.text_bbox_mapping.keys())
  517. selected_index = st.selectbox(
  518. "选择要校验的文本",
  519. range(len(text_options)),
  520. format_func=lambda x: text_options[x],
  521. key="text_selector"
  522. )
  523. if selected_index > 0:
  524. st.session_state.selected_text = text_options[selected_index]
  525. else:
  526. st.warning("没有找到可点击的文本")
  527. # 显示MD内容(可搜索和过滤)
  528. if st.session_state.validator.md_content:
  529. search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")
  530. display_content = st.session_state.validator.md_content
  531. if search_term:
  532. lines = display_content.split('\n')
  533. filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
  534. display_content = '\n'.join(filtered_lines)
  535. if filtered_lines:
  536. st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
  537. else:
  538. st.warning(f"未找到包含 '{search_term}' 的内容")
  539. # 渲染方式选择
  540. render_mode = st.radio(
  541. "选择渲染方式",
  542. ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"], # 添加DataFrame选项
  543. horizontal=True
  544. )
  545. if render_mode == "HTML渲染":
  546. # 使用unsafe_allow_html=True来渲染HTML表格
  547. st.markdown(display_content, unsafe_allow_html=True)
  548. elif render_mode == "Markdown渲染":
  549. # 转换HTML表格为Markdown格式
  550. converted_content = st.session_state.validator.convert_html_table_to_markdown(display_content)
  551. st.markdown(converted_content)
  552. elif render_mode == "DataFrame表格":
  553. # 新增:使用DataFrame显示表格
  554. if '<table>' in display_content.lower():
  555. st.session_state.validator.display_html_table_as_dataframe(display_content)
  556. else:
  557. st.info("当前内容中没有检测到HTML表格")
  558. st.markdown(display_content)
  559. else:
  560. # 原始文本显示
  561. st.text_area(
  562. "MD内容预览",
  563. display_content,
  564. height=300,
  565. help="OCR识别的文本内容"
  566. )
  567. # 可点击文本列表
  568. st.subheader("🎯 可点击文本列表")
  569. if st.session_state.validator.text_bbox_mapping:
  570. for text, info_list in st.session_state.validator.text_bbox_mapping.items():
  571. info = info_list[0] # 使用第一个bbox信息
  572. # 确定显示样式
  573. is_selected = (text == st.session_state.selected_text)
  574. is_error = (text in st.session_state.marked_errors)
  575. # 创建按钮行
  576. button_col, error_col = st.columns([4, 1])
  577. with button_col:
  578. button_type = "primary" if is_selected else "secondary"
  579. if st.button(f"📍 {text}", key=f"btn_{text}", type=button_type):
  580. st.session_state.selected_text = text
  581. st.rerun()
  582. with error_col:
  583. if is_error:
  584. if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
  585. st.session_state.marked_errors.discard(text)
  586. st.rerun()
  587. else:
  588. if st.button("❌", key=f"error_{text}", help="标记为错误"):
  589. st.session_state.marked_errors.add(text)
  590. st.rerun()
  591. else:
  592. st.info("没有可点击的文本项目")
  593. # 右侧 - 图像显示
  594. with right_col:
  595. st.header("🖼️ 原图标注")
  596. if st.session_state.validator.image_path and Path(st.session_state.validator.image_path).exists():
  597. try:
  598. # 加载图片
  599. image = Image.open(st.session_state.validator.image_path)
  600. # 创建交互式图片
  601. selected_bbox = None
  602. if st.session_state.selected_text and st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
  603. info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
  604. selected_bbox = info['bbox']
  605. fig = st.session_state.validator.create_interactive_plot(image, selected_bbox)
  606. st.plotly_chart(fig, use_container_width=True)
  607. # 显示选中文本的详细信息
  608. if st.session_state.selected_text:
  609. st.subheader("📍 选中文本详情")
  610. if st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
  611. info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
  612. bbox = info['bbox']
  613. info_col1, info_col2 = st.columns(2)
  614. with info_col1:
  615. st.write(f"**文本内容:** {st.session_state.selected_text}")
  616. st.write(f"**类别:** {info['category']}")
  617. st.write(f"**置信度:** {info.get('confidence', 'N/A')}")
  618. with info_col2:
  619. st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
  620. if len(bbox) >= 4:
  621. st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
  622. st.write(f"**高度:** {bbox[3] - bbox[1]} px")
  623. # 标记状态
  624. is_error = st.session_state.selected_text in st.session_state.marked_errors
  625. if is_error:
  626. st.error("⚠️ 此文本已标记为错误")
  627. else:
  628. st.success("✅ 此文本未标记错误")
  629. except Exception as e:
  630. st.error(f"❌ 图片处理失败: {e}")
  631. else:
  632. st.error("未找到对应的图片文件")
  633. if st.session_state.validator.image_path:
  634. st.write(f"期望路径: {st.session_state.validator.image_path}")
  635. with tab2:
  636. # 新增:专门的表格分析页面
  637. st.header("📊 表格数据分析")
  638. if st.session_state.validator.md_content:
  639. # 检查是否包含表格
  640. if '<table' in st.session_state.validator.md_content.lower():
  641. col1, col2 = st.columns([2, 1])
  642. with col1:
  643. st.subheader("🔍 表格数据预览")
  644. st.session_state.validator.display_html_table_as_dataframe(
  645. st.session_state.validator.md_content
  646. )
  647. with col2:
  648. st.subheader("⚙️ 表格操作")
  649. if st.button("📥 导出表格数据", type="primary"):
  650. try:
  651. import pandas as pd
  652. from io import StringIO
  653. tables = pd.read_html(StringIO(st.session_state.validator.md_content))
  654. if tables:
  655. # 创建Excel文件
  656. output = BytesIO()
  657. with pd.ExcelWriter(output, engine='openpyxl') as writer:
  658. for i, table in enumerate(tables):
  659. table.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
  660. st.download_button(
  661. label="📥 下载Excel文件",
  662. data=output.getvalue(),
  663. file_name="ocr_tables.xlsx",
  664. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
  665. )
  666. except Exception as e:
  667. st.error(f"导出失败: {e}")
  668. if st.button("🔍 表格统计分析"):
  669. try:
  670. import pandas as pd
  671. from io import StringIO
  672. tables = pd.read_html(StringIO(st.session_state.validator.md_content))
  673. if tables:
  674. st.write("**表格统计信息:**")
  675. for i, table in enumerate(tables):
  676. st.write(f"表格 {i+1}:")
  677. st.write(f"- 行数: {len(table)}")
  678. st.write(f"- 列数: {len(table.columns)}")
  679. st.write(f"- 数值列数: {len(table.select_dtypes(include=[np.number]).columns)}")
  680. except Exception as e:
  681. st.error(f"统计分析失败: {e}")
  682. else:
  683. st.info("当前OCR结果中没有检测到表格数据")
  684. else:
  685. st.warning("请先加载OCR数据")
  686. with tab3:
  687. # 数据统计页面
  688. st.header("📈 OCR数据统计")
  689. # ...现有的统计代码...
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()