#!/usr/bin/env python3
"""
Streamlit-based visual validation tool for OCR results (fixed version).

Provides rich interactive widgets and a better user experience.
"""
import base64
import json
import re
from html import unescape
from io import BytesIO, StringIO
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from plotly.subplots import make_subplots
# Page-level configuration; Streamlit requires this to be the first
# Streamlit command executed by the script.
_PAGE_SETTINGS = {
    "page_title": "OCR可视化校验工具",
    "page_icon": "🔍",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS injected once into the page: spacing tweaks for the main
# container and select boxes, plus helper classes for text chips and the
# statistics panel.
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }

    .stSelectbox > div > div > div {
        background-color: #f0f2f6;
    }

    .clickable-text {
        background-color: #e1f5fe;
        padding: 2px 6px;
        border-radius: 4px;
        border: 1px solid #0288d1;
        cursor: pointer;
        margin: 2px;
        display: inline-block;
    }

    .selected-text {
        background-color: #fff3e0;
        border-color: #ff9800;
        font-weight: bold;
    }

    .error-text {
        background-color: #ffebee;
        border-color: #f44336;
        color: #d32f2f;
    }

    .stats-container {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #28a745;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
class StreamlitOCRValidator:
    """State and rendering helpers for validating one OCR result in Streamlit.

    An instance holds the parsed OCR records (``ocr_data``), the companion
    Markdown text (``md_content``), the path of the source image
    (``image_path``), a mapping from recognized text to its bounding boxes
    (``text_bbox_mapping``) and the set of texts the user flagged as wrong
    (``marked_errors``).
    """

    # Texts skipped when building the mapping ('Picture' is presumably the
    # placeholder label of image regions -- confirm against the OCR engine).
    _IGNORED_TEXTS = frozenset({'Picture'})

    # Image extensions probed when the image path has to be inferred.
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')

    # Semi-transparent RGBA fills used by draw_bbox_on_image, by outline color.
    _FILL_COLORS = {
        'red': (255, 0, 0, 30),
        'blue': (0, 0, 255, 30),
        'green': (0, 255, 0, 30),
    }
    _FALLBACK_FILL = (255, 255, 0, 30)

    # Patterns for the lightweight HTML-table -> Markdown conversion.
    # ``\b`` keeps ``<tr``/``<td``/``<th`` from matching longer tag names
    # such as ``<thead>`` while still allowing attributes.
    _TABLE_RE = re.compile(r'<table\b[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)
    _ROW_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL | re.IGNORECASE)
    _CELL_RE = re.compile(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', re.DOTALL | re.IGNORECASE)
    _TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self):
        self.ocr_data = []
        self.md_content = ""
        self.image_path = ""
        self.text_bbox_mapping = {}
        self.selected_text = None
        self.marked_errors = set()

    def load_ocr_data(self, json_path: str, md_path: Optional[str] = None,
                      image_path: Optional[str] = None) -> bool:
        """Load the OCR JSON plus its companion Markdown and image.

        Args:
            json_path: Path of the OCR result. The JSON must be a list of
                records or a dict with a ``results`` key.
            md_path: Markdown file to load; defaults to ``json_path`` with a
                ``.md`` suffix.
            image_path: Source image; when omitted, ``<stem>.png/.jpg/.jpeg``
                is searched in ``./sample_data`` and then next to the JSON.

        Returns:
            ``True`` when the JSON was loaded, ``False`` when it could not be
            read or has an unsupported shape (an error is shown via
            ``st.error`` and the previously loaded state is left untouched).
        """
        json_file = Path(json_path)

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            st.error(f"❌ 加载JSON文件失败: {e}")
            return False

        if isinstance(data, list):
            records = data
        elif isinstance(data, dict) and 'results' in data:
            records = data['results']
        else:
            st.error(f"❌ 不支持的JSON格式: {json_path}")
            return False

        # Start from a clean slate so Markdown / image of a previously loaded
        # file never leak into this one when it has no companions.
        self.ocr_data = records
        self.md_content = ""
        self.image_path = ""

        md_file = json_file.with_suffix('.md') if md_path is None else Path(md_path)
        if md_file.exists():
            with open(md_file, 'r', encoding='utf-8') as f:
                self.md_content = f.read()

        if image_path is None:
            search_dirs = (Path("./sample_data"), json_file.parent)
            candidates = (
                directory / f"{json_file.stem}{suffix}"
                for directory in search_dirs
                for suffix in self._IMAGE_SUFFIXES
            )
            self.image_path = next((str(c) for c in candidates if c.exists()), "")
        else:
            self.image_path = image_path

        self.process_data()
        return True

    def process_data(self):
        """Rebuild ``text_bbox_mapping`` from ``ocr_data``.

        Only dict records with a non-empty ``text`` and a 4-element list
        ``bbox`` are indexed. Each entry keeps the bbox, category (default
        ``'Text'``), the record index and confidence (default ``1.0``).
        """
        self.text_bbox_mapping = {}

        if not isinstance(self.ocr_data, list):
            st.warning("⚠️ OCR数据格式不正确,期望列表格式")
            return

        for index, item in enumerate(self.ocr_data):
            if not isinstance(item, dict) or 'text' not in item or 'bbox' not in item:
                continue

            text = str(item['text']).strip()
            if not text or text in self._IGNORED_TEXTS:
                continue

            bbox = item['bbox']
            if not (isinstance(bbox, list) and len(bbox) == 4):
                continue

            self.text_bbox_mapping.setdefault(text, []).append({
                'bbox': bbox,
                'category': item.get('category', 'Text'),
                'index': index,
                'confidence': item.get('confidence', 1.0),
            })

    def draw_bbox_on_image(self, image: "Image.Image", bbox: List[int],
                           color: str = "red", width: int = 3) -> "Image.Image":
        """Return an RGB copy of *image* with *bbox* outlined and lightly filled.

        Args:
            image: Source image; it is not modified.
            bbox: ``[x1, y1, x2, y2]`` in image pixel coordinates.
            color: Outline color; ``red``/``blue``/``green`` get a matching
                translucent fill, anything else a yellow one.
            width: Outline width in pixels.
        """
        img_copy = image.copy()
        x1, y1, x2, y2 = bbox

        ImageDraw.Draw(img_copy).rectangle([x1, y1, x2, y2], outline=color, width=width)

        # The translucent fill is drawn on a separate RGBA layer and
        # alpha-composited, because drawing directly would overwrite pixels.
        overlay = Image.new('RGBA', img_copy.size, (0, 0, 0, 0))
        fill_color = self._FILL_COLORS.get(color, self._FALLBACK_FILL)
        ImageDraw.Draw(overlay).rectangle([x1, y1, x2, y2], fill=fill_color)

        return Image.alpha_composite(img_copy.convert('RGBA'), overlay).convert('RGB')

    def create_interactive_plot(self, image: "Image.Image",
                                selected_bbox: Optional[List[int]] = None) -> "go.Figure":
        """Build a Plotly figure showing *image* with every known bbox.

        Boxes of texts in ``marked_errors`` are red, the others light blue;
        *selected_bbox*, if given, is drawn on top with a thick red outline.
        Image coordinates have their origin at the top-left, the plot's at the
        bottom-left, hence the ``image.height - y`` flips.
        """
        fig = go.Figure()

        fig.add_layout_image(
            dict(
                source=image,
                xref="x",
                yref="y",
                x=0,
                y=image.height,
                sizex=image.width,
                sizey=image.height,
                sizing="stretch",
                opacity=1.0,
                layer="below",
            )
        )

        for text, info_list in self.text_bbox_mapping.items():
            if text in self.marked_errors:
                fill, outline = "rgba(255, 0, 0, 0.3)", "rgba(255, 0, 0, 1.0)"
            else:
                fill, outline = "rgba(0, 100, 200, 0.2)", "rgba(0, 100, 200, 0.8)"

            for info in info_list:
                bbox = info['bbox']
                if len(bbox) < 4:
                    continue
                x1, y1, x2, y2 = bbox[:4]
                fig.add_shape(
                    type="rect",
                    x0=x1, y0=image.height - y2,
                    x1=x2, y1=image.height - y1,
                    line=dict(color=outline, width=1),
                    fillcolor=fill,
                )

        if selected_bbox and len(selected_bbox) >= 4:
            x1, y1, x2, y2 = selected_bbox[:4]
            fig.add_shape(
                type="rect",
                x0=x1, y0=image.height - y2,
                x1=x2, y1=image.height - y1,
                line=dict(color="red", width=3),
                fillcolor="rgba(255, 0, 0, 0.2)",
            )

        fig.update_xaxes(visible=False, range=[0, image.width])
        # scaleanchor keeps the image aspect ratio when the plot is resized.
        fig.update_yaxes(visible=False, range=[0, image.height], scaleanchor="x")

        fig.update_layout(
            width=800,
            height=600,
            margin=dict(l=0, r=0, t=0, b=0),
            xaxis_showgrid=False,
            yaxis_showgrid=False,
            plot_bgcolor='white',
        )

        return fig

    def get_statistics(self) -> Dict:
        """Summarize the loaded data.

        Returns:
            Dict with ``total_texts`` (number of OCR records),
            ``clickable_texts`` (distinct indexed texts), ``marked_errors``
            (flagged texts that exist in the current mapping),
            ``categories`` (record count per category) and ``accuracy_rate``
            (percentage of clickable texts not flagged, 0 when there are none).
        """
        if not isinstance(self.ocr_data, list) or not self.ocr_data:
            return {
                'total_texts': 0,
                'clickable_texts': 0,
                'marked_errors': 0,
                'categories': {},
                'accuracy_rate': 0,
            }

        clickable_texts = len(self.text_bbox_mapping)
        # Ignore marks that do not belong to the current mapping (e.g. left
        # over from another file); they would push the rate below zero.
        marked_errors = sum(1 for text in self.marked_errors if text in self.text_bbox_mapping)

        categories = {}
        for item in self.ocr_data:
            if isinstance(item, dict):
                category = item.get('category', 'Unknown')
            elif isinstance(item, str):
                category = 'Text'  # bare strings are counted as plain text
            else:
                category = 'Unknown'
            categories[category] = categories.get(category, 0) + 1

        if clickable_texts > 0:
            accuracy_rate = (clickable_texts - marked_errors) / clickable_texts * 100
        else:
            accuracy_rate = 0

        return {
            'total_texts': len(self.ocr_data),
            'clickable_texts': clickable_texts,
            'marked_errors': marked_errors,
            'categories': categories,
            'accuracy_rate': accuracy_rate,
        }

    def convert_html_table_to_markdown(self, content: str) -> str:
        """Replace every HTML ``<table>`` in *content* with a Markdown table.

        Both ``<td>`` and ``<th>`` cells are converted and tags may carry
        attributes. Inner tags are stripped, HTML entities decoded, whitespace
        (including newlines) collapsed and ``|`` escaped so a cell can never
        break the row. The header separator follows the first row that
        actually has cells. Tables without any cell are left unchanged.
        """
        def clean_cell(raw: str) -> str:
            text = unescape(self._TAG_RE.sub('', raw))
            return ' '.join(text.split()).replace('|', '\\|')

        def replace_table(match):
            table_html = match.group(0)
            markdown_rows = []
            for row in self._ROW_RE.findall(table_html):
                cells = [clean_cell(cell) for cell in self._CELL_RE.findall(row)]
                if not cells:
                    continue
                markdown_rows.append('| ' + ' | '.join(cells) + ' |')
                if len(markdown_rows) == 1:
                    # Markdown needs the separator right after the header row.
                    markdown_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')
            return '\n'.join(markdown_rows) if markdown_rows else table_html

        return self._TABLE_RE.sub(replace_table, content)

    def render_markdown_with_options(self, markdown_content: str, table_format: str = "grid",
                                     escape_html: bool = True) -> str:
        """Render Markdown to HTML, optionally wrapped in a layout ``<div>``.

        Args:
            markdown_content: Markdown source.
            table_format: ``"grid"`` or ``"list"`` wrap the result in a div
                with class ``markdown-grid`` / ``markdown-list``; any other
                value returns the bare HTML.
            escape_html: When true, HTML tables are first converted to
                Markdown tables (despite the name, nothing is escaped).

        Requires the third-party ``markdown`` package, imported lazily so the
        rest of the class works without it.
        """
        import markdown

        if escape_html:
            markdown_content = self.convert_html_table_to_markdown(markdown_content)

        html_content = markdown.markdown(markdown_content)

        wrapper_class = {"grid": "markdown-grid", "list": "markdown-list"}.get(table_format)
        if wrapper_class is None:
            return html_content
        return f'\n<div class="{wrapper_class}">\n{html_content}\n</div>\n'

    def display_html_table_as_dataframe(self, html_content: str, enable_editing: bool = False,
                                        key_prefix: str = ""):
        """Parse the HTML tables in *html_content* and show them as DataFrames.

        Each table gets optional info / statistics / filter / sort controls
        and CSV / Excel export. On any failure an error is shown and the raw
        HTML is rendered instead.

        Args:
            html_content: Text containing one or more ``<table>`` elements.
            enable_editing: Show tables in ``st.data_editor`` instead of a
                read-only ``st.dataframe``.
            key_prefix: Prefix for all widget keys. Callers that render the
                same content more than once per script run must pass distinct
                prefixes, otherwise Streamlit rejects the duplicated keys.
        """
        try:
            tables = pd.read_html(StringIO(html_content))
            if tables:
                for i, table in enumerate(tables):
                    self._render_table(table, i, enable_editing, key_prefix)
            else:
                st.warning("未找到可解析的表格")
        except Exception as e:
            # Best effort: whatever went wrong, still let the user see the table.
            st.error(f"表格解析失败: {e}")
            st.info("尝试使用HTML渲染模式查看表格")
            st.markdown(html_content, unsafe_allow_html=True)

    def _render_table(self, table, i: int, enable_editing: bool, key_prefix: str):
        """Render one parsed table with its controls (helper of the method above)."""
        st.subheader(f"📊 表格 {i+1}")

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            show_info = st.checkbox("显示表格信息", key=f"{key_prefix}info_{i}")
        with col2:
            show_stats = st.checkbox("显示统计信息", key=f"{key_prefix}stats_{i}")
        with col3:
            enable_filter = st.checkbox("启用过滤", key=f"{key_prefix}filter_{i}")
        with col4:
            enable_sort = st.checkbox("启用排序", key=f"{key_prefix}sort_{i}")

        filtered_table = table.copy()
        if enable_filter and not table.empty:
            filter_col = st.selectbox(
                f"选择过滤列 (表格 {i+1})",
                options=['无'] + list(table.columns),
                key=f"{key_prefix}filter_col_{i}",
            )
            if filter_col != '无':
                filter_value = st.text_input(
                    f"过滤值 (表格 {i+1})",
                    key=f"{key_prefix}filter_value_{i}",
                )
                if filter_value:
                    # Literal substring match: user input such as "(" must not
                    # be interpreted as a regular expression.
                    mask = table[filter_col].astype(str).str.contains(
                        filter_value, na=False, regex=False
                    )
                    filtered_table = table[mask]

        if enable_sort and not filtered_table.empty:
            sort_col = st.selectbox(
                f"选择排序列 (表格 {i+1})",
                options=['无'] + list(filtered_table.columns),
                key=f"{key_prefix}sort_col_{i}",
            )
            if sort_col != '无':
                sort_order = st.radio(
                    f"排序方式 (表格 {i+1})",
                    options=['升序', '降序'],
                    horizontal=True,
                    key=f"{key_prefix}sort_order_{i}",
                )
                filtered_table = filtered_table.sort_values(
                    sort_col, ascending=(sort_order == '升序')
                )

        if enable_editing:
            edited_table = st.data_editor(
                filtered_table,
                use_container_width=True,
                key=f"{key_prefix}editor_{i}",
            )
            if not edited_table.equals(filtered_table):
                st.success("✏️ 表格已编辑,可以导出修改后的数据")
        else:
            st.dataframe(filtered_table, use_container_width=True)

        if show_info:
            st.write("**表格信息:**")
            st.write(f"- 原始行数: {len(table)}")
            st.write(f"- 过滤后行数: {len(filtered_table)}")
            st.write(f"- 列数: {len(table.columns)}")
            # Header-less tables get integer column labels from read_html.
            st.write(f"- 列名: {', '.join(map(str, table.columns))}")

        if show_stats:
            st.write("**统计信息:**")
            numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                st.dataframe(filtered_table[numeric_cols].describe())
            else:
                st.info("表格中没有数值列")

        if st.button(f"📥 导出表格 {i+1}", key=f"{key_prefix}export_{i}"):
            st.download_button(
                label=f"下载CSV (表格 {i+1})",
                data=filtered_table.to_csv(index=False),
                file_name=f"table_{i+1}.csv",
                mime="text/csv",
                key=f"{key_prefix}download_csv_{i}",
            )

            excel_buffer = BytesIO()
            try:
                filtered_table.to_excel(excel_buffer, index=False)
            except (ImportError, ValueError) as e:
                # A missing Excel engine must not be reported as a parse failure.
                st.warning(f"Excel导出不可用: {e}")
            else:
                st.download_button(
                    label=f"下载Excel (表格 {i+1})",
                    data=excel_buffer.getvalue(),
                    file_name=f"table_{i+1}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    key=f"{key_prefix}download_excel_{i}",
                )

        st.markdown("---")
def _select_text(text, selector_index=0):
    """Widget callback: select *text* and point the dropdown at the same entry."""
    st.session_state.selected_text = text
    st.session_state.text_selector = selector_index


def _sync_selection_from_dropdown(text_options):
    """Widget callback: copy the dropdown's choice into ``selected_text``."""
    index = st.session_state.get("text_selector", 0)
    if 0 < index < len(text_options):  # index 0 is the placeholder entry
        st.session_state.selected_text = text_options[index]


def _clear_selection():
    """Widget callback: drop the current selection and reset the dropdown."""
    st.session_state.selected_text = None
    st.session_state.text_selector = 0


def _show_tables(validator, html_content, key_prefix):
    """Render tables with per-caller widget keys.

    The same tables can be rendered in two tabs during one script run, which
    needs distinct widget keys. Falls back to the plain call when the
    validator's method does not accept ``key_prefix``; the TypeError is then
    raised at call time, before anything has been rendered.
    """
    try:
        validator.display_html_table_as_dataframe(html_content, key_prefix=key_prefix)
    except TypeError:
        validator.display_html_table_as_dataframe(html_content)


def main():
    """Run the Streamlit app: file picker, statistics and the three tabs."""
    st.title("🔍 OCR可视化校验工具")
    st.markdown("---")

    # Initialize session state.
    if 'validator' not in st.session_state:
        st.session_state.validator = StreamlitOCRValidator()

    if 'selected_text' not in st.session_state:
        st.session_state.selected_text = None

    if 'marked_errors' not in st.session_state:
        st.session_state.marked_errors = set()

    # The validator reads the error marks when drawing and computing stats.
    st.session_state.validator.marked_errors = st.session_state.marked_errors
    validator = st.session_state.validator

    # Sidebar: file selection and controls.
    with st.sidebar:
        st.header("📁 文件选择")

        output_dir = Path("output")
        available_files = []
        if output_dir.exists():
            available_files = sorted(str(json_file) for json_file in output_dir.rglob("*.json"))

        if available_files:
            selected_file = st.selectbox(
                "选择OCR结果文件",
                available_files,
                index=0
            )

            if st.button("🔄 加载文件", type="primary") and selected_file:
                try:
                    # Load into a fresh validator so a failed load keeps the
                    # current document and no state of the old file leaks in.
                    fresh = StreamlitOCRValidator()
                    fresh.load_ocr_data(selected_file)
                except Exception as e:
                    st.error(f"❌ 加载失败: {e}")
                else:
                    if fresh.ocr_data:
                        st.session_state.validator = fresh
                        # Selection and marks refer to texts of the old file.
                        st.session_state.selected_text = None
                        st.session_state.text_selector = 0
                        if st.session_state.get('loaded_file') != selected_file:
                            st.session_state.marked_errors = set()
                        st.session_state.loaded_file = selected_file
                        st.success("✅ 文件加载成功!")
                        st.rerun()  # rerun so the whole page reflects the new file
                    else:
                        # No rerun here: keep any error the loader displayed.
                        st.warning("⚠️ 未能从该文件加载到任何OCR数据")
        else:
            st.warning("未找到OCR结果文件")
            st.info("请确保output目录下有OCR结果文件")

        st.markdown("---")

        st.header("🎛️ 控制面板")

        # Callback-based so the dropdown can be reset too; Streamlit reruns
        # the script automatically after a callback.
        st.button("🧹 清除选择", on_click=_clear_selection)

        if st.button("❌ 清除错误标记"):
            st.session_state.marked_errors = set()
            st.rerun()

        if st.checkbox("🔧 调试信息"):
            st.write("**当前状态:**")
            st.write(f"- OCR数据项数: {len(validator.ocr_data)}")
            st.write(f"- 可点击文本: {len(validator.text_bbox_mapping)}")
            st.write(f"- 选中文本: {st.session_state.selected_text}")
            st.write(f"- 标记错误数: {len(st.session_state.marked_errors)}")

            if validator.ocr_data:
                st.write("**数据类型检查:**")
                sample_item = validator.ocr_data[0]
                st.write(f"- 第一项类型: {type(sample_item)}")
                if isinstance(sample_item, dict):
                    st.write(f"- 第一项键: {list(sample_item.keys())}")

    # Main area.
    if not validator.ocr_data:
        st.info("👈 请在左侧选择并加载OCR结果文件")
        return

    try:
        stats = validator.get_statistics()

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("📊 总文本块", stats['total_texts'])
        with col2:
            st.metric("🔗 可点击文本", stats['clickable_texts'])
        with col3:
            st.metric("❌ 标记错误", stats['marked_errors'])
        with col4:
            st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")

        st.markdown("---")
    except Exception as e:
        st.error(f"❌ 统计信息计算失败: {e}")
        return

    tab1, tab2, tab3 = st.tabs(["📄 文本校验", "📊 表格分析", "📈 数据统计"])

    with tab1:
        left_col, right_col = st.columns([1, 1])

        # Left: OCR text content.
        with left_col:
            st.header("📄 OCR识别内容")

            if validator.text_bbox_mapping:
                text_options = ["请选择文本..."] + list(validator.text_bbox_mapping.keys())
                # The dropdown only updates the selection when the user
                # changes it; applying its value on every rerun would
                # override selections made with the buttons below.
                st.selectbox(
                    "选择要校验的文本",
                    range(len(text_options)),
                    format_func=lambda x: text_options[x],
                    key="text_selector",
                    on_change=_sync_selection_from_dropdown,
                    args=(text_options,),
                )
            else:
                st.warning("没有找到可点击的文本")

            # Markdown content, searchable.
            if validator.md_content:
                search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")

                display_content = validator.md_content
                if search_term:
                    needle = search_term.lower()
                    filtered_lines = [
                        line for line in display_content.split('\n') if needle in line.lower()
                    ]
                    display_content = '\n'.join(filtered_lines)
                    if filtered_lines:
                        st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
                    else:
                        st.warning(f"未找到包含 '{search_term}' 的内容")

                render_mode = st.radio(
                    "选择渲染方式",
                    ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],
                    horizontal=True
                )
                if render_mode == "HTML渲染":
                    # unsafe_allow_html lets embedded HTML tables render.
                    st.markdown(display_content, unsafe_allow_html=True)
                elif render_mode == "Markdown渲染":
                    converted_content = validator.convert_html_table_to_markdown(display_content)
                    st.markdown(converted_content)
                elif render_mode == "DataFrame表格":
                    # '<table' (no closing bracket) also matches tags with attributes.
                    if '<table' in display_content.lower():
                        _show_tables(validator, display_content, "tab1_")
                    else:
                        st.info("当前内容中没有检测到HTML表格")
                        st.markdown(display_content)
                else:
                    st.text_area(
                        "MD内容预览",
                        display_content,
                        height=300,
                        help="OCR识别的文本内容"
                    )

            st.subheader("🎯 可点击文本列表")

            if validator.text_bbox_mapping:
                # start=1 because dropdown entry 0 is the placeholder.
                for position, text in enumerate(validator.text_bbox_mapping, start=1):
                    is_selected = (text == st.session_state.selected_text)
                    is_error = (text in st.session_state.marked_errors)

                    button_col, error_col = st.columns([4, 1])

                    with button_col:
                        st.button(
                            f"📍 {text}",
                            key=f"btn_{text}",
                            type="primary" if is_selected else "secondary",
                            on_click=_select_text,
                            args=(text, position),
                        )

                    with error_col:
                        if is_error:
                            if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
                                st.session_state.marked_errors.discard(text)
                                st.rerun()
                        else:
                            if st.button("❌", key=f"error_{text}", help="标记为错误"):
                                st.session_state.marked_errors.add(text)
                                st.rerun()
            else:
                st.info("没有可点击的文本项目")

        # Right: annotated source image.
        with right_col:
            st.header("🖼️ 原图标注")

            image_path = validator.image_path
            if image_path and Path(image_path).exists():
                try:
                    # Copy the pixels so the file handle is released right away.
                    with Image.open(image_path) as opened:
                        image = opened.copy()

                    selected_text = st.session_state.selected_text
                    entries = validator.text_bbox_mapping.get(selected_text) if selected_text else None
                    # Only the first occurrence of a repeated text is shown.
                    info = entries[0] if entries else None

                    fig = validator.create_interactive_plot(image, info['bbox'] if info else None)
                    st.plotly_chart(fig, use_container_width=True)

                    if selected_text:
                        st.subheader("📍 选中文本详情")

                        if info is not None:
                            bbox = info['bbox']

                            info_col1, info_col2 = st.columns(2)
                            with info_col1:
                                st.write(f"**文本内容:** {selected_text}")
                                st.write(f"**类别:** {info['category']}")
                                st.write(f"**置信度:** {info.get('confidence', 'N/A')}")

                            with info_col2:
                                st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
                                if len(bbox) >= 4:
                                    st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
                                    st.write(f"**高度:** {bbox[3] - bbox[1]} px")

                            if selected_text in st.session_state.marked_errors:
                                st.error("⚠️ 此文本已标记为错误")
                            else:
                                st.success("✅ 此文本未标记错误")
                except Exception as e:
                    st.error(f"❌ 图片处理失败: {e}")
            else:
                st.error("未找到对应的图片文件")
                if image_path:
                    st.write(f"期望路径: {image_path}")

    with tab2:
        st.header("📊 表格数据分析")

        if validator.md_content:
            if '<table' in validator.md_content.lower():
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.subheader("🔍 表格数据预览")
                    _show_tables(validator, validator.md_content, "tab2_")

                with col2:
                    st.subheader("⚙️ 表格操作")

                    if st.button("📥 导出表格数据", type="primary"):
                        try:
                            tables = pd.read_html(StringIO(validator.md_content))
                            if tables:
                                # One worksheet per table in a single workbook.
                                output = BytesIO()
                                with pd.ExcelWriter(output, engine='openpyxl') as writer:
                                    for i, table in enumerate(tables):
                                        table.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)

                                st.download_button(
                                    label="📥 下载Excel文件",
                                    data=output.getvalue(),
                                    file_name="ocr_tables.xlsx",
                                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                                )
                        except Exception as e:
                            st.error(f"导出失败: {e}")

                    if st.button("🔍 表格统计分析"):
                        try:
                            tables = pd.read_html(StringIO(validator.md_content))
                            if tables:
                                st.write("**表格统计信息:**")
                                for i, table in enumerate(tables):
                                    st.write(f"表格 {i+1}:")
                                    st.write(f"- 行数: {len(table)}")
                                    st.write(f"- 列数: {len(table.columns)}")
                                    st.write(f"- 数值列数: {len(table.select_dtypes(include=[np.number]).columns)}")
                        except Exception as e:
                            st.error(f"统计分析失败: {e}")
            else:
                st.info("当前OCR结果中没有检测到表格数据")
        else:
            st.warning("请先加载OCR数据")

    with tab3:
        st.header("📈 OCR数据统计")
        # NOTE(review): the statistics code for this tab is elided in the
        # source ("...existing statistics code..."); restore it here.


if __name__ == "__main__":
    main()