| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275 |
- #!/usr/bin/env python3
- """
- 基于Streamlit的OCR可视化校验工具(修复版)
- 提供丰富的交互组件和更好的用户体验
- """
- import streamlit as st
- import json
- import pandas as pd
- from pathlib import Path
- import numpy as np
- from PIL import Image, ImageDraw, ImageFont
- import cv2
- import base64
- from typing import Dict, List, Optional, Tuple
- import plotly.express as px
- import plotly.graph_objects as go
- from plotly.subplots import make_subplots
- from io import StringIO, BytesIO
# Page configuration: browser-tab title and icon, wide layout, sidebar expanded.
_PAGE_CONFIG = {
    "page_title": "OCR可视化校验工具",
    "page_icon": "🔍",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
# Custom CSS: force a light look (white background, dark text) and define the
# helper classes used by the layouts further down.
# The sidebar rule used to target only `.css-1d391kg`, a build-generated class
# name; a `data-testid` selector is listed next to it so the rule does not
# depend on that generated name alone.
st.markdown("""
<style>
/* 设置主体背景为白色 */
.main > div {
    padding-top: 2rem;
    background-color: white !important;
    color: #333333 !important;
}

/* 设置整体页面背景 */
.stApp {
    background-color: white !important;
}

/* 设置内容区域背景 */
.block-container {
    background-color: white !important;
    color: #333333 !important;
}

/* 设置侧边栏样式 */
section[data-testid="stSidebar"],
.css-1d391kg {
    background-color: #f8f9fa !important;
}

/* 设置选择框样式 */
.stSelectbox > div > div > div {
    background-color: #f0f2f6 !important;
    color: #333333 !important;
}

/* 设置标题样式 */
h1, h2, h3, h4, h5, h6 {
    color: #333333 !important;
}

/* 设置文本样式 */
p, div, span, label {
    color: #333333 !important;
}

/* 可点击文本样式 */
.clickable-text {
    background-color: #e1f5fe;
    padding: 2px 6px;
    border-radius: 4px;
    border: 1px solid #0288d1;
    cursor: pointer;
    margin: 2px;
    display: inline-block;
    color: #0288d1 !important;
}

.selected-text {
    background-color: #fff3e0;
    border-color: #ff9800;
    font-weight: bold;
    color: #ff9800 !important;
}

.error-text {
    background-color: #ffebee;
    border-color: #f44336;
    color: #d32f2f !important;
}

.stats-container {
    background-color: #f8f9fa;
    padding: 1rem;
    border-radius: 8px;
    border-left: 4px solid #28a745;
    color: #333333 !important;
}

/* 修复滚动内容区域样式 */
.scrollable-content {
    background-color: #fafafa !important;
    color: #333333 !important;
    border: 1px solid #ddd !important;
}

/* 修复紧凑内容样式 */
.compact-content {
    background-color: #fafafa !important;
    color: #333333 !important;
    border: 1px solid #ddd !important;
}

/* 高亮文本样式 */
.highlight-text {
    background-color: #ffeb3b !important;
    color: #333333 !important;
    padding: 2px 4px;
    border-radius: 3px;
    cursor: pointer;
}

.selected-highlight {
    background-color: #4caf50 !important;
    color: white !important;
}

/* 标准布局内容样式 */
.standard-content {
    background-color: #fafafa !important;
    color: #333333 !important;
    border: 1px solid #ddd !important;
}
</style>
""", unsafe_allow_html=True)
- class StreamlitOCRValidator:
def __init__(self):
    """Start with nothing loaded: no OCR items, text, image, selection or marks."""
    # Plain values: Markdown text, image path and the current selection.
    self.md_content = ""
    self.image_path = ""
    self.selected_text = None
    # Containers filled by load_ocr_data() / process_data().
    self.ocr_data = []
    self.text_bbox_mapping = {}
    # Texts that have been flagged as recognition errors.
    self.marked_errors = set()
-
def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
    """Load one OCR result (JSON) together with its Markdown text and page image.

    Args:
        json_path: OCR result file. Either a JSON list of items or a JSON
            object that holds the list under the ``results`` key.
        md_path: Markdown file. When omitted, the JSON path with a ``.md``
            suffix is used.
        image_path: Page image. When omitted, ``<stem>.png`` / ``<stem>.jpg``
            is looked up in ``./sample_data`` first and next to the JSON
            file second.

    If the JSON cannot be read or has an unsupported top-level shape, an error
    is shown with ``st.error`` and the previously loaded state is left as is.
    After a successful load ``process_data()`` rebuilds the text/bbox mapping.
    """
    json_file = Path(json_path)

    # Read and parse the JSON; I/O and decoding problems are reported, not raised.
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:  # JSONDecodeError/UnicodeDecodeError are ValueErrors
        st.error(f"❌ 加载JSON文件失败: {e}")
        return

    # Accept a bare list or {"results": [...]}.
    if isinstance(data, list):
        self.ocr_data = data
    elif isinstance(data, dict) and 'results' in data:
        self.ocr_data = data['results']
    else:
        st.error(f"❌ 不支持的JSON格式: {json_path}")
        return

    # Drop the previous document's text and image. Without this reset, a file
    # that has no companion .md or image kept showing the old ones.
    self.md_content = ""
    self.image_path = ""

    md_file = json_file.with_suffix('.md') if md_path is None else Path(md_path)
    if md_file.is_file():
        with open(md_file, 'r', encoding='utf-8') as f:
            self.md_content = f.read()

    if image_path is not None:
        self.image_path = image_path
    else:
        image_name = json_file.stem
        # Search order: sample_data png, sample_data jpg, JSON dir png, JSON dir jpg.
        candidates = (
            folder / f"{image_name}{suffix}"
            for folder in (Path("./sample_data"), json_file.parent)
            for suffix in (".png", ".jpg")
        )
        self.image_path = next((str(c) for c in candidates if c.exists()), "")

    self.process_data()
-
def process_data(self):
    """Rebuild ``text_bbox_mapping`` from the loaded OCR items.

    The mapping goes from stripped item text to a list of occurrences, each
    with ``bbox``, ``category`` (default ``'Text'``), ``index`` (position in
    ``ocr_data``) and ``confidence`` (default ``1.0``). Items are skipped when
    they are not dicts, lack ``text`` or ``bbox``, have empty text or the
    text ``'Picture'``, or whose bbox is not a 4-element list.
    """
    mapping = {}
    self.text_bbox_mapping = mapping

    # Anything other than a list cannot be indexed item by item.
    if not isinstance(self.ocr_data, list):
        st.warning("⚠️ OCR数据格式不正确,期望列表格式")
        return

    for position, entry in enumerate(self.ocr_data):
        if not isinstance(entry, dict):
            continue
        if 'text' not in entry or 'bbox' not in entry:
            continue

        label = str(entry['text']).strip()
        if label in ('', 'Picture'):
            continue

        box = entry['bbox']
        if not (isinstance(box, list) and len(box) == 4):
            continue

        # The same text may occur several times; keep every occurrence.
        mapping.setdefault(label, []).append({
            'bbox': box,
            'category': entry.get('category', 'Text'),
            'index': position,
            'confidence': entry.get('confidence', 1.0),
        })
-
def draw_bbox_on_image(self, image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
    """Return an RGB copy of ``image`` with ``bbox`` outlined and lightly tinted.

    Args:
        image: Source image; it is copied, not modified.
        bbox: ``[x1, y1, x2, y2]`` in the pixel coordinates of ``image``.
        color: Outline colour. ``"red"``, ``"blue"`` and ``"green"`` get a
            matching tint; any other value gets a yellow tint.
        width: Outline width in pixels.
    """
    left, top, right, bottom = bbox
    corners = [left, top, right, bottom]

    # Tint per outline colour (alpha 30 of 255); yellow is the fallback.
    tints = {
        "red": (255, 0, 0, 30),
        "blue": (0, 0, 255, 30),
        "green": (0, 255, 0, 30),
    }
    tint = tints.get(color, (255, 255, 0, 30))

    # Outline goes on a copy of the picture.
    canvas = image.copy()
    ImageDraw.Draw(canvas).rectangle(corners, outline=color, width=width)

    # The fill is drawn on a transparent layer and alpha-composited on top.
    wash = Image.new('RGBA', canvas.size, (0, 0, 0, 0))
    ImageDraw.Draw(wash).rectangle(corners, fill=tint)

    return Image.alpha_composite(canvas.convert('RGBA'), wash).convert('RGB')
-
def create_interactive_plot(self, image: Image.Image, selected_bbox: Optional[List[int]] = None) -> go.Figure:
    """Build a Plotly figure showing ``image`` with every mapped bbox outlined.

    Args:
        image: Picture used as the figure background.
        selected_bbox: Optional ``[x1, y1, x2, y2]`` drawn on top in red.

    Boxes from ``text_bbox_mapping`` are used as stored (no scaling is applied
    here). Texts in ``marked_errors`` are drawn red, all others light blue.
    The figure is fixed at 800x600 with hidden axes.
    """
    pic_w, pic_h = image.width, image.height
    figure = go.Figure()

    # Background picture; the y axis points up, so the picture is anchored at its top.
    figure.add_layout_image(
        dict(
            source=image,
            xref="x",
            yref="y",
            x=0,
            y=pic_h,
            sizex=pic_w,
            sizey=pic_h,
            sizing="stretch",
            opacity=1.0,
            layer="below"
        )
    )

    def add_box(box, outline, fill):
        # Image rows grow downwards while the plot's y axis grows upwards.
        bx1, by1, bx2, by2 = box[:4]
        figure.add_shape(
            type="rect",
            x0=bx1, y0=pic_h - by2,
            x1=bx2, y1=pic_h - by1,
            line=outline,
            fillcolor=fill,
        )

    for label, occurrences in self.text_bbox_mapping.items():
        if label in self.marked_errors:
            fill, edge = "rgba(255, 0, 0, 0.3)", "rgba(255, 0, 0, 1.0)"
        else:
            fill, edge = "rgba(0, 100, 200, 0.2)", "rgba(0, 100, 200, 0.8)"
        for occurrence in occurrences:
            box = occurrence['bbox']
            if len(box) < 4:
                continue
            add_box(box, dict(color=edge, width=1), fill)

    # The selected box goes last so it is drawn above the others.
    if selected_bbox and len(selected_bbox) >= 4:
        add_box(selected_bbox, dict(color="red", width=3), "rgba(255, 0, 0, 0.2)")

    figure.update_xaxes(visible=False, range=[0, pic_w])
    figure.update_yaxes(visible=False, range=[0, pic_h], scaleanchor="x")
    figure.update_layout(
        width=800,
        height=600,
        margin=dict(l=0, r=0, t=0, b=0),
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        plot_bgcolor='white'
    )

    return figure
-
def get_statistics(self) -> Dict:
    """Summarise the loaded OCR data.

    Returns:
        A dict with:
        ``total_texts`` - number of items in ``ocr_data``;
        ``clickable_texts`` - number of distinct texts in ``text_bbox_mapping``;
        ``marked_errors`` - marked texts that exist in the current mapping;
        ``categories`` - item count per category (dict items without a
        category and non-dict/non-str items count as ``'Unknown'``, plain
        strings as ``'Text'``);
        ``accuracy_rate`` - percentage of clickable texts not marked as errors.
        All counts are zero when ``ocr_data`` is empty or not a list.
    """
    from collections import Counter

    if not isinstance(self.ocr_data, list) or not self.ocr_data:
        return {
            'total_texts': 0,
            'clickable_texts': 0,
            'marked_errors': 0,
            'categories': {},
            'accuracy_rate': 0
        }

    def category_of(item):
        if isinstance(item, dict):
            return item.get('category', 'Unknown')
        if isinstance(item, str):
            return 'Text'
        return 'Unknown'

    clickable_texts = len(self.text_bbox_mapping)
    # Count only marks that refer to a text of the current document. Marks
    # for texts outside the mapping used to be counted too, which could push
    # the accuracy rate below zero.
    marked_errors = sum(1 for text in self.marked_errors if text in self.text_bbox_mapping)

    if clickable_texts > 0:
        accuracy_rate = (clickable_texts - marked_errors) / clickable_texts * 100
    else:
        accuracy_rate = 0

    return {
        'total_texts': len(self.ocr_data),
        'clickable_texts': clickable_texts,
        'marked_errors': marked_errors,
        'categories': dict(Counter(category_of(item) for item in self.ocr_data)),
        'accuracy_rate': accuracy_rate
    }
-
def convert_html_table_to_markdown(self, content: str) -> str:
    """Replace every HTML ``<table>`` in ``content`` with a Markdown pipe table.

    Args:
        content: Text that may contain HTML tables; everything outside the
            tables is returned unchanged.

    Returns:
        The text with each table rewritten. The first row that has cells is
        followed by a ``---`` separator row. A table in which no cells are
        found is left as HTML.

    Differences from a plain ``<tr>``/``<td>`` scan: rows and cells with
    attributes are recognised, ``<th>`` cells are kept (otherwise header rows
    vanish), whitespace inside a cell is collapsed to single spaces and
    literal ``|`` is escaped so a cell cannot break the row. ``rowspan`` and
    ``colspan`` are not expanded.
    """
    import re
    from html import unescape

    flags = re.DOTALL | re.IGNORECASE
    # ``\b`` keeps <tr>/<td>/<th> from matching <track>, <thead>, <tbody>, ...
    table_re = re.compile(r'<table\b[^>]*>.*?</table\s*>', flags)
    row_re = re.compile(r'<tr\b[^>]*>(.*?)</tr\s*>', flags)
    cell_re = re.compile(r'<t[dh]\b[^>]*>(.*?)</t[dh]\s*>', flags)
    tag_re = re.compile(r'<[^>]+>')

    def clean_cell(fragment: str) -> str:
        # Strip nested markup, decode entities, then make the text row-safe.
        text = unescape(tag_re.sub('', fragment))
        text = ' '.join(text.split())
        return text.replace('|', '\\|')

    def replace_table(match):
        table_html = match.group(0)
        markdown_rows = []
        for row_html in row_re.findall(table_html):
            cells = [clean_cell(cell) for cell in cell_re.findall(row_html)]
            if not cells:
                continue
            markdown_rows.append('| ' + ' | '.join(cells) + ' |')
            # The separator follows the first emitted row, whichever <tr> it came from.
            if len(markdown_rows) == 1:
                markdown_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')
        return '\n'.join(markdown_rows) if markdown_rows else table_html

    return table_re.sub(replace_table, content)
-
def display_html_table_as_dataframe(self, html_content: str, enable_editing: bool = False, key_prefix: str = ""):
    """Parse the HTML tables in ``html_content`` and show each as a DataFrame.

    Each table gets optional info, statistics, filter and sort controls plus
    CSV/Excel export buttons.

    Args:
        html_content: Text containing one or more HTML ``<table>`` elements.
        enable_editing: Show an editable ``st.data_editor`` instead of a
            read-only ``st.dataframe``.
        key_prefix: Prepended to every widget key. With the default empty
            prefix the keys are ``info_0``, ``stats_0``, ... Pass distinct
            prefixes when this method is rendered more than once in the same
            script run, because the keys are otherwise identical.

    If anything fails, the error is shown and the content is rendered as raw
    HTML instead.
    """
    def widget_key(name: str, idx: int) -> str:
        return f"{key_prefix}{name}_{idx}"

    try:
        tables = pd.read_html(StringIO(html_content))
        if tables:
            for i, table in enumerate(tables):
                st.subheader(f"📊 表格 {i+1}")

                # Per-table toggles.
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    show_info = st.checkbox(f"显示表格信息", key=widget_key("info", i))
                with col2:
                    show_stats = st.checkbox(f"显示统计信息", key=widget_key("stats", i))
                with col3:
                    enable_filter = st.checkbox(f"启用过滤", key=widget_key("filter", i))
                with col4:
                    enable_sort = st.checkbox(f"启用排序", key=widget_key("sort", i))

                # Filtering.
                filtered_table = table.copy()
                if enable_filter and not table.empty:
                    filter_col = st.selectbox(
                        f"选择过滤列 (表格 {i+1})",
                        options=['无'] + list(table.columns),
                        key=widget_key("filter_col", i)
                    )

                    if filter_col != '无':
                        filter_value = st.text_input(
                            f"过滤值 (表格 {i+1})",
                            key=widget_key("filter_value", i)
                        )
                        if filter_value:
                            # regex=False: the input is a literal substring.
                            # As a regex, text such as "(" or "*" raised and
                            # the whole table fell back to raw HTML.
                            filtered_table = table[
                                table[filter_col].astype(str).str.contains(filter_value, na=False, regex=False)
                            ]

                # Sorting.
                if enable_sort and not filtered_table.empty:
                    sort_col = st.selectbox(
                        f"选择排序列 (表格 {i+1})",
                        options=['无'] + list(filtered_table.columns),
                        key=widget_key("sort_col", i)
                    )

                    if sort_col != '无':
                        sort_order = st.radio(
                            f"排序方式 (表格 {i+1})",
                            options=['升序', '降序'],
                            horizontal=True,
                            key=widget_key("sort_order", i)
                        )
                        ascending = (sort_order == '升序')
                        filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)

                # Table display.
                if enable_editing:
                    edited_table = st.data_editor(
                        filtered_table,
                        use_container_width=True,
                        key=widget_key("editor", i)
                    )

                    if not edited_table.equals(filtered_table):
                        st.success("✏️ 表格已编辑,可以导出修改后的数据")
                else:
                    st.dataframe(filtered_table, use_container_width=True)

                # Table info.
                if show_info:
                    st.write(f"**表格信息:**")
                    st.write(f"- 原始行数: {len(table)}")
                    st.write(f"- 过滤后行数: {len(filtered_table)}")
                    st.write(f"- 列数: {len(table.columns)}")
                    # Column labels can be ints (header-less tables) or tuples
                    # (multi-level headers); str.join needs strings.
                    st.write(f"- 列名: {', '.join(map(str, table.columns))}")

                # Numeric summary.
                if show_stats:
                    st.write(f"**统计信息:**")
                    numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
                    if len(numeric_cols) > 0:
                        st.dataframe(filtered_table[numeric_cols].describe())
                    else:
                        st.info("表格中没有数值列")

                # Export of the filtered/sorted view.
                if st.button(f"📥 导出表格 {i+1}", key=widget_key("export", i)):
                    csv_data = filtered_table.to_csv(index=False)
                    st.download_button(
                        label=f"下载CSV (表格 {i+1})",
                        data=csv_data,
                        file_name=f"table_{i+1}.csv",
                        mime="text/csv",
                        key=widget_key("download_csv", i)
                    )

                    excel_buffer = BytesIO()
                    filtered_table.to_excel(excel_buffer, index=False)
                    st.download_button(
                        label=f"下载Excel (表格 {i+1})",
                        data=excel_buffer.getvalue(),
                        file_name=f"table_{i+1}.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        key=widget_key("download_excel", i)
                    )

                st.markdown("---")
        else:
            st.warning("未找到可解析的表格")

    except Exception as e:
        # Deliberate best-effort: whatever went wrong, still show the content.
        st.error(f"表格解析失败: {e}")
        st.info("尝试使用HTML渲染模式查看表格")
        st.markdown(html_content, unsafe_allow_html=True)
-
def create_standard_layout(self, font_size: int = 12, zoom_level: float = 1.0):
    """Render the standard two-column view: OCR text left, annotated page image right.

    Args:
        font_size: Font size in pixels for the rendered OCR text.
        zoom_level: Initial value of the image zoom slider (range 0.3-2.0).

    The method draws Streamlit widgets, reads and writes
    ``st.session_state.selected_text`` and ``st.session_state.marked_errors``,
    and returns nothing.
    """
    left_col, right_col = st.columns([0.7, 1])

    with left_col:
        st.header("📄 OCR识别内容")

        # Text picker; entry 0 is a placeholder, so only indices > 0 select a text.
        if self.text_bbox_mapping:
            text_options = ["请选择文本..."] + list(self.text_bbox_mapping.keys())
            selected_index = st.selectbox(
                "选择要校验的文本",
                range(len(text_options)),
                format_func=lambda x: text_options[x][:50] + "..." if len(text_options[x]) > 50 else text_options[x],
                key="standard_text_selector"
            )

            if selected_index > 0:
                st.session_state.selected_text = text_options[selected_index]
        else:
            st.warning("没有找到可点击的文本")

        if self.md_content:
            search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...", key="standard_search")

            # Case-insensitive, line-based search; only matching lines stay visible.
            display_content = self.md_content
            if search_term:
                lines = display_content.split('\n')
                filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
                display_content = '\n'.join(filtered_lines)
                if filtered_lines:
                    st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
                else:
                    st.warning(f"未找到包含 '{search_term}' 的内容")

            render_mode = st.radio(
                "选择渲染方式",
                ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],
                horizontal=True,
                key="standard_render_mode"
            )

            # Styling for the content wrapper, including the chosen font size.
            content_style = f"""
            <style>
            .standard-content-display {{
                font-size: {font_size}px !important;
                line-height: 1.4;
                color: #333333 !important;
                background-color: #fafafa !important;
                padding: 10px;
                border-radius: 5px;
                border: 1px solid #ddd;
            }}
            </style>
            """
            st.markdown(content_style, unsafe_allow_html=True)

            if render_mode == "HTML渲染":
                st.markdown(f'<div class="standard-content-display">{display_content}</div>', unsafe_allow_html=True)
            elif render_mode == "Markdown渲染":
                converted_content = self.convert_html_table_to_markdown(display_content)
                # Markdown written directly inside an HTML block is passed
                # through as raw text, so the converted tables showed up as
                # literal pipes. The blank lines end the HTML block so the
                # content is parsed as Markdown.
                st.markdown(
                    f'<div class="standard-content-display">\n\n{converted_content}\n\n</div>',
                    unsafe_allow_html=True
                )
            elif render_mode == "DataFrame表格":
                if '<table' in display_content.lower():
                    self.display_html_table_as_dataframe(display_content)
                else:
                    st.info("当前内容中没有检测到HTML表格")
                    st.markdown(f'<div class="standard-content-display">{display_content}</div>', unsafe_allow_html=True)
            else:
                st.text_area(
                    "MD内容预览",
                    display_content,
                    height=300,
                    help="OCR识别的文本内容",
                    key="standard_text_area"
                )

    with right_col:
        st.header("🖼️ 原图标注")

        col1, col2 = st.columns(2)
        with col1:
            current_zoom = st.slider("图片缩放", 0.3, 2.0, zoom_level, 0.1, key="standard_zoom_level")
        with col2:
            show_all_boxes = st.checkbox("显示所有框", value=False, key="standard_show_all_boxes")

        if self.image_path and Path(self.image_path).exists():
            try:
                image = Image.open(self.image_path)

                if current_zoom != 1.0:
                    new_width = int(image.width * current_zoom)
                    new_height = int(image.height * current_zoom)
                    image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

                # The selected box is scaled into the zoomed image's coordinates.
                selected_bbox = None
                if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
                    info = self.text_bbox_mapping[st.session_state.selected_text][0]
                    bbox = info['bbox']
                    if current_zoom != 1.0:
                        bbox = [int(coord * current_zoom) for coord in bbox]
                    selected_bbox = bbox

                if show_all_boxes:
                    # create_interactive_plot() draws the stored boxes without
                    # scaling, so they were misplaced on a zoomed image.
                    # create_resized_interactive_plot() multiplies them by the zoom.
                    fig = self.create_resized_interactive_plot(image, selected_bbox, current_zoom, True)
                    # Keep the figure size this layout has always used.
                    fig.update_layout(width=800, height=600)
                else:
                    fig = go.Figure()

                    fig.add_layout_image(
                        dict(
                            source=image,
                            xref="x", yref="y",
                            x=0, y=image.height,
                            sizex=image.width, sizey=image.height,
                            sizing="stretch", opacity=1.0, layer="below"
                        )
                    )

                    # Image rows grow downwards, the plot's y axis upwards.
                    if selected_bbox and len(selected_bbox) >= 4:
                        x1, y1, x2, y2 = selected_bbox[:4]
                        fig.add_shape(
                            type="rect",
                            x0=x1, y0=image.height-y2,
                            x1=x2, y1=image.height-y1,
                            line=dict(color="red", width=3),
                            fillcolor="rgba(255, 0, 0, 0.2)",
                        )

                    fig.update_xaxes(visible=False, range=[0, image.width])
                    fig.update_yaxes(visible=False, range=[0, image.height], scaleanchor="x")

                    fig.update_layout(
                        width=800, height=600,
                        margin=dict(l=0, r=0, t=0, b=0),
                        xaxis_showgrid=False, yaxis_showgrid=False,
                        plot_bgcolor='white'
                    )

                st.plotly_chart(fig, use_container_width=True, key="standard_plot")

                # Details and error marking for the selected text.
                if st.session_state.selected_text:
                    st.subheader("📍 选中文本详情")

                    if st.session_state.selected_text in self.text_bbox_mapping:
                        info = self.text_bbox_mapping[st.session_state.selected_text][0]
                        original_bbox = info['bbox']

                        info_col1, info_col2 = st.columns(2)
                        with info_col1:
                            st.write(f"**文本内容:** {st.session_state.selected_text[:30]}...")
                            st.write(f"**类别:** {info['category']}")
                            st.write(f"**置信度:** {info.get('confidence', 'N/A')}")

                        with info_col2:
                            st.write(f"**位置:** [{', '.join(map(str, original_bbox))}]")
                            if len(original_bbox) >= 4:
                                st.write(f"**宽度:** {original_bbox[2] - original_bbox[0]} px")
                                st.write(f"**高度:** {original_bbox[3] - original_bbox[1]} px")

                        col1, col2 = st.columns(2)
                        with col1:
                            if st.button("❌ 标记为错误", key="mark_error_standard"):
                                st.session_state.marked_errors.add(st.session_state.selected_text)
                                st.rerun()

                        with col2:
                            if st.button("✅ 取消错误标记", key="unmark_error_standard"):
                                st.session_state.marked_errors.discard(st.session_state.selected_text)
                                st.rerun()

                        is_error = st.session_state.selected_text in st.session_state.marked_errors
                        if is_error:
                            st.error("⚠️ 此文本已标记为错误")
                        else:
                            st.success("✅ 此文本未标记错误")

            except Exception as e:
                st.error(f"❌ 图片处理失败: {e}")
        else:
            st.error("未找到对应的图片文件")
            if self.image_path:
                st.write(f"期望路径: {self.image_path}")
-
def create_split_layout_with_fixed_image(self, font_size: int = 12, zoom_level: float = 1.0):
    """Render the scrolling view: OCR text in a fixed-height box, image beside it.

    Args:
        font_size: Font size in pixels for the scrollable text box.
        zoom_level: Passed on to ``create_fixed_image_display``.

    Writes ``st.session_state.selected_text`` when a text is picked.
    """
    text_pane, image_pane = st.columns([0.7, 1])

    with text_pane:
        st.header("📄 OCR识别内容")

        # Text picker; entry 0 is the placeholder.
        if self.text_bbox_mapping:
            choices = ["请选择文本..."] + list(self.text_bbox_mapping.keys())

            def shorten(idx):
                label = choices[idx]
                return label[:50] + "..." if len(label) > 50 else label

            picked = st.selectbox(
                "选择要校验的文本",
                range(len(choices)),
                format_func=shorten,
                key="split_text_selector"
            )
            if picked > 0:
                st.session_state.selected_text = choices[picked]

        # Height of the scrollable box (default 800).
        pane_height = st.selectbox(
            "选择内容区域高度",
            [400, 600, 800, 1000, 1200],
            index=2,
            key="split_content_height"
        )

        # CSS for the scrollable box, including height, font size and scrollbar.
        scroll_css = f"""
        <style>
        .scrollable-content {{
            height: {pane_height}px;
            overflow-y: auto;
            overflow-x: hidden;
            padding: 10px;
            border: 1px solid #ddd;
            border-radius: 5px;
            background-color: #fafafa !important;
            font-size: {font_size}px !important;
            line-height: 1.4;
            color: #333333 !important;
        }}

        .scrollable-content::-webkit-scrollbar {{
            width: 8px;
        }}

        .scrollable-content::-webkit-scrollbar-track {{
            background: #f1f1f1;
            border-radius: 4px;
        }}

        .scrollable-content::-webkit-scrollbar-thumb {{
            background: #888;
            border-radius: 4px;
        }}

        .scrollable-content::-webkit-scrollbar-thumb:hover {{
            background: #555;
        }}
        </style>
        """
        st.markdown(scroll_css, unsafe_allow_html=True)

        # Markdown text with newlines turned into <br> inside the scroll box.
        if self.md_content:
            body_html = self.md_content.replace(chr(10), '<br>')
            st.markdown(f"""
            <div class="scrollable-content">
            {body_html}
            </div>
            """, unsafe_allow_html=True)

    with image_pane:
        self.create_fixed_image_display(zoom_level)
def create_fixed_image_display(self, zoom_level: float = 1.0):
    """Render the image panel of the scrolling layout.

    Args:
        zoom_level: Initial value of the zoom slider (range 0.3-2.0).

    Shows the zoomed page image with the selected text's first bbox
    highlighted (optionally all boxes) and, below it, the selected text's
    category, position and size.
    """
    st.header("🖼️ 原图标注")

    zoom_col, toggle_col = st.columns(2)
    with zoom_col:
        current_zoom = st.slider("图片缩放", 0.3, 2.0, zoom_level, 0.1, key="fixed_zoom_level")
    with toggle_col:
        show_all_boxes = st.checkbox("显示所有框", value=False, key="fixed_show_all_boxes")

    # Without an image there is nothing to draw.
    if not (self.image_path and Path(self.image_path).exists()):
        st.error("未找到对应的图片文件")
        if self.image_path:
            st.write(f"期望路径: {self.image_path}")
        return

    try:
        source = Image.open(self.image_path)
        target_size = (int(source.width * current_zoom), int(source.height * current_zoom))
        resized_image = source.resize(target_size, Image.Resampling.LANCZOS)

        # Selected box scaled into the resized image's coordinates.
        selected_bbox = None
        if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
            first_hit = self.text_bbox_mapping[st.session_state.selected_text][0]
            selected_bbox = [int(coord * current_zoom) for coord in first_hit['bbox']]

        figure = self.create_resized_interactive_plot(resized_image, selected_bbox, current_zoom, show_all_boxes)
        st.plotly_chart(figure, use_container_width=True, key="fixed_plot")

        # Details use the stored (unscaled) bbox.
        if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
            st.subheader("📍 选中文本详情")

            first_hit = self.text_bbox_mapping[st.session_state.selected_text][0]
            stored_box = first_hit['bbox']

            text_col, geometry_col = st.columns(2)
            with text_col:
                st.write(f"**文本内容:** {st.session_state.selected_text[:30]}...")
                st.write(f"**类别:** {first_hit['category']}")

            with geometry_col:
                st.write(f"**位置:** [{', '.join(map(str, stored_box))}]")
                if len(stored_box) >= 4:
                    st.write(f"**大小:** {stored_box[2] - stored_box[0]} x {stored_box[3] - stored_box[1]} px")

    except Exception as e:
        st.error(f"❌ 图片处理失败: {e}")
-
def create_resized_interactive_plot(self, image: Image.Image, selected_bbox: Optional[List[int]], zoom_level: float, show_all_boxes: bool) -> go.Figure:
    """Build a Plotly figure sized to ``image`` with optional bbox overlays.

    Args:
        image: Already-resized picture used as the background.
        selected_bbox: ``[x1, y1, x2, y2]`` in the coordinates of ``image``,
            drawn as given; ``None`` draws no highlight.
        zoom_level: Factor applied to every stored bbox when all boxes are shown.
        show_all_boxes: Draw every box from ``text_bbox_mapping``.

    Texts in ``marked_errors`` are drawn red, all others light blue.
    """
    pic_w, pic_h = image.width, image.height
    figure = go.Figure()

    figure.add_layout_image(
        dict(
            source=image,
            xref="x", yref="y",
            x=0, y=pic_h,
            sizex=pic_w, sizey=pic_h,
            sizing="stretch", opacity=1.0, layer="below"
        )
    )

    def add_box(bx1, by1, bx2, by2, outline, fill):
        # Image rows grow downwards while the plot's y axis grows upwards.
        figure.add_shape(
            type="rect",
            x0=bx1, y0=pic_h - by2,
            x1=bx2, y1=pic_h - by1,
            line=outline,
            fillcolor=fill,
        )

    if show_all_boxes:
        for label, occurrences in self.text_bbox_mapping.items():
            if label in self.marked_errors:
                fill, edge = "rgba(255, 0, 0, 0.3)", "rgba(255, 0, 0, 1.0)"
            else:
                fill, edge = "rgba(0, 100, 200, 0.2)", "rgba(0, 100, 200, 0.8)"
            for occurrence in occurrences:
                stored = occurrence['bbox']
                if len(stored) < 4:
                    continue
                # Stored boxes are multiplied by the zoom factor.
                sx1, sy1, sx2, sy2 = [coord * zoom_level for coord in stored[:4]]
                add_box(sx1, sy1, sx2, sy2, dict(color=edge, width=1), fill)

    # The selected box goes last so it is drawn above the others.
    if selected_bbox and len(selected_bbox) >= 4:
        hx1, hy1, hx2, hy2 = selected_bbox[:4]
        add_box(hx1, hy1, hx2, hy2, dict(color="red", width=2), "rgba(255, 0, 0, 0.3)")

    figure.update_xaxes(visible=False, range=[0, pic_w])
    figure.update_yaxes(visible=False, range=[0, pic_h], scaleanchor="x")
    figure.update_layout(
        width=pic_w,
        height=pic_h,
        margin=dict(l=0, r=0, t=0, b=0),
        showlegend=False,
        plot_bgcolor='white'
    )

    return figure
-
def create_compact_layout(self, font_size: int = 12, zoom_level: float = 1.0):
    """Render the compact view: highlighted OCR text left, small annotated image right.

    Args:
        font_size: Preselected font size; falls back to 12 when it is not one
            of the offered sizes (10, 12, 14, 16, 18).
        zoom_level: Initial zoom; clamped to this layout's slider range
            (0.3-1.5). The image is shown ``400 * zoom`` pixels wide.

    Writes ``st.session_state.selected_text`` when a text is picked.
    """
    import re
    from html import escape

    font_choices = [10, 12, 14, 16, 18]
    control_col1, control_col2, control_col3 = st.columns([1, 1, 1])

    with control_col1:
        current_font_size = st.selectbox(
            "字体大小", font_choices,
            index=font_choices.index(font_size) if font_size in font_choices else 1,
            key="compact_font")

    with control_col2:
        content_height = st.selectbox("内容高度", [300, 400, 500, 600], index=1, key="compact_height")

    with control_col3:
        # The caller's zoom may lie outside this slider's narrower range;
        # a default beyond min/max is rejected by the slider, so clamp it.
        initial_zoom = min(max(float(zoom_level), 0.3), 1.5)
        current_zoom = st.slider("图片缩放", 0.3, 1.5, initial_zoom, 0.1, key="compact_zoom")

    left_col, right_col = st.columns([0.7, 1])

    with left_col:
        st.subheader("📄 OCR内容")

        # Text picker; entry 0 is the placeholder.
        if self.text_bbox_mapping:
            text_options = ["请选择文本..."] + list(self.text_bbox_mapping.keys())
            selected_index = st.selectbox(
                "快速定位文本",
                range(len(text_options)),
                format_func=lambda x: text_options[x][:30] + "..." if len(text_options[x]) > 30 else text_options[x],
                key="compact_text_selector"
            )

            if selected_index > 0:
                st.session_state.selected_text = text_options[selected_index]

        st.markdown(f"""
        <style>
        .compact-content {{
            height: {content_height}px;
            overflow-y: auto;
            font-size: {current_font_size}px !important;
            line-height: 1.4;
            border: 1px solid #ddd;
            padding: 10px;
            background-color: #fafafa !important;
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            color: #333333 !important;
        }}

        .highlight-text {{
            background-color: #ffeb3b !important;
            padding: 2px 4px;
            border-radius: 3px;
            cursor: pointer;
            color: #333333 !important;
        }}

        .selected-highlight {{
            background-color: #4caf50 !important;
            color: white !important;
        }}
        </style>
        """, unsafe_allow_html=True)

        if self.md_content:
            selected = st.session_state.selected_text
            highlighted_content = self.md_content

            # Texts of 3+ characters, longest first, so a long text wins over
            # a shorter one contained in it.
            targets = sorted(
                (text for text in self.text_bbox_mapping if len(text) > 2),
                key=len,
                reverse=True,
            )
            if targets:
                matcher = re.compile('|'.join(re.escape(text) for text in targets))

                def wrap(match):
                    hit = match.group(0)
                    css_class = "highlight-text selected-highlight" if hit == selected else "highlight-text"
                    # The title is an attribute value, so quotes must be escaped.
                    return f'<span class="{css_class}" title="{escape(hit[:50], quote=True)}...">{hit}</span>'

                # One pass over the original text only. Chained str.replace
                # calls also rewrote the spans inserted by earlier
                # replacements. Tags (odd split indices) are left untouched.
                pieces = re.split(r'(<[^>]+>)', self.md_content)
                highlighted_content = ''.join(
                    piece if idx % 2 else matcher.sub(wrap, piece)
                    for idx, piece in enumerate(pieces)
                )

            st.markdown(
                f'<div class="compact-content">{highlighted_content}</div>',
                unsafe_allow_html=True
            )

    with right_col:
        st.subheader("🖼️ 图片标注")

        if self.image_path and Path(self.image_path).exists():
            try:
                image = Image.open(self.image_path)

                # Fixed base width of 400 px times the zoom; height keeps the aspect ratio.
                display_width = int(400 * current_zoom)
                aspect_ratio = image.height / image.width
                display_height = int(display_width * aspect_ratio)

                resized_image = image.resize((display_width, display_height), Image.Resampling.LANCZOS)

                if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
                    info = self.text_bbox_mapping[st.session_state.selected_text][0]
                    bbox = info['bbox']

                    # The picture was resized by display size / original size,
                    # not by the slider value, so the box must use the same
                    # factors to land on the right spot.
                    scale_x = display_width / image.width
                    scale_y = display_height / image.height
                    scaled_bbox = [
                        int(bbox[0] * scale_x), int(bbox[1] * scale_y),
                        int(bbox[2] * scale_x), int(bbox[3] * scale_y),
                    ]

                    annotated_image = self.draw_bbox_on_image(resized_image, scaled_bbox, "red", 3)
                    st.image(annotated_image, use_column_width=True)

                    st.info(f"**选中:** {st.session_state.selected_text[:20]}...\n**位置:** [{', '.join(map(str, bbox))}]")
                else:
                    st.image(resized_image, use_column_width=True)

            except Exception as e:
                st.error(f"❌ 图片处理失败: {e}")
        else:
            st.error("未找到对应的图片文件")
def _init_session_state():
    """Create the session-state entries used by the app when they are missing.

    Also re-points ``validator.marked_errors`` at the session-level set on
    every run so both names refer to the same object.
    """
    if 'validator' not in st.session_state:
        st.session_state.validator = StreamlitOCRValidator()

    if 'selected_text' not in st.session_state:
        st.session_state.selected_text = None

    if 'marked_errors' not in st.session_state:
        st.session_state.marked_errors = set()

    # Sync the marked errors onto the validator.
    st.session_state.validator.marked_errors = st.session_state.marked_errors


def _render_sidebar():
    """Render the sidebar: OCR result file picker and the two reset buttons."""
    with st.sidebar:
        st.header("📁 文件选择")

        # Look for OCR result files under ./output (recursively).  The list is
        # sorted so the option order, and therefore the index=0 default, does
        # not depend on filesystem traversal order.
        output_dir = Path("output")
        if output_dir.exists():
            available_files = sorted(
                str(json_file) for json_file in output_dir.rglob("*.json")
            )
        else:
            available_files = []

        if available_files:
            selected_file = st.selectbox(
                "选择OCR结果文件",
                available_files,
                index=0,
            )

            if st.button("🔄 加载文件", type="primary") and selected_file:
                try:
                    st.session_state.validator.load_ocr_data(selected_file)
                    st.success("✅ 文件加载成功!")
                    st.rerun()
                except Exception as e:
                    st.error(f"❌ 加载失败: {e}")
        else:
            st.warning("未找到OCR结果文件")
            st.info("请确保output目录下有OCR结果文件")

        st.markdown("---")

        # Control panel
        st.header("🎛️ 控制面板")

        if st.button("🧹 清除选择"):
            st.session_state.selected_text = None
            st.rerun()

        if st.button("❌ 清除错误标记"):
            st.session_state.marked_errors = set()
            st.rerun()


def _render_statistics_header():
    """Show the four headline metrics.

    Returns:
        The statistics mapping returned by ``validator.get_statistics()``, or
        ``None`` when computing or displaying it raised (an error message has
        already been shown in that case).
    """
    try:
        stats = st.session_state.validator.get_statistics()

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("📊 总文本块", stats['total_texts'])
        with col2:
            st.metric("🔗 可点击文本", stats['clickable_texts'])
        with col3:
            st.metric("❌ 标记错误", stats['marked_errors'])
        with col4:
            st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")

        st.markdown("---")
    except Exception as e:
        st.error(f"❌ 统计信息计算失败: {e}")
        return None
    return stats


def _render_validation_tab():
    """Render the content-validation tab: layout controls plus the chosen layout."""
    validator = st.session_state.validator

    # Top control row
    control_col1, control_col2, control_col3, control_col4 = st.columns(4)

    with control_col1:
        layout_mode = st.selectbox(
            "布局模式",
            ["标准布局", "滚动布局", "紧凑布局"],
            key="layout_mode",
        )

    with control_col2:
        if layout_mode != "标准布局":
            # The selected height is not used in this function, so it is not
            # bound to a local; the widget is still created under its key.
            st.selectbox("内容高度", [400, 600, 800], index=1, key="content_height_select")

    with control_col3:
        font_size = st.selectbox("字体大小", [10, 12, 14, 16], index=1, key="font_size_select")

    with control_col4:
        zoom_level = st.slider("图片缩放", 0.3, 2.0, 1.0, 0.1, key="zoom_level_select")

    # Dispatch to the layout that matches the selected mode.
    if layout_mode == "滚动布局":
        validator.create_split_layout_with_fixed_image(font_size, zoom_level)
    elif layout_mode == "紧凑布局":
        validator.create_compact_layout(font_size, zoom_level)
    else:
        validator.create_standard_layout(font_size, zoom_level)


def _render_table_tab():
    """Render the table-analysis tab: preview, Excel export and per-table counts."""
    st.header("📊 表格数据分析")

    md_content = st.session_state.validator.md_content
    if not md_content:
        st.warning("请先加载OCR数据")
        return

    if '<table' not in md_content.lower():
        st.info("当前OCR结果中没有检测到表格数据")
        return

    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader("🔍 表格数据预览")
        st.session_state.validator.display_html_table_as_dataframe(md_content)

    with col2:
        st.subheader("⚙️ 表格操作")

        if st.button("📥 导出表格数据", type="primary"):
            try:
                tables = pd.read_html(StringIO(md_content))
                if tables:
                    output = BytesIO()
                    with pd.ExcelWriter(output, engine='openpyxl') as writer:
                        for idx, table in enumerate(tables, start=1):
                            table.to_excel(writer, sheet_name=f'Table_{idx}', index=False)

                    # Read the buffer only after the writer context has
                    # closed, so the workbook has been fully written.
                    st.download_button(
                        label="📥 下载Excel文件",
                        data=output.getvalue(),
                        file_name="ocr_tables.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )
            except Exception as e:
                st.error(f"导出失败: {e}")

        if st.button("🔍 表格统计分析"):
            try:
                tables = pd.read_html(StringIO(md_content))
                if tables:
                    st.write("**表格统计信息:**")
                    for idx, table in enumerate(tables, start=1):
                        st.write(f"表格 {idx}:")
                        st.write(f"- 行数: {len(table)}")
                        st.write(f"- 列数: {len(table.columns)}")
                        st.write(f"- 数值列数: {len(table.select_dtypes(include=[np.number]).columns)}")
            except Exception as e:
                st.error(f"统计分析失败: {e}")


def _render_stats_tab(stats):
    """Render the statistics tab: category pie chart and correct/error bar chart.

    Args:
        stats: Mapping with at least the keys ``'categories'``,
            ``'clickable_texts'`` and ``'marked_errors'``.
    """
    st.header("📈 OCR数据统计")

    if not stats:
        return

    # Category distribution chart (only when there is at least one category).
    categories = stats['categories']
    if categories:
        st.subheader("📊 类别分布")

        fig_pie = px.pie(
            values=list(categories.values()),
            names=list(categories.keys()),
            title="文本类别分布",
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    # Error-rate analysis
    st.subheader("📈 质量分析")

    accuracy_data = {
        '状态': ['正确', '错误'],
        '数量': [stats['clickable_texts'] - stats['marked_errors'], stats['marked_errors']],
    }

    fig_bar = px.bar(
        accuracy_data,
        x='状态',
        y='数量',
        title="识别质量分布",
        color='状态',
        color_discrete_map={'正确': 'green', '错误': 'red'},
    )
    st.plotly_chart(fig_bar, use_container_width=True)


def _render_navigation_tab():
    """Render the quick-navigation tab: one button per text, grouped by category."""
    st.header("🚀 快速导航")

    mapping = st.session_state.validator.text_bbox_mapping
    if not mapping:
        st.info("没有可用的文本项进行导航")
        return

    # Group texts by the category of their first mapping entry.
    categories = {}
    for text, info_list in mapping.items():
        categories.setdefault(info_list[0]['category'], []).append(text)

    # One expander per category, buttons laid out three per row.
    for category, texts in categories.items():
        with st.expander(f"{category} ({len(texts)}项)", expanded=False):
            cols = st.columns(3)
            for i, text in enumerate(texts):
                with cols[i % 3]:
                    display_text = text[:15] + "..." if len(text) > 15 else text
                    if st.button(display_text, key=f"nav_{category}_{i}"):
                        st.session_state.selected_text = text
                        st.rerun()


def main():
    """Main application.

    Draws the title, initialises session state, renders the sidebar, and,
    once OCR data is loaded and statistics are available, renders the four
    tabs (content validation, table analysis, statistics, quick navigation).
    Returns early when no OCR data is loaded or statistics cannot be shown.
    """
    st.title("🔍 OCR可视化校验工具")
    st.markdown("---")

    _init_session_state()
    _render_sidebar()

    # Main content area
    if not st.session_state.validator.ocr_data:
        st.info("👈 请在左侧选择并加载OCR结果文件")
        return

    stats = _render_statistics_header()
    if stats is None:
        return

    tab1, tab2, tab3, tab4 = st.tabs(["📄 内容校验", "📊 表格分析", "📈 数据统计", "🚀 快速导航"])

    with tab1:
        _render_validation_tab()

    with tab2:
        _render_table_tab()

    with tab3:
        _render_stats_tab(stats)

    with tab4:
        _render_navigation_tab()
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|