#!/usr/bin/env python3
# Origin: zhengchun/ocr_verify
"""
基于Streamlit的OCR可视化校验工具（修复版）
提供丰富的交互组件和更好的用户体验
"""

import streamlit as st
import json
import pandas as pd
from pathlib import Path
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2
import base64
from typing import Dict, List, Optional, Tuple
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Page-level settings for the app: title, icon, wide layout and an
# initially expanded sidebar.
_PAGE_SETTINGS = {
    "page_title": "OCR可视化校验工具",
    "page_icon": "🔍",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS for the page. It is passed to st.markdown with
# unsafe_allow_html=True so the <style> element is emitted as raw HTML.
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }
    
    .stSelectbox > div > div > div {
        background-color: #f0f2f6;
    }
    
    .clickable-text {
        background-color: #e1f5fe;
        padding: 2px 6px;
        border-radius: 4px;
        border: 1px solid #0288d1;
        cursor: pointer;
        margin: 2px;
        display: inline-block;
    }
    
    .selected-text {
        background-color: #fff3e0;
        border-color: #ff9800;
        font-weight: bold;
    }
    
    .error-text {
        background-color: #ffebee;
        border-color: #f44336;
        color: #d32f2f;
    }
    
    .stats-container {
        background-color: #f8f9fa;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #28a745;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

class StreamlitOCRValidator:
    """State and helpers for reviewing one OCR result inside the Streamlit app.

    An instance holds the raw OCR items, the companion Markdown text, the path
    of the source image, and a mapping from recognised text to its bounding
    boxes. User feedback (errors, warnings) is reported through ``st`` calls
    rather than by raising.
    """

    def __init__(self):
        # Raw OCR items as read from the JSON file.
        self.ocr_data = []
        # Content of the companion Markdown file ("" when there is none).
        self.md_content = ""
        # Path of the page image ("" when none was found).
        self.image_path = ""
        # text -> list of {'bbox', 'category', 'index', 'confidence'} dicts.
        self.text_bbox_mapping = {}
        self.selected_text = None
        # Texts the reviewer flagged as wrongly recognised.
        self.marked_errors = set()

    def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
        """Load an OCR JSON result plus its Markdown and image companions.

        Args:
            json_path: Path of the OCR result. The JSON must be either a list
                of items or a dict with a ``'results'`` entry.
            md_path: Markdown file to load. When ``None`` the JSON path with a
                ``.md`` suffix is tried.
            image_path: Image to display. When ``None`` a ``.png``/``.jpg``
                named after the JSON stem is searched for in ``./sample_data``
                and then next to the JSON file.

        If the JSON cannot be read or has an unsupported shape, an error is
        shown via ``st.error`` and the previously loaded state is left
        untouched. On success the Markdown content and image path are always
        reset first, so a file without companions never shows the previous
        file's Markdown or picture.
        """
        json_file = Path(json_path)

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            st.error(f"❌ 加载JSON文件失败: {e}")
            return

        # Accept either a bare list or a {'results': [...]} wrapper.
        if isinstance(data, list):
            self.ocr_data = data
        elif isinstance(data, dict) and 'results' in data:
            self.ocr_data = data['results']
        else:
            st.error(f"❌ 不支持的JSON格式: {json_path}")
            return

        # Drop companions of any previously loaded file before looking for
        # the new ones.
        self.md_content = ""
        self.image_path = ""

        md_file = json_file.with_suffix('.md') if md_path is None else Path(md_path)
        if md_file.exists():
            with open(md_file, 'r', encoding='utf-8') as f:
                self.md_content = f.read()

        if image_path is None:
            image_name = json_file.stem
            search_dirs = (Path("./sample_data"), json_file.parent)
            candidates = (
                directory / f"{image_name}{suffix}"
                for directory in search_dirs
                for suffix in (".png", ".jpg")
            )
            # First existing candidate wins; "" when nothing matches.
            self.image_path = next(
                (str(candidate) for candidate in candidates if candidate.exists()),
                "",
            )
        else:
            self.image_path = image_path

        self.process_data()

    @staticmethod
    def _is_valid_bbox(bbox) -> bool:
        """Return True when *bbox* is a list of exactly four real numbers."""
        return (
            isinstance(bbox, list)
            and len(bbox) == 4
            and all(
                isinstance(value, (int, float)) and not isinstance(value, bool)
                for value in bbox
            )
        )

    def process_data(self):
        """Rebuild ``text_bbox_mapping`` from ``ocr_data``.

        Only dict items that carry both ``'text'`` and ``'bbox'`` are used.
        Items whose stripped text is empty or equals ``'Picture'``, and items
        whose bbox is not a list of four numbers, are skipped. Identical texts
        share one key and accumulate one entry per occurrence.
        """
        self.text_bbox_mapping = {}

        if not isinstance(self.ocr_data, list):
            st.warning("⚠️ OCR数据格式不正确，期望列表格式")
            return

        for index, item in enumerate(self.ocr_data):
            if not isinstance(item, dict) or 'text' not in item or 'bbox' not in item:
                continue

            text = str(item['text']).strip()
            if not text or text == 'Picture':
                continue

            bbox = item['bbox']
            # Non-numeric coordinates would fail later in the plot arithmetic.
            if not self._is_valid_bbox(bbox):
                continue

            self.text_bbox_mapping.setdefault(text, []).append({
                'bbox': bbox,
                'category': item.get('category', 'Text'),
                'index': index,
                'confidence': item.get('confidence', 1.0)
            })

    def draw_bbox_on_image(self, image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
        """Return an RGB copy of *image* with *bbox* outlined and lightly tinted.

        Args:
            image: Source image; it is not modified.
            bbox: ``[x1, y1, x2, y2]`` in image pixel coordinates.
            color: Outline colour. ``"red"``, ``"blue"`` and ``"green"`` get a
                matching translucent fill; any other value gets a yellow fill.
            width: Outline width in pixels.
        """
        img_copy = image.copy()
        x1, y1, x2, y2 = bbox

        ImageDraw.Draw(img_copy).rectangle([x1, y1, x2, y2], outline=color, width=width)

        # Translucent fill (alpha 30 of 255), drawn on a separate RGBA layer.
        fills = {
            "red": (255, 0, 0, 30),
            "blue": (0, 0, 255, 30),
            "green": (0, 255, 0, 30),
        }
        fill_color = fills.get(color, (255, 255, 0, 30))

        overlay = Image.new('RGBA', img_copy.size, (0, 0, 0, 0))
        ImageDraw.Draw(overlay).rectangle([x1, y1, x2, y2], fill=fill_color)

        return Image.alpha_composite(img_copy.convert('RGBA'), overlay).convert('RGB')

    def create_interactive_plot(self, image: Image.Image, selected_bbox: Optional[List[int]] = None) -> go.Figure:
        """Build a Plotly figure showing *image* with every known bbox drawn on it.

        Boxes of texts in ``marked_errors`` are red, all others light blue.
        *selected_bbox*, when given, is drawn last with a thick red outline.

        Bounding boxes use image coordinates (y grows downwards) while the
        figure's y axis spans ``[0, image.height]``, so each y value is
        mirrored as ``image.height - y``.
        """
        fig = go.Figure()
        img_w, img_h = image.width, image.height

        fig.add_layout_image(
            dict(
                source=image,
                xref="x",
                yref="y",
                x=0,
                y=img_h,
                sizex=img_w,
                sizey=img_h,
                sizing="stretch",
                opacity=1.0,
                layer="below"
            )
        )

        def add_box(bbox, line_color, line_width, fill):
            # Mirror the y coordinates into the figure's axis orientation.
            x1, y1, x2, y2 = bbox[:4]
            fig.add_shape(
                type="rect",
                x0=x1, y0=img_h - y2,
                x1=x2, y1=img_h - y1,
                line=dict(color=line_color, width=line_width),
                fillcolor=fill,
            )

        # (outline colour, fill colour) pairs.
        normal_colors = ("rgba(0, 100, 200, 0.8)", "rgba(0, 100, 200, 0.2)")
        error_colors = ("rgba(255, 0, 0, 1.0)", "rgba(255, 0, 0, 0.3)")

        for text, info_list in self.text_bbox_mapping.items():
            line_color, fill = error_colors if text in self.marked_errors else normal_colors
            for info in info_list:
                bbox = info['bbox']
                if len(bbox) >= 4:
                    add_box(bbox, line_color, 1, fill)

        if selected_bbox and len(selected_bbox) >= 4:
            add_box(selected_bbox, "red", 3, "rgba(255, 0, 0, 0.2)")

        fig.update_xaxes(
            visible=False,
            range=[0, img_w]
        )

        # scaleanchor ties the y scale to x.
        fig.update_yaxes(
            visible=False,
            range=[0, img_h],
            scaleanchor="x"
        )

        fig.update_layout(
            width=800,
            height=600,
            margin=dict(l=0, r=0, t=0, b=0),
            xaxis_showgrid=False,
            yaxis_showgrid=False,
            plot_bgcolor='white'
        )

        return fig

    def get_statistics(self) -> Dict:
        """Summarise the loaded OCR data.

        Returns:
            A dict with ``total_texts`` (number of OCR items),
            ``clickable_texts`` (distinct texts with a usable bbox),
            ``marked_errors`` (flagged texts present in the current mapping),
            ``categories`` (item count per category) and ``accuracy_rate``
            (percentage of clickable texts that are not flagged).

        Flags for texts that are not in the current mapping (for example left
        over from a previously loaded file) are ignored, which keeps
        ``accuracy_rate`` within 0-100.
        """
        if not isinstance(self.ocr_data, list) or not self.ocr_data:
            return {
                'total_texts': 0,
                'clickable_texts': 0,
                'marked_errors': 0,
                'categories': {},
                'accuracy_rate': 0
            }

        clickable_texts = len(self.text_bbox_mapping)
        marked_errors = sum(1 for text in self.marked_errors if text in self.text_bbox_mapping)

        categories = {}
        for item in self.ocr_data:
            if isinstance(item, dict):
                category = item.get('category', 'Unknown')
            elif isinstance(item, str):
                # Bare strings are counted as plain text.
                category = 'Text'
            else:
                category = 'Unknown'
            categories[category] = categories.get(category, 0) + 1

        if clickable_texts > 0:
            accuracy_rate = (clickable_texts - marked_errors) / clickable_texts * 100
        else:
            accuracy_rate = 0

        return {
            'total_texts': len(self.ocr_data),
            'clickable_texts': clickable_texts,
            'marked_errors': marked_errors,
            'categories': categories,
            'accuracy_rate': accuracy_rate
        }

    def convert_html_table_to_markdown(self, content: str) -> str:
        """Replace every ``<table>...</table>`` in *content* with a Markdown pipe table.

        Rows (``<tr>``) and cells (``<td>`` or ``<th>``) may carry attributes.
        The first row that yields cells becomes the header and is followed by
        a ``---`` separator row. Inner tags are stripped, HTML entities are
        decoded, line breaks inside a cell are collapsed to a space and ``|``
        is escaped so it cannot split the cell. A table in which no cells are
        found is left unchanged.
        """
        import re
        from html import unescape

        row_pattern = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL | re.IGNORECASE)
        # \b keeps <thead> from being mistaken for a <th> cell.
        cell_pattern = re.compile(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', re.DOTALL | re.IGNORECASE)

        def clean_cell(cell_html):
            text = unescape(re.sub(r'<[^>]+>', '', cell_html).strip())
            text = re.sub(r'\s*\n\s*', ' ', text)
            return text.replace('|', '\\|')

        def replace_table(match):
            table_html = match.group(0)

            markdown_rows = []
            for row_html in row_pattern.findall(table_html):
                cells = cell_pattern.findall(row_html)
                if not cells:
                    continue

                clean_cells = [clean_cell(cell) for cell in cells]
                markdown_rows.append('| ' + ' | '.join(clean_cells) + ' |')

                # The separator goes after the first row actually emitted.
                if len(markdown_rows) == 1:
                    markdown_rows.append('| ' + ' | '.join(['---'] * len(clean_cells)) + ' |')

            return '\n'.join(markdown_rows) if markdown_rows else table_html

        return re.sub(r'<table[^>]*>.*?</table>', replace_table, content, flags=re.DOTALL | re.IGNORECASE)

    def render_markdown_with_options(self, markdown_content: str, table_format: str = "grid", escape_html: bool = True):
        """Render Markdown to an HTML string, optionally wrapped in a styled ``<div>``.

        Args:
            markdown_content: Markdown source.
            table_format: ``"grid"`` wraps the result in
                ``<div class="markdown-grid">``, ``"list"`` in
                ``<div class="markdown-list">``; any other value returns the
                rendered HTML unwrapped.
            escape_html: When true, HTML tables are first converted to
                Markdown tables (despite the name, nothing is HTML-escaped).

        Returns:
            The HTML string.
        """
        import markdown

        if escape_html:
            markdown_content = self.convert_html_table_to_markdown(markdown_content)

        # NOTE(review): called without extensions; presumably pipe tables need
        # the 'tables' extension to render as <table> — confirm.
        html_content = markdown.markdown(markdown_content)

        if table_format == "grid":
            wrapped_content = f"""
            <div class="markdown-grid">
                {html_content}
            </div>
            """
        elif table_format == "list":
            wrapped_content = f"""
            <div class="markdown-list">
                {html_content}
            </div>
            """
        else:
            wrapped_content = html_content

        return wrapped_content

    def display_html_table_as_dataframe(self, html_content: str):
        """Show each HTML table in *html_content* as a Streamlit dataframe.

        Tables are parsed with ``pandas.read_html``. If parsing raises, the
        error is shown and the content is rendered as raw HTML instead.
        """
        import pandas as pd
        from io import StringIO

        try:
            tables = pd.read_html(StringIO(html_content))
            if tables:
                for i, table in enumerate(tables):
                    st.subheader(f"表格 {i+1}")
                    st.dataframe(table, use_container_width=True)
        except Exception as e:
            st.error(f"表格解析失败: {e}")
            # Fall back to raw HTML rendering.
            st.markdown(html_content, unsafe_allow_html=True)

def _select_text(text):
    """Button callback: make *text* the active selection.

    The text dropdown (widget key ``text_selector``) is reset to its
    placeholder entry as well. Otherwise the dropdown would re-apply its own
    choice on the rerun that follows the click and override this selection.
    """
    st.session_state.selected_text = text
    st.session_state.text_selector = 0


def _clear_selection():
    """Button callback: drop the active selection and reset the text dropdown."""
    st.session_state.selected_text = None
    st.session_state.text_selector = 0


def _render_sidebar(validator: "StreamlitOCRValidator"):
    """Render the sidebar: OCR file picker, control buttons and debug info."""
    with st.sidebar:
        st.header("📁 文件选择")

        # Every *.json below ./output is offered as an OCR result file.
        output_dir = Path("output")
        available_files = []
        if output_dir.exists():
            available_files = [str(json_file) for json_file in output_dir.rglob("*.json")]

        if available_files:
            selected_file = st.selectbox(
                "选择OCR结果文件",
                available_files,
                index=0
            )

            if st.button("🔄 加载文件", type="primary") and selected_file:
                try:
                    validator.load_ocr_data(selected_file)
                except Exception as e:
                    st.error(f"❌ 加载失败: {e}")
                else:
                    st.success("✅ 文件加载成功！")
                    st.rerun()  # rerun so the page reflects the new data
        else:
            st.warning("未找到OCR结果文件")
            st.info("请确保output目录下有OCR结果文件")

        st.markdown("---")

        st.header("🎛️ 控制面板")

        # The callback runs before the script reruns, so no explicit rerun is
        # needed here.
        st.button("🧹 清除选择", on_click=_clear_selection)

        if st.button("❌ 清除错误标记"):
            st.session_state.marked_errors = set()
            st.rerun()

        if st.checkbox("🔧 调试信息"):
            st.write("**当前状态:**")
            st.write(f"- OCR数据项数: {len(validator.ocr_data)}")
            st.write(f"- 可点击文本: {len(validator.text_bbox_mapping)}")
            st.write(f"- 选中文本: {st.session_state.selected_text}")
            st.write(f"- 标记错误数: {len(st.session_state.marked_errors)}")

            if validator.ocr_data:
                st.write("**数据类型检查:**")
                sample_item = validator.ocr_data[0]
                st.write(f"- 第一项类型: {type(sample_item)}")
                if isinstance(sample_item, dict):
                    st.write(f"- 第一项键: {list(sample_item.keys())}")


def _render_statistics(validator: "StreamlitOCRValidator") -> bool:
    """Render the four summary metrics; return False if that failed."""
    try:
        stats = validator.get_statistics()

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("📊 总文本块", stats['total_texts'])
        with col2:
            st.metric("🔗 可点击文本", stats['clickable_texts'])
        with col3:
            st.metric("❌ 标记错误", stats['marked_errors'])
        with col4:
            st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")

        st.markdown("---")
    except Exception as e:
        st.error(f"❌ 统计信息计算失败: {e}")
        return False
    return True


def _render_markdown_preview(validator: "StreamlitOCRValidator"):
    """Render the Markdown content with a line filter and a render-mode switch."""
    search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")

    display_content = validator.md_content
    if search_term:
        # Case-insensitive, line-based filter.
        needle = search_term.lower()
        filtered_lines = [line for line in display_content.split('\n') if needle in line.lower()]
        display_content = '\n'.join(filtered_lines)
        if filtered_lines:
            st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
        else:
            st.warning(f"未找到包含 '{search_term}' 的内容")

    render_mode = st.radio(
        "选择渲染方式",
        ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],
        horizontal=True
    )

    if render_mode == "HTML渲染":
        # unsafe_allow_html lets embedded HTML tables render as tables.
        st.markdown(display_content, unsafe_allow_html=True)
    elif render_mode == "Markdown渲染":
        converted_content = validator.convert_html_table_to_markdown(display_content)
        st.markdown(converted_content)
    elif render_mode == "DataFrame表格":
        # Match '<table' rather than '<table>' so tables with attributes
        # (e.g. <table border="1">) are detected too.
        if '<table' in display_content.lower():
            validator.display_html_table_as_dataframe(display_content)
        else:
            st.info("当前内容中没有检测到HTML表格")
            st.markdown(display_content)
    else:
        st.text_area(
            "MD内容预览",
            display_content,
            height=300,
            help="OCR识别的文本内容"
        )


def _render_text_panel(validator: "StreamlitOCRValidator"):
    """Render the left column: text dropdown, Markdown preview and text buttons."""
    st.header("📄 OCR识别内容")

    mapping = validator.text_bbox_mapping

    if mapping:
        # Index 0 is a placeholder meaning "nothing chosen in the dropdown".
        text_options = ["请选择文本..."] + list(mapping)
        selected_index = st.selectbox(
            "选择要校验的文本",
            range(len(text_options)),
            format_func=lambda x: text_options[x],
            key="text_selector"
        )

        if selected_index > 0:
            st.session_state.selected_text = text_options[selected_index]
    else:
        st.warning("没有找到可点击的文本")

    if validator.md_content:
        _render_markdown_preview(validator)

    st.subheader("🎯 可点击文本列表")

    if mapping:
        for text in mapping:
            is_selected = (text == st.session_state.selected_text)
            is_error = (text in st.session_state.marked_errors)

            button_col, error_col = st.columns([4, 1])

            with button_col:
                button_type = "primary" if is_selected else "secondary"
                # Selecting through a callback also resets the dropdown.
                st.button(
                    f"📍 {text}",
                    key=f"btn_{text}",
                    type=button_type,
                    on_click=_select_text,
                    args=(text,),
                )

            with error_col:
                if is_error:
                    if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
                        st.session_state.marked_errors.discard(text)
                        st.rerun()
                else:
                    if st.button("❌", key=f"error_{text}", help="标记为错误"):
                        st.session_state.marked_errors.add(text)
                        st.rerun()
    else:
        st.info("没有可点击的文本项目")


def _render_image_panel(validator: "StreamlitOCRValidator"):
    """Render the right column: annotated image and details of the selection."""
    st.header("🖼️ 原图标注")

    if not (validator.image_path and Path(validator.image_path).exists()):
        st.error("未找到对应的图片文件")
        if validator.image_path:
            st.write(f"期望路径: {validator.image_path}")
        return

    try:
        image = Image.open(validator.image_path)

        selected_text = st.session_state.selected_text
        # Only the first bbox of a text is highlighted and described.
        info = None
        if selected_text and selected_text in validator.text_bbox_mapping:
            info = validator.text_bbox_mapping[selected_text][0]

        selected_bbox = info['bbox'] if info is not None else None
        fig = validator.create_interactive_plot(image, selected_bbox)
        st.plotly_chart(fig, use_container_width=True)

        if selected_text:
            st.subheader("📍 选中文本详情")

            if info is not None:
                bbox = info['bbox']

                info_col1, info_col2 = st.columns(2)
                with info_col1:
                    st.write(f"**文本内容:** {selected_text}")
                    st.write(f"**类别:** {info['category']}")
                    st.write(f"**置信度:** {info.get('confidence', 'N/A')}")

                with info_col2:
                    st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
                    if len(bbox) >= 4:
                        st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
                        st.write(f"**高度:** {bbox[3] - bbox[1]} px")

                if selected_text in st.session_state.marked_errors:
                    st.error("⚠️ 此文本已标记为错误")
                else:
                    st.success("✅ 此文本未标记错误")
    except Exception as e:
        st.error(f"❌ 图片处理失败: {e}")


def main():
    """Entry point of the Streamlit app.

    Initialises session state, renders the sidebar and, once OCR data is
    loaded, the statistics row followed by the text and image columns.
    """
    st.title("🔍 OCR可视化校验工具")
    st.markdown("---")

    # Session state keeps these values across Streamlit reruns.
    if 'validator' not in st.session_state:
        st.session_state.validator = StreamlitOCRValidator()

    if 'selected_text' not in st.session_state:
        st.session_state.selected_text = None

    if 'marked_errors' not in st.session_state:
        st.session_state.marked_errors = set()

    validator = st.session_state.validator
    # The validator reads the error marks when drawing and computing stats.
    validator.marked_errors = st.session_state.marked_errors

    _render_sidebar(validator)

    if not validator.ocr_data:
        st.info("👈 请在左侧选择并加载OCR结果文件")
        return

    if not _render_statistics(validator):
        return

    left_col, right_col = st.columns([1, 1])

    with left_col:
        _render_text_panel(validator)

    with right_col:
        _render_image_panel(validator)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()