2 luni în urmă · 0b222ea9d3
--- a/streamlit_ocr_validator.py
+++ b/streamlit_ocr_validator.py
@@ -16,6 +16,7 @@ from typing import Dict, List, Optional, Tuple
 
				 import plotly.express as px
			
 
				 import plotly.graph_objects as go
			
 
				 from plotly.subplots import make_subplots
			
 
				+from io import StringIO, BytesIO
			
 
				 
			
 
				 # 设置页面配置
			
 
				 st.set_page_config(
			
@@ -25,17 +26,49 @@ st.set_page_config(
 
				     initial_sidebar_state="expanded"
			
 
				 )
			
 
				 
			
 
				-# 自定义CSS样式
			
 
				+# 自定义CSS样式 - 修复背景和文字颜色
			
 
				 st.markdown("""
			
 
				 <style>
			
 
				+    /* 设置主体背景为白色 */
			
 
				     .main > div {
			
 
				         padding-top: 2rem;
			
 
				+        background-color: white !important;
			
 
				+        color: #333333 !important;
			
 
				     }
			
 
				     
			
 
				+    /* 设置整体页面背景 */
			
 
				+    .stApp {
			
 
				+        background-color: white !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 设置内容区域背景 */
			
 
				+    .block-container {
			
 
				+        background-color: white !important;
			
 
				+        color: #333333 !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 设置侧边栏样式 */
			
 
				+    .css-1d391kg {
			
 
				+        background-color: #f8f9fa !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 设置选择框样式 */
			
 
				     .stSelectbox > div > div > div {
			
 
				-        background-color: #f0f2f6;
			
 
				+        background-color: #f0f2f6 !important;
			
 
				+        color: #333333 !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 设置标题样式 */
			
 
				+    h1, h2, h3, h4, h5, h6 {
			
 
				+        color: #333333 !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 设置文本样式 */
			
 
				+    p, div, span, label {
			
 
				+        color: #333333 !important;
			
 
				     }
			
 
				     
			
 
				+    /* 可点击文本样式 */
			
 
				     .clickable-text {
			
 
				         background-color: #e1f5fe;
			
 
				         padding: 2px 6px;
			
@@ -44,18 +77,20 @@ st.markdown("""
 
				         cursor: pointer;
			
 
				         margin: 2px;
			
 
				         display: inline-block;
			
 
				+        color: #0288d1 !important;
			
 
				     }
			
 
				     
			
 
				     .selected-text {
			
 
				         background-color: #fff3e0;
			
 
				         border-color: #ff9800;
			
 
				         font-weight: bold;
			
 
				+        color: #ff9800 !important;
			
 
				     }
			
 
				     
			
 
				     .error-text {
			
 
				         background-color: #ffebee;
			
 
				         border-color: #f44336;
			
 
				-        color: #d32f2f;
			
 
				+        color: #d32f2f !important;
			
 
				     }
			
 
				     
			
 
				     .stats-container {
			
@@ -63,6 +98,42 @@ st.markdown("""
 
				         padding: 1rem;
			
 
				         border-radius: 8px;
			
 
				         border-left: 4px solid #28a745;
			
 
				+        color: #333333 !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 修复滚动内容区域样式 */
			
 
				+    .scrollable-content {
			
 
				+        background-color: #fafafa !important;
			
 
				+        color: #333333 !important;
			
 
				+        border: 1px solid #ddd !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 修复紧凑内容样式 */
			
 
				+    .compact-content {
			
 
				+        background-color: #fafafa !important;
			
 
				+        color: #333333 !important;
			
 
				+        border: 1px solid #ddd !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 高亮文本样式 */
			
 
				+    .highlight-text {
			
 
				+        background-color: #ffeb3b !important;
			
 
				+        color: #333333 !important;
			
 
				+        padding: 2px 4px;
			
 
				+        border-radius: 3px;
			
 
				+        cursor: pointer;
			
 
				+    }
			
 
				+    
			
 
				+    .selected-highlight {
			
 
				+        background-color: #4caf50 !important;
			
 
				+        color: white !important;
			
 
				+    }
			
 
				+    
			
 
				+    /* 标准布局内容样式 */
			
 
				+    .standard-content {
			
 
				+        background-color: #fafafa !important;
			
 
				+        color: #333333 !important;
			
 
				+        border: 1px solid #ddd !important;
			
 
				     }
			
 
				 </style>
			
 
				 """, unsafe_allow_html=True)
			
@@ -337,43 +408,8 @@ class StreamlitOCRValidator:
 
				         converted = re.sub(r'<table[^>]*>.*?</table>', replace_table, content, flags=re.DOTALL | re.IGNORECASE)
			
 
				         return converted
			
 
				     
			
 
				-    def render_markdown_with_options(self, markdown_content: str, table_format: str = "grid", escape_html: bool = True):
			
 
				-        """自定义Markdown渲染方法，支持多种选项"""
			
 
				-        import markdown
			
 
				-        
			
 
				-        # 处理HTML表格
			
 
				-        if escape_html:
			
 
				-            markdown_content = self.convert_html_table_to_markdown(markdown_content)
			
 
				-        
			
 
				-        # 渲染Markdown
			
 
				-        html_content = markdown.markdown(markdown_content)
			
 
				-        
			
 
				-        # 根据选项包裹在特定的HTML结构中
			
 
				-        if table_format == "grid":
			
 
				-            # 网格布局
			
 
				-            wrapped_content = f"""
			
 
				-            <div class="markdown-grid">
			
 
				-                {html_content}
			
 
				-            </div>
			
 
				-            """
			
 
				-        elif table_format == "list":
			
 
				-            # 列表布局
			
 
				-            wrapped_content = f"""
			
 
				-            <div class="markdown-list">
			
 
				-                {html_content}
			
 
				-            </div>
			
 
				-            """
			
 
				-        else:
			
 
				-            # 默认直接返回
			
 
				-            wrapped_content = html_content
			
 
				-        
			
 
				-        return wrapped_content
			
 
				-    
			
 
				     def display_html_table_as_dataframe(self, html_content: str, enable_editing: bool = False):
			
 
				         """将HTML表格解析为DataFrame显示"""
			
 
				-        import pandas as pd
			
 
				-        from io import StringIO, BytesIO
			
 
				-        
			
 
				         try:
			
 
				             # 使用pandas直接读取HTML表格
			
 
				             tables = pd.read_html(StringIO(html_content))
			
@@ -495,6 +531,514 @@ class StreamlitOCRValidator:
 
				             st.info("尝试使用HTML渲染模式查看表格")
			
 
				             # 回退到HTML渲染
			
 
				             st.markdown(html_content, unsafe_allow_html=True)
			
 
				+    
			
 
    def create_standard_layout(self, font_size: int = 12, zoom_level: float = 1.0) -> None:
        """Render the standard two-column validation view.

        The left column shows a text selector, a line-based search box and
        the OCR markdown content in one of four render modes. The right
        column shows the source image as a Plotly figure with the selected
        text's bounding box, followed by details and error-marking buttons.

        Args:
            font_size: Font size in pixels applied to the rendered content
                through the ``.standard-content-display`` CSS class.
            zoom_level: Initial value of the image zoom slider (the slider
                ranges from 0.3 to 2.0 in steps of 0.1).
        """
        # Main content area: a narrower text column next to a wider image column.
        left_col, right_col = st.columns([0.7, 1])

        with left_col:
            st.header("📄 OCR识别内容")

            # Text selector. Index 0 is a placeholder entry; real texts start at 1.
            if self.text_bbox_mapping:
                text_options = ["请选择文本..."] + list(self.text_bbox_mapping.keys())
                selected_index = st.selectbox(
                    "选择要校验的文本",
                    range(len(text_options)),
                    # Labels longer than 50 characters are truncated with an ellipsis.
                    format_func=lambda x: text_options[x][:50] + "..." if len(text_options[x]) > 50 else text_options[x],
                    key="standard_text_selector"
                )

                # Selecting the placeholder leaves the previous selection untouched.
                if selected_index > 0:
                    st.session_state.selected_text = text_options[selected_index]
            else:
                st.warning("没有找到可点击的文本")

            # Show the markdown content, optionally filtered by a search term.
            if self.md_content:
                search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...", key="standard_search")

                display_content = self.md_content
                if search_term:
                    # Case-insensitive filter: keep only the lines containing the term.
                    lines = display_content.split('\n')
                    filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
                    display_content = '\n'.join(filtered_lines)
                    if filtered_lines:
                        st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
                    else:
                        st.warning(f"未找到包含 '{search_term}' 的内容")

                # Render mode selection.
                render_mode = st.radio(
                    "选择渲染方式",
                    ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],
                    horizontal=True,
                    key="standard_render_mode"
                )

                # Inject a CSS class that applies the requested font size to the content.
                content_style = f"""
                <style>
                .standard-content-display {{
                    font-size: {font_size}px !important;
                    line-height: 1.4;
                    color: #333333 !important;
                    background-color: #fafafa !important;
                    padding: 10px;
                    border-radius: 5px;
                    border: 1px solid #ddd;
                }}
                </style>
                """
                st.markdown(content_style, unsafe_allow_html=True)

                if render_mode == "HTML渲染":
                    # The content is injected as raw HTML (unsafe_allow_html=True).
                    st.markdown(f'<div class="standard-content-display">{display_content}</div>', unsafe_allow_html=True)
                elif render_mode == "Markdown渲染":
                    # HTML tables are converted to Markdown tables first.
                    # NOTE(review): the converted Markdown is still wrapped in an HTML
                    # <div>; confirm Streamlit renders it as Markdown rather than as
                    # literal text.
                    converted_content = self.convert_html_table_to_markdown(display_content)
                    st.markdown(f'<div class="standard-content-display">{converted_content}</div>', unsafe_allow_html=True)
                elif render_mode == "DataFrame表格":
                    if '<table' in display_content.lower():
                        self.display_html_table_as_dataframe(display_content)
                    else:
                        # No table present: tell the user and fall back to the HTML view.
                        st.info("当前内容中没有检测到HTML表格")
                        st.markdown(f'<div class="standard-content-display">{display_content}</div>', unsafe_allow_html=True)
                else:
                    # Remaining mode: show the content in a plain text area.
                    st.text_area(
                        "MD内容预览",
                        display_content,
                        height=300,
                        help="OCR识别的文本内容",
                        key="standard_text_area"
                    )

        with right_col:
            st.header("🖼️ 原图标注")

            # Image zoom slider and "show all boxes" toggle.
            col1, col2 = st.columns(2)
            with col1:
                current_zoom = st.slider("图片缩放", 0.3, 2.0, zoom_level, 0.1, key="standard_zoom_level")
            with col2:
                show_all_boxes = st.checkbox("显示所有框", value=False, key="standard_show_all_boxes")

            if self.image_path and Path(self.image_path).exists():
                try:
                    image = Image.open(self.image_path)

                    # Apply the zoom level by resizing the image itself.
                    if current_zoom != 1.0:
                        new_width = int(image.width * current_zoom)
                        new_height = int(image.height * current_zoom)
                        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

                    # Bounding box of the first mapping entry for the selected text,
                    # scaled by the same zoom factor as the image.
                    selected_bbox = None
                    if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
                        info = self.text_bbox_mapping[st.session_state.selected_text][0]
                        bbox = info['bbox']
                        if current_zoom != 1.0:
                            bbox = [int(coord * current_zoom) for coord in bbox]
                        selected_bbox = bbox

                    # Build the interactive figure.
                    if show_all_boxes:
                        # "Show all boxes" mode delegates to create_interactive_plot.
                        # NOTE(review): `image` may already be resized here and
                        # create_interactive_plot is not visible in this view — confirm
                        # it scales the non-selected boxes consistently with the zoom.
                        fig = self.create_interactive_plot(image, selected_bbox)
                    else:
                        # Only the selected box is drawn.
                        fig = go.Figure()

                        # Use the image as the figure background, anchored at its top-left.
                        fig.add_layout_image(
                            dict(
                                source=image,
                                xref="x", yref="y",
                                x=0, y=image.height,
                                sizex=image.width, sizey=image.height,
                                sizing="stretch", opacity=1.0, layer="below"
                            )
                        )

                        # Draw only the selected bbox. The y values are flipped
                        # (image.height - y) because the y axis range runs from 0 at
                        # the bottom to image.height at the top.
                        if selected_bbox and len(selected_bbox) >= 4:
                            x1, y1, x2, y2 = selected_bbox[:4]
                            fig.add_shape(
                                type="rect",
                                x0=x1, y0=image.height-y2,
                                x1=x2, y1=image.height-y1,
                                line=dict(color="red", width=3),
                                fillcolor="rgba(255, 0, 0, 0.2)",
                            )

                        # Layout: hidden axes spanning the image, y scale locked to x.
                        fig.update_xaxes(visible=False, range=[0, image.width])
                        fig.update_yaxes(visible=False, range=[0, image.height], scaleanchor="x")

                        fig.update_layout(
                            width=800, height=600,
                            margin=dict(l=0, r=0, t=0, b=0),
                            xaxis_showgrid=False, yaxis_showgrid=False,
                            plot_bgcolor='white'
                        )

                    st.plotly_chart(fig, use_container_width=True, key="standard_plot")

                    # Details of the selected text.
                    if st.session_state.selected_text:
                        st.subheader("📍 选中文本详情")

                        if st.session_state.selected_text in self.text_bbox_mapping:
                            info = self.text_bbox_mapping[st.session_state.selected_text][0]
                            # Unscaled bbox as stored in the mapping (not zoom-adjusted).
                            original_bbox = info['bbox']

                            info_col1, info_col2 = st.columns(2)
                            with info_col1:
                                # The ellipsis is appended even when the text is shorter
                                # than 30 characters.
                                st.write(f"**文本内容:** {st.session_state.selected_text[:30]}...")
                                st.write(f"**类别:** {info['category']}")
                                st.write(f"**置信度:** {info.get('confidence', 'N/A')}")

                            with info_col2:
                                st.write(f"**位置:** [{', '.join(map(str, original_bbox))}]")
                                if len(original_bbox) >= 4:
                                    st.write(f"**宽度:** {original_bbox[2] - original_bbox[0]} px")
                                    st.write(f"**高度:** {original_bbox[3] - original_bbox[1]} px")

                            # Error marking: both buttons update the session-level set
                            # and trigger a rerun.
                            col1, col2 = st.columns(2)
                            with col1:
                                if st.button("❌ 标记为错误", key="mark_error_standard"):
                                    st.session_state.marked_errors.add(st.session_state.selected_text)
                                    st.rerun()

                            with col2:
                                if st.button("✅ 取消错误标记", key="unmark_error_standard"):
                                    st.session_state.marked_errors.discard(st.session_state.selected_text)
                                    st.rerun()

                            # Show the current marking state.
                            is_error = st.session_state.selected_text in st.session_state.marked_errors
                            if is_error:
                                st.error("⚠️ 此文本已标记为错误")
                            else:
                                st.success("✅ 此文本未标记错误")

                except Exception as e:
                    # Any failure while loading or drawing is reported in the UI.
                    st.error(f"❌ 图片处理失败: {e}")
            else:
                st.error("未找到对应的图片文件")
                if self.image_path:
                    # Show the path that was expected to exist.
                    st.write(f"期望路径: {self.image_path}")
			
 
				+    
			
 
    def create_split_layout_with_fixed_image(self, font_size: int = 12, zoom_level: float = 1.0):
        """Render a scrollable OCR text pane next to the image pane.

        The left column holds the text selector, a height selector and the
        markdown content inside a fixed-height scrollable ``<div>``. The
        right column is delegated to ``create_fixed_image_display``.

        Args:
            font_size: Font size in pixels used inside the scrollable pane.
            zoom_level: Initial zoom passed on to the image pane.
        """
        text_pane, image_pane = st.columns([0.7, 1])

        with text_pane:
            st.header("📄 OCR识别内容")

            # Text selector; position 0 is a placeholder that selects nothing.
            if self.text_bbox_mapping:
                choices = ["请选择文本...", *self.text_bbox_mapping.keys()]

                def shorten(position):
                    """Return the label for a choice, cut to 50 characters."""
                    label = choices[position]
                    if len(label) > 50:
                        return label[:50] + "..."
                    return label

                picked = st.selectbox(
                    "选择要校验的文本",
                    range(len(choices)),
                    format_func=shorten,
                    key="split_text_selector"
                )
                if picked > 0:
                    st.session_state.selected_text = choices[picked]

            # Height of the scrollable pane, chosen by the user (default 800).
            pane_height = st.selectbox(
                "选择内容区域高度",
                [400, 600, 800, 1000, 1200],
                index=2,
                key="split_content_height"
            )

            # CSS for the scrollable pane, parameterised by height and font size.
            pane_css = f"""
            <style>
            .scrollable-content {{
                height: {pane_height}px;
                overflow-y: auto;
                overflow-x: hidden;
                padding: 10px;
                border: 1px solid #ddd;
                border-radius: 5px;
                background-color: #fafafa !important;
                font-size: {font_size}px !important;
                line-height: 1.4;
                color: #333333 !important;
            }}
            
            .scrollable-content::-webkit-scrollbar {{
                width: 8px;
            }}
            
            .scrollable-content::-webkit-scrollbar-track {{
                background: #f1f1f1;
                border-radius: 4px;
            }}
            
            .scrollable-content::-webkit-scrollbar-thumb {{
                background: #888;
                border-radius: 4px;
            }}
            
            .scrollable-content::-webkit-scrollbar-thumb:hover {{
                background: #555;
            }}
            </style>
            """
            st.markdown(pane_css, unsafe_allow_html=True)

            # The OCR content goes into the pane with newlines turned into <br>.
            if self.md_content:
                body_html = self.md_content.replace("\n", "<br>")
                st.markdown(f"""
                <div class="scrollable-content">
                    {body_html}
                </div>
                """, unsafe_allow_html=True)

        with image_pane:
            # Image pane with its own zoom slider and box toggle.
            self.create_fixed_image_display(zoom_level)
			
 
				+
			
 
    def create_fixed_image_display(self, zoom_level: float = 1.0):
        """Render the image pane: zoom controls, annotated plot and details.

        Args:
            zoom_level: Initial value of the zoom slider (range 0.3 to 2.0).
        """
        st.header("🖼️ 原图标注")

        # Zoom slider and "show all boxes" toggle, side by side.
        zoom_cell, toggle_cell = st.columns(2)
        with zoom_cell:
            scale = st.slider("图片缩放", 0.3, 2.0, zoom_level, 0.1, key="fixed_zoom_level")
        with toggle_cell:
            draw_everything = st.checkbox("显示所有框", value=False, key="fixed_show_all_boxes")

        # Guard: nothing to draw without an existing image file.
        if not (self.image_path and Path(self.image_path).exists()):
            st.error("未找到对应的图片文件")
            if self.image_path:
                st.write(f"期望路径: {self.image_path}")
            return

        try:
            source_image = Image.open(self.image_path)

            # Resize the image according to the zoom factor.
            scaled_size = (int(source_image.width * scale), int(source_image.height * scale))
            scaled_image = source_image.resize(scaled_size, Image.Resampling.LANCZOS)

            # First mapping entry for the selected text, if there is one.
            chosen = st.session_state.selected_text
            has_entry = bool(chosen) and chosen in self.text_bbox_mapping
            entry = self.text_bbox_mapping[chosen][0] if has_entry else None

            # The highlighted box is scaled with the same zoom factor as the image.
            highlight = None
            if has_entry:
                highlight = [int(value * scale) for value in entry['bbox']]

            figure = self.create_resized_interactive_plot(scaled_image, highlight, scale, draw_everything)
            st.plotly_chart(figure, use_container_width=True, key="fixed_plot")

            # Details of the selected text (unscaled coordinates).
            if has_entry:
                st.subheader("📍 选中文本详情")

                raw_box = entry['bbox']
                text_cell, geometry_cell = st.columns(2)

                with text_cell:
                    st.write(f"**文本内容:** {chosen[:30]}...")
                    st.write(f"**类别:** {entry['category']}")

                with geometry_cell:
                    st.write(f"**位置:** [{', '.join(map(str, raw_box))}]")
                    if len(raw_box) >= 4:
                        st.write(f"**大小:** {raw_box[2] - raw_box[0]} x {raw_box[3] - raw_box[1]} px")

        except Exception as e:
            # Any failure while loading or drawing is reported in the UI.
            st.error(f"❌ 图片处理失败: {e}")
			
 
				+    
			
 
    def create_resized_interactive_plot(self, image: Image.Image, selected_bbox: Optional[List[int]], zoom_level: float, show_all_boxes: bool) -> go.Figure:
        """Build a Plotly figure showing ``image`` with bounding-box overlays.

        Args:
            image: The (already resized) image used as the figure background.
            selected_bbox: Box of the selected text in the coordinates of
                ``image`` (i.e. already scaled), or ``None``.
            zoom_level: Factor applied to the stored, unscaled boxes when
                ``show_all_boxes`` is enabled.
            show_all_boxes: When true, every box in ``text_bbox_mapping`` is
                drawn; texts marked as errors are drawn in red.

        Returns:
            A figure sized to the image with hidden axes.
        """
        # (fill colour, outline colour) pairs for regular and error boxes.
        normal_colors = ("rgba(0, 100, 200, 0.2)", "rgba(0, 100, 200, 0.8)")
        error_colors = ("rgba(255, 0, 0, 0.3)", "rgba(255, 0, 0, 1.0)")

        fig = go.Figure()

        # Image as background, anchored at the top-left corner.
        fig.add_layout_image(
            dict(
                source=image,
                xref="x", yref="y",
                x=0, y=image.height,
                sizex=image.width, sizey=image.height,
                sizing="stretch", opacity=1.0, layer="below"
            )
        )

        # Draw every known bbox when requested.
        if show_all_boxes:
            # Error marks are written to st.session_state.marked_errors by the
            # UI; include them so marked texts actually show up in red. Any
            # marks kept on the instance are honoured as well.
            session_marked = st.session_state.marked_errors if "marked_errors" in st.session_state else ()
            marked = set(getattr(self, "marked_errors", ())) | set(session_marked)

            for text, info_list in self.text_bbox_mapping.items():
                fill_color, line_color = error_colors if text in marked else normal_colors
                for info in info_list:
                    bbox = info['bbox']
                    if len(bbox) < 4:
                        continue
                    # Stored boxes are unscaled; bring them to the zoomed image.
                    x1, y1, x2, y2 = [coord * zoom_level for coord in bbox[:4]]
                    # The y values are flipped because the y axis grows upwards.
                    fig.add_shape(
                        type="rect",
                        x0=x1, y0=image.height-y2,
                        x1=x2, y1=image.height-y1,
                        line=dict(color=line_color, width=1),
                        fillcolor=fill_color,
                    )

        # Highlight the selected bbox on top of the others.
        if selected_bbox and len(selected_bbox) >= 4:
            x1, y1, x2, y2 = selected_bbox[:4]
            fig.add_shape(
                type="rect",
                x0=x1, y0=image.height-y2,
                x1=x2, y1=image.height-y1,
                line=dict(color="red", width=2),
                fillcolor="rgba(255, 0, 0, 0.3)",
            )

        # Hidden axes spanning the image; y scale locked to x.
        fig.update_xaxes(visible=False, range=[0, image.width])
        fig.update_yaxes(visible=False, range=[0, image.height], scaleanchor="x")

        fig.update_layout(
            width=image.width,
            height=image.height,
            margin=dict(l=0, r=0, t=0, b=0),
            showlegend=False,
            plot_bgcolor='white'
        )

        return fig
			
 
				+    
			
 
    def create_compact_layout(self, font_size: int = 12, zoom_level: float = 1.0):
        """Render the compact comparison view.

        A control row (font size, content height, image zoom) sits above two
        columns: the OCR content with known texts highlighted on the left,
        and the image with the selected text's bounding box on the right.

        Args:
            font_size: Preselected font size; values outside the offered
                choices fall back to 12.
            zoom_level: Initial zoom; clamped to the slider range 0.3 to 1.5.
        """
        import html
        import re

        font_choices = [10, 12, 14, 16, 18]
        # The slider rejects an initial value outside its range, and other
        # layouts allow zoom levels up to 2.0, so clamp instead of failing.
        initial_zoom = min(max(float(zoom_level), 0.3), 1.5)

        # Top control row.
        control_col1, control_col2, control_col3 = st.columns([1, 1, 1])

        with control_col1:
            current_font_size = st.selectbox(
                "字体大小",
                font_choices,
                index=font_choices.index(font_size) if font_size in font_choices else 1,
                key="compact_font"
            )

        with control_col2:
            content_height = st.selectbox("内容高度", [300, 400, 500, 600], index=1, key="compact_height")

        with control_col3:
            current_zoom = st.slider("图片缩放", 0.3, 1.5, initial_zoom, 0.1, key="compact_zoom")

        # Main content area.
        left_col, right_col = st.columns([0.7, 1])

        with left_col:
            st.subheader("📄 OCR内容")

            # Text selector; index 0 is a placeholder that selects nothing.
            if self.text_bbox_mapping:
                text_options = ["请选择文本..."] + list(self.text_bbox_mapping.keys())
                selected_index = st.selectbox(
                    "快速定位文本",
                    range(len(text_options)),
                    format_func=lambda x: text_options[x][:30] + "..." if len(text_options[x]) > 30 else text_options[x],
                    key="compact_text_selector"
                )

                if selected_index > 0:
                    st.session_state.selected_text = text_options[selected_index]

            # CSS for the content pane, parameterised by height and font size.
            st.markdown(f"""
            <style>
            .compact-content {{
                height: {content_height}px;
                overflow-y: auto;
                font-size: {current_font_size}px !important;
                line-height: 1.4;
                border: 1px solid #ddd;
                padding: 10px;
                background-color: #fafafa !important;
                font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
                color: #333333 !important;
            }}
            
            .highlight-text {{
                background-color: #ffeb3b !important;
                padding: 2px 4px;
                border-radius: 3px;
                cursor: pointer;
                color: #333333 !important;
            }}
            
            .selected-highlight {{
                background-color: #4caf50 !important;
                color: white !important;
            }}
            </style>
            """, unsafe_allow_html=True)

            # Highlight the known texts inside the OCR content.
            if self.md_content:
                highlighted_content = self.md_content
                # Very short texts are skipped; longest first so that a text
                # containing another one wins at the same position.
                candidates = sorted(
                    (text for text in self.text_bbox_mapping if len(text) > 2),
                    key=len,
                    reverse=True,
                )
                if candidates:
                    selected = st.session_state.selected_text

                    def wrap_match(match):
                        """Wrap one matched text in its highlight <span>."""
                        text = match.group(0)
                        css_class = "highlight-text selected-highlight" if text == selected else "highlight-text"
                        # Escape the tooltip so quotes or '<' cannot break the attribute.
                        tooltip = html.escape(text[:50], quote=True)
                        return f'<span class="{css_class}" title="{tooltip}...">{text}</span>'

                    # A single pass over the original content: markup inserted
                    # for one text is never re-scanned for another text.
                    pattern = re.compile("|".join(re.escape(text) for text in candidates))
                    highlighted_content = pattern.sub(wrap_match, self.md_content)

                st.markdown(
                    f'<div class="compact-content">{highlighted_content}</div>',
                    unsafe_allow_html=True
                )

        with right_col:
            st.subheader("🖼️ 图片标注")

            if self.image_path and Path(self.image_path).exists():
                try:
                    image = Image.open(self.image_path)

                    # Fit the image to a zoom-dependent width, keeping the aspect ratio.
                    display_width = int(400 * current_zoom)
                    aspect_ratio = image.height / image.width
                    display_height = int(display_width * aspect_ratio)

                    resized_image = image.resize((display_width, display_height), Image.Resampling.LANCZOS)

                    # Stored boxes are in original-image pixels, so they must be
                    # scaled by the actual resize ratio, not by the zoom value.
                    bbox_scale = display_width / image.width

                    # Draw the bbox of the selected text, if any.
                    if st.session_state.selected_text and st.session_state.selected_text in self.text_bbox_mapping:
                        info = self.text_bbox_mapping[st.session_state.selected_text][0]
                        bbox = info['bbox']

                        annotated_image = self.draw_bbox_on_image(
                            resized_image,
                            [int(coord * bbox_scale) for coord in bbox],
                            "red",
                            3,
                        )
                        st.image(annotated_image, use_column_width=True)

                        # Short summary with the unscaled coordinates.
                        st.info(f"**选中:** {st.session_state.selected_text[:20]}...\n**位置:** [{', '.join(map(str, bbox))}]")
                    else:
                        st.image(resized_image, use_column_width=True)

                except Exception as e:
                    # Any failure while loading or drawing is reported in the UI.
                    st.error(f"❌ 图片处理失败: {e}")
            else:
                st.error("未找到对应的图片文件")
			
 
				+
			
 
				 
			
 
				 def main():
			
 
				     """主应用"""
			
@@ -537,7 +1081,7 @@ def main():
 
				                 try:
			
 
				                     st.session_state.validator.load_ocr_data(selected_file)
			
 
				                     st.success("✅ 文件加载成功！")
			
 
				-                    st.rerun()  # 重新运行应用以更新界面
			
 
				+                    st.rerun()
			
 
				                 except Exception as e:
			
 
				                     st.error(f"❌ 加载失败: {e}")
			
 
				         else:
			
@@ -556,21 +1100,6 @@ def main():
 
				         if st.button("❌ 清除错误标记"):
			
 
				             st.session_state.marked_errors = set()
			
 
				             st.rerun()
			
 
				-        
			
 
				-        # 显示调试信息
			
 
				-        if st.checkbox("🔧 调试信息"):
			
 
				-            st.write("**当前状态:**")
			
 
				-            st.write(f"- OCR数据项数: {len(st.session_state.validator.ocr_data)}")
			
 
				-            st.write(f"- 可点击文本: {len(st.session_state.validator.text_bbox_mapping)}")
			
 
				-            st.write(f"- 选中文本: {st.session_state.selected_text}")
			
 
				-            st.write(f"- 标记错误数: {len(st.session_state.marked_errors)}")
			
 
				-            
			
 
				-            if st.session_state.validator.ocr_data:
			
 
				-                st.write("**数据类型检查:**")
			
 
				-                sample_item = st.session_state.validator.ocr_data[0] if st.session_state.validator.ocr_data else None
			
 
				-                st.write(f"- 第一项类型: {type(sample_item)}")
			
 
				-                if isinstance(sample_item, dict):
			
 
				-                    st.write(f"- 第一项键: {list(sample_item.keys())}")
			
 
				     
			
 
				     # 主内容区域
			
 
				     if not st.session_state.validator.ocr_data:
			
@@ -597,164 +1126,43 @@ def main():
 
				         return
			
 
				     
			
 
				     # 创建标签页
			
 
				-    tab1, tab2, tab3 = st.tabs(["📄 文本校验", "📊 表格分析", "📈 数据统计"])
			
 
				+    tab1, tab2, tab3, tab4 = st.tabs(["📄 内容校验", "📊 表格分析", "📈 数据统计", "🚀 快速导航"])
			
 
				     
			
 
				     with tab1:
			
 
				-        # 原有的左右分栏内容
			
 
				-        left_col, right_col = st.columns([1, 1])
			
 
				+        # 顶部控制区域
			
 
				+        control_col1, control_col2, control_col3, control_col4 = st.columns(4)
			
 
				         
			
 
				-        # 左侧 - OCR文本内容
			
 
				-        with left_col:
			
 
				-            st.header("📄 OCR识别内容")
			
 
				-            
			
 
				-            # 文本选择器
			
 
				-            if st.session_state.validator.text_bbox_mapping:
			
 
				-                text_options = ["请选择文本..."] + list(st.session_state.validator.text_bbox_mapping.keys())
			
 
				-                selected_index = st.selectbox(
			
 
				-                    "选择要校验的文本",
			
 
				-                    range(len(text_options)),
			
 
				-                    format_func=lambda x: text_options[x],
			
 
				-                    key="text_selector"
			
 
				-                )
			
 
				-                
			
 
				-                if selected_index > 0:
			
 
				-                    st.session_state.selected_text = text_options[selected_index]
			
 
				-            else:
			
 
				-                st.warning("没有找到可点击的文本")
			
 
				-            
			
 
				-            # 显示MD内容（可搜索和过滤）
			
 
				-            if st.session_state.validator.md_content:
			
 
				-                search_term = st.text_input("🔍 搜索文本内容", placeholder="输入关键词搜索...")
			
 
				-                
			
 
				-                display_content = st.session_state.validator.md_content
			
 
				-                if search_term:
			
 
				-                    lines = display_content.split('\n')
			
 
				-                    filtered_lines = [line for line in lines if search_term.lower() in line.lower()]
			
 
				-                    display_content = '\n'.join(filtered_lines)
			
 
				-                    if filtered_lines:
			
 
				-                        st.success(f"找到 {len(filtered_lines)} 行包含 '{search_term}'")
			
 
				-                    else:
			
 
				-                        st.warning(f"未找到包含 '{search_term}' 的内容")
			
 
				-                
			
 
				-                # 渲染方式选择
			
 
				-                render_mode = st.radio(
			
 
				-                    "选择渲染方式",
			
 
				-                    ["HTML渲染", "Markdown渲染", "DataFrame表格", "原始文本"],  # 添加DataFrame选项
			
 
				-                    horizontal=True
			
 
				-                )
			
 
				-
			
 
				-                if render_mode == "HTML渲染":
			
 
				-                    # 使用unsafe_allow_html=True来渲染HTML表格
			
 
				-                    st.markdown(display_content, unsafe_allow_html=True)
			
 
				-                elif render_mode == "Markdown渲染":
			
 
				-                    # 转换HTML表格为Markdown格式
			
 
				-                    converted_content = st.session_state.validator.convert_html_table_to_markdown(display_content)
			
 
				-                    st.markdown(converted_content)
			
 
				-                elif render_mode == "DataFrame表格":
			
 
				-                    # 新增：使用DataFrame显示表格
			
 
				-                    if '<table>' in display_content.lower():
			
 
				-                        st.session_state.validator.display_html_table_as_dataframe(display_content)
			
 
				-                    else:
			
 
				-                        st.info("当前内容中没有检测到HTML表格")
			
 
				-                        st.markdown(display_content)
			
 
				-                else:
			
 
				-                    # 原始文本显示
			
 
				-                    st.text_area(
			
 
				-                        "MD内容预览",
			
 
				-                        display_content,
			
 
				-                        height=300,
			
 
				-                        help="OCR识别的文本内容"
			
 
				-                    )
			
 
				-            
			
 
				-            # 可点击文本列表
			
 
				-            st.subheader("🎯 可点击文本列表")
			
 
				-            
			
 
				-            if st.session_state.validator.text_bbox_mapping:
			
 
				-                for text, info_list in st.session_state.validator.text_bbox_mapping.items():
			
 
				-                    info = info_list[0] # 使用第一个bbox信息
			
 
				-                    
			
 
				-                    # 确定显示样式
			
 
				-                    is_selected = (text == st.session_state.selected_text)
			
 
				-                    is_error = (text in st.session_state.marked_errors)
			
 
				-                    
			
 
				-                    # 创建按钮行
			
 
				-                    button_col, error_col = st.columns([4, 1])
			
 
				-                    
			
 
				-                    with button_col:
			
 
				-                        button_type = "primary" if is_selected else "secondary"
			
 
				-                        if st.button(f"📍 {text}", key=f"btn_{text}", type=button_type):
			
 
				-                            st.session_state.selected_text = text
			
 
				-                            st.rerun()
			
 
				-                    
			
 
				-                    with error_col:
			
 
				-                        if is_error:
			
 
				-                            if st.button("✅", key=f"fix_{text}", help="取消错误标记"):
			
 
				-                                st.session_state.marked_errors.discard(text)
			
 
				-                                st.rerun()
			
 
				-                        else:
			
 
				-                            if st.button("❌", key=f"error_{text}", help="标记为错误"):
			
 
				-                                st.session_state.marked_errors.add(text)
			
 
				-                                st.rerun()
			
 
				-            else:
			
 
				-                st.info("没有可点击的文本项目")
			
 
				+        with control_col1:
			
 
				+            layout_mode = st.selectbox(
			
 
				+                "布局模式", 
			
 
				+                ["标准布局", "滚动布局", "紧凑布局"],
			
 
				+                key="layout_mode"
			
 
				+            )
			
 
				         
			
 
				-        # 右侧 - 图像显示
			
 
				-        with right_col:
			
 
				-            st.header("🖼️ 原图标注")
			
 
				-            
			
 
				-            if st.session_state.validator.image_path and Path(st.session_state.validator.image_path).exists():
			
 
				-                try:
			
 
				-                    # 加载图片
			
 
				-                    image = Image.open(st.session_state.validator.image_path)
			
 
				-                    
			
 
				-                    # 创建交互式图片
			
 
				-                    selected_bbox = None
			
 
				-                    if st.session_state.selected_text and st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
			
 
				-                        info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
			
 
				-                        selected_bbox = info['bbox']
			
 
				-                    
			
 
				-                    fig = st.session_state.validator.create_interactive_plot(image, selected_bbox)
			
 
				-                    st.plotly_chart(fig, use_container_width=True)
			
 
				-                    
			
 
				-                    # 显示选中文本的详细信息
			
 
				-                    if st.session_state.selected_text:
			
 
				-                        st.subheader("📍 选中文本详情")
			
 
				-                        
			
 
				-                        if st.session_state.selected_text in st.session_state.validator.text_bbox_mapping:
			
 
				-                            info = st.session_state.validator.text_bbox_mapping[st.session_state.selected_text][0]
			
 
				-                            bbox = info['bbox']
			
 
				-                            
			
 
				-                            info_col1, info_col2 = st.columns(2)
			
 
				-                            with info_col1:
			
 
				-                                st.write(f"**文本内容:** {st.session_state.selected_text}")
			
 
				-                                st.write(f"**类别:** {info['category']}")
			
 
				-                                st.write(f"**置信度:** {info.get('confidence', 'N/A')}")
			
 
				-                            
			
 
				-                            with info_col2:
			
 
				-                                st.write(f"**位置:** [{', '.join(map(str, bbox))}]")
			
 
				-                                if len(bbox) >= 4:
			
 
				-                                    st.write(f"**宽度:** {bbox[2] - bbox[0]} px")
			
 
				-                                    st.write(f"**高度:** {bbox[3] - bbox[1]} px")
			
 
				-                            
			
 
				-                            # 标记状态
			
 
				-                            is_error = st.session_state.selected_text in st.session_state.marked_errors
			
 
				-                            if is_error:
			
 
				-                                st.error("⚠️ 此文本已标记为错误")
			
 
				-                            else:
			
 
				-                                st.success("✅ 此文本未标记错误")
			
 
				-                except Exception as e:
			
 
				-                    st.error(f"❌ 图片处理失败: {e}")
			
 
				-            else:
			
 
				-                st.error("未找到对应的图片文件")
			
 
				-                if st.session_state.validator.image_path:
			
 
				-                    st.write(f"期望路径: {st.session_state.validator.image_path}")
			
 
				-    
			
 
				+        with control_col2:
			
 
				+            if layout_mode != "标准布局":
			
 
				+                content_height = st.selectbox("内容高度", [400, 600, 800], index=1, key="content_height_select")
			
 
				+        
			
 
				+        with control_col3:
			
 
				+            font_size = st.selectbox("字体大小", [10, 12, 14, 16], index=1, key="font_size_select")
			
 
				+        
			
 
				+        with control_col4:
			
 
				+            zoom_level = st.slider("图片缩放", 0.3, 2.0, 1.0, 0.1, key="zoom_level_select")
			
 
				+        
			
 
				+        # 根据选择的布局模式显示不同的界面，传递参数
			
 
				+        if layout_mode == "滚动布局":
			
 
				+            st.session_state.validator.create_split_layout_with_fixed_image(font_size, zoom_level)
			
 
				+        elif layout_mode == "紧凑布局":
			
 
				+            st.session_state.validator.create_compact_layout(font_size, zoom_level)
			
 
				+        else:
			
 
				+            # 调用封装的标准布局方法
			
 
				+            st.session_state.validator.create_standard_layout(font_size, zoom_level)
			
 
				+
			
 
				     with tab2:
			
 
				-        # 新增：专门的表格分析页面
			
 
				+        # 表格分析页面
			
 
				         st.header("📊 表格数据分析")
			
 
				         
			
 
				         if st.session_state.validator.md_content:
			
 
				-            # 检查是否包含表格
			
 
				             if '<table' in st.session_state.validator.md_content.lower():
			
 
				                 col1, col2 = st.columns([2, 1])
			
 
				                 
			
@@ -769,11 +1177,8 @@ def main():
 
				                     
			
 
				                     if st.button("📥 导出表格数据", type="primary"):
			
 
				                         try:
			
 
				-                            import pandas as pd
			
 
				-                            from io import StringIO
			
 
				                             tables = pd.read_html(StringIO(st.session_state.validator.md_content))
			
 
				                             if tables:
			
 
				-                                # 创建Excel文件
			
 
				                                 output = BytesIO()
			
 
				                                 with pd.ExcelWriter(output, engine='openpyxl') as writer:
			
 
				                                     for i, table in enumerate(tables):
			
@@ -790,8 +1195,6 @@ def main():
 
				                     
			
 
				                     if st.button("🔍 表格统计分析"):
			
 
				                         try:
			
 
				-                            import pandas as pd
			
 
				-                            from io import StringIO
			
 
				                             tables = pd.read_html(StringIO(st.session_state.validator.md_content))
			
 
				                             if tables:
			
 
				                                 st.write("**表格统计信息:**")
			
@@ -810,7 +1213,63 @@ def main():
 
				     with tab3:
			
 
				         # 数据统计页面
			
 
				         st.header("📈 OCR数据统计")
			
 
				-        # ...现有的统计代码...
			
 
				+        
			
 
				+        if stats:
			
 
				+            # 类别统计图表
			
 
				+            if stats['categories']:
			
 
				+                st.subheader("📊 类别分布")
			
 
				+                
			
 
				+                fig_pie = px.pie(
			
 
				+                    values=list(stats['categories'].values()),
			
 
				+                    names=list(stats['categories'].keys()),
			
 
				+                    title="文本类别分布"
			
 
				+                )
			
 
				+                st.plotly_chart(fig_pie, use_container_width=True)
			
 
				+            
			
 
				+            # 错误率分析
			
 
				+            st.subheader("📈 质量分析")
			
 
				+            
			
 
				+            accuracy_data = {
			
 
				+                '状态': ['正确', '错误'],
			
 
				+                '数量': [stats['clickable_texts'] - stats['marked_errors'], stats['marked_errors']]
			
 
				+            }
			
 
				+            
			
 
				+            fig_bar = px.bar(
			
 
				+                accuracy_data,
			
 
				+                x='状态',
			
 
				+                y='数量',
			
 
				+                title="识别质量分布",
			
 
				+                color='状态',
			
 
				+                color_discrete_map={'正确': 'green', '错误': 'red'}
			
 
				+            )
			
 
				+            st.plotly_chart(fig_bar, use_container_width=True)
			
 
				+
			
 
				+    with tab4:
			
 
				+        # 快速导航功能
			
 
				+        st.header("🚀 快速导航")
			
 
				+        
			
 
				+        if not st.session_state.validator.text_bbox_mapping:
			
 
				+            st.info("没有可用的文本项进行导航")
			
 
				+        else:
			
 
				+            # 按类别分组
			
 
				+            categories = {}
			
 
				+            for text, info_list in st.session_state.validator.text_bbox_mapping.items():
			
 
				+                category = info_list[0]['category']
			
 
				+                if category not in categories:
			
 
				+                    categories[category] = []
			
 
				+                categories[category].append(text)
			
 
				+            
			
 
				+            # 创建导航按钮
			
 
				+            for category, texts in categories.items():
			
 
				+                with st.expander(f"{category} ({len(texts)}项)", expanded=False):
			
 
				+                    cols = st.columns(3)  # 每行3个按钮
			
 
				+                    for i, text in enumerate(texts):
			
 
				+                        col_idx = i % 3
			
 
				+                        with cols[col_idx]:
			
 
				+                            display_text = text[:15] + "..." if len(text) > 15 else text
			
 
				+                            if st.button(display_text, key=f"nav_{category}_{i}"):
			
 
				+                                st.session_state.selected_text = text
			
 
				+                                st.rerun()
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     main()