2 maanden geleden · db786abb0a
--- a/ocr_validator_utils.py
+++ b/ocr_validator_utils.py
@@ -341,6 +341,134 @@ def get_rotation_angle_from_ppstructv3(data: Dict) -> float:
 
				     return 0.0
			
 
				 
			
 
				 
			
 
				+def find_image_in_multiple_locations(img_src: str, json_path: str) -> Optional[str]:
			
 
				+    """
			
 
				+    在多个可能的位置查找图片文件
			
 
				+    """
			
 
				+    json_dir = os.path.dirname(json_path)
			
 
				+    
			
 
				+    # 可能的搜索路径
			
 
				+    search_paths = [
			
 
				+        # 相对于JSON文件的路径
			
 
				+        os.path.join(json_dir, img_src),
			
 
				+        # 相对于JSON文件父目录的路径
			
 
				+        os.path.join(os.path.dirname(json_dir), img_src),
			
 
				+        # imgs目录（常见的图片目录）
			
 
				+        os.path.join(json_dir, 'imgs', os.path.basename(img_src)),
			
 
				+        os.path.join(os.path.dirname(json_dir), 'imgs', os.path.basename(img_src)),
			
 
				+        # images目录
			
 
				+        os.path.join(json_dir, 'images', os.path.basename(img_src)),
			
 
				+        os.path.join(os.path.dirname(json_dir), 'images', os.path.basename(img_src)),
			
 
				+        # 同名目录
			
 
				+        os.path.join(json_dir, os.path.splitext(os.path.basename(json_path))[0], os.path.basename(img_src)),
			
 
				+    ]
			
 
				+    
			
 
				+    # 如果是绝对路径，也加入搜索
			
 
				+    if os.path.isabs(img_src):
			
 
				+        search_paths.insert(0, img_src)
			
 
				+    
			
 
				+    # 查找存在的文件
			
 
				+    for path in search_paths:
			
 
				+        if os.path.exists(path):
			
 
				+            return path
			
 
				+    
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def process_html_images(html_content: str, json_path: str) -> str:
			
 
				+    """
			
 
				+    处理HTML内容中的图片引用，将本地图片转换为base64 - 增强版
			
 
				+    """
			
 
				+    import re
			
 
				+    
			
 
				+    # 匹配HTML图片标签: <img src="path" ... />
			
 
				+    img_pattern = r'<img\s+[^>]*src\s*=\s*["\']([^"\']+)["\'][^>]*/?>'
			
 
				+    
			
 
				+    def replace_html_image(match):
			
 
				+        full_tag = match.group(0)
			
 
				+        img_src = match.group(1)
			
 
				+        
			
 
				+        # 如果已经是base64或者网络链接，直接返回
			
 
				+        if img_src.startswith('data:image') or img_src.startswith('http'):
			
 
				+            return full_tag
			
 
				+        
			
 
				+        # 增强的图片查找
			
 
				+        full_img_path = find_image_in_multiple_locations(img_src, json_path)
			
 
				+        
			
 
				+        # 尝试转换为base64
			
 
				+        try:
			
 
				+            if full_img_path and os.path.exists(full_img_path):
			
 
				+                with open(full_img_path, 'rb') as img_file:
			
 
				+                    img_data = img_file.read()
			
 
				+                    
			
 
				+                # 获取文件扩展名确定MIME类型
			
 
				+                ext = os.path.splitext(full_img_path)[1].lower()
			
 
				+                mime_type = {
			
 
				+                    '.png': 'image/png',
			
 
				+                    '.jpg': 'image/jpeg',
			
 
				+                    '.jpeg': 'image/jpeg',
			
 
				+                    '.gif': 'image/gif',
			
 
				+                    '.bmp': 'image/bmp',
			
 
				+                    '.webp': 'image/webp'
			
 
				+                }.get(ext, 'image/jpeg')
			
 
				+                
			
 
				+                # 转换为base64
			
 
				+                img_base64 = base64.b64encode(img_data).decode('utf-8')
			
 
				+                data_url = f"data:{mime_type};base64,{img_base64}"
			
 
				+                
			
 
				+                # 替换src属性，保持其他属性不变
			
 
				+                updated_tag = re.sub(
			
 
				+                    r'src\s*=\s*["\'][^"\']+["\']',
			
 
				+                    f'src="{data_url}"',
			
 
				+                    full_tag
			
 
				+                )
			
 
				+                return updated_tag
			
 
				+            else:
			
 
				+                # 文件不存在，显示详细的错误信息
			
 
				+                search_info = f"搜索路径: {img_src}"
			
 
				+                if full_img_path:
			
 
				+                    search_info += f" -> {full_img_path}"
			
 
				+                
			
 
				+                error_content = f"""
			
 
				+                <div style="
			
 
				+                    color: #d32f2f; 
			
 
				+                    border: 2px dashed #d32f2f; 
			
 
				+                    padding: 10px; 
			
 
				+                    margin: 10px 0; 
			
 
				+                    border-radius: 5px;
			
 
				+                    background-color: #ffebee;
			
 
				+                    text-align: center;
			
 
				+                ">
			
 
				+                    <strong>🖼️ 图片无法加载</strong><br>
			
 
				+                    <small>原始路径: {img_src}</small><br>
			
 
				+                    <small>JSON文件: {os.path.basename(json_path)}</small><br>
			
 
				+                    <em>请检查图片文件是否存在</em>
			
 
				+                </div>
			
 
				+                """
			
 
				+                return error_content
			
 
				+        except Exception as e:
			
 
				+            # 转换失败，返回错误信息
			
 
				+            error_content = f"""
			
 
				+            <div style="
			
 
				+                color: #f57c00; 
			
 
				+                border: 2px dashed #f57c00; 
			
 
				+                padding: 10px; 
			
 
				+                margin: 10px 0; 
			
 
				+                border-radius: 5px;
			
 
				+                background-color: #fff3e0;
			
 
				+                text-align: center;
			
 
				+            ">
			
 
				+                <strong>⚠️ 图片处理失败</strong><br>
			
 
				+                <small>文件: {img_src}</small><br>
			
 
				+                <small>错误: {str(e)}</small>
			
 
				+            </div>
			
 
				+            """
			
 
				+            return error_content
			
 
				+    
			
 
				+    # 替换所有HTML图片标签
			
 
				+    processed_content = re.sub(img_pattern, replace_html_image, html_content, flags=re.IGNORECASE)
			
 
				+    return processed_content
			
 
				+
			
 
				 def process_markdown_images(md_content: str, json_path: str) -> str:
			
 
				     """
			
 
				     处理Markdown中的图片引用，将本地图片转换为base64
			
@@ -399,7 +527,17 @@ def process_markdown_images(md_content: str, json_path: str) -> str:
 
				     processed_content = re.sub(img_pattern, replace_image, md_content)
			
 
				     return processed_content
			
 
				 
			
 
				+def process_all_images_in_content(content: str, json_path: str) -> str:
			
 
				+    """
			
 
				+    处理内容中的所有图片引用（包括Markdown和HTML格式）
			
 
				+    """
			
 
				+    # 先处理HTML图片
			
 
				+    content = process_html_images(content, json_path)
			
 
				+    # 再处理Markdown图片
			
 
				+    content = process_markdown_images(content, json_path)
			
 
				+    return content
			
 
				 
			
 
				+# 修改 load_ocr_data_file 函数
			
 
				 def load_ocr_data_file(json_path: str, config: Dict) -> Tuple[List, str, str]:
			
 
				     """加载OCR相关数据文件"""
			
 
				     json_file = Path(json_path)
			
@@ -434,8 +572,8 @@ def load_ocr_data_file(json_path: str, config: Dict) -> Tuple[List, str, str]:
 
				         with open(md_file, 'r', encoding='utf-8') as f:
			
 
				             raw_md_content = f.read()
			
 
				             
			
 
				-        # 处理Markdown中的图片引用
			
 
				-        md_content = process_markdown_images(raw_md_content, str(json_file))
			
 
				+        # 处理内容中的所有图片引用（HTML和Markdown）
			
 
				+        md_content = process_all_images_in_content(raw_md_content, str(json_file))
			
 
				     
			
 
				     # 推断图片路径
			
 
				     image_name = json_file.stem
			
@@ -585,35 +723,54 @@ def get_ocr_statistics(ocr_data: List, text_bbox_mapping: Dict, marked_errors: s
 
				 
			
 
				 
			
 
				 def convert_html_table_to_markdown(content: str) -> str:
			
 
				-    """将HTML表格转换为Markdown表格格式"""
			
 
				+    """将HTML表格转换为Markdown表格格式 - 支持横向滚动的增强版本"""
			
 
				     def replace_table(match):
			
 
				         table_html = match.group(0)
			
 
				         
			
 
				         # 提取所有行
			
 
				-        rows = re.findall(r'<tr>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
			
 
				+        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
			
 
				         if not rows:
			
 
				             return table_html
			
 
				         
			
 
				         markdown_rows = []
			
 
				-        for i, row in enumerate(rows):
			
 
				-            # 提取单元格
			
 
				-            cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.DOTALL | re.IGNORECASE)
			
 
				+        max_cols = 0
			
 
				+        
			
 
				+        # 处理所有行，找出最大列数
			
 
				+        processed_rows = []
			
 
				+        for row in rows:
			
 
				+            # 提取单元格，支持 th 和 td
			
 
				+            cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
			
 
				             if cells:
			
 
				-                # 清理单元格内容
			
 
				                 clean_cells = []
			
 
				                 for cell in cells:
			
 
				                     cell_text = re.sub(r'<[^>]+>', '', cell).strip()
			
 
				                     cell_text = unescape(cell_text)
			
 
				-                    clean_cells.append(cell_text)
			
 
				-                
			
 
				-                # 构建Markdown行
			
 
				-                markdown_row = '| ' + ' | '.join(clean_cells) + ' |'
			
 
				-                markdown_rows.append(markdown_row)
			
 
				+                    # 限制单元格长度，避免表格过宽
			
 
				+                    if len(cell_text) > 30:
			
 
				+                        cell_text = cell_text[:27] + "..."
			
 
				+                    clean_cells.append(cell_text or " ")  # 空单元格用空格替代
			
 
				                 
			
 
				-                # 在第一行后添加分隔符
			
 
				-                if i == 0:
			
 
				-                    separator = '| ' + ' | '.join(['---'] * len(clean_cells)) + ' |'
			
 
				-                    markdown_rows.append(separator)
			
 
				+                processed_rows.append(clean_cells)
			
 
				+                max_cols = max(max_cols, len(clean_cells))
			
 
				+        
			
 
				+        # 统一所有行的列数
			
 
				+        for i, row_cells in enumerate(processed_rows):
			
 
				+            while len(row_cells) < max_cols:
			
 
				+                row_cells.append(" ")
			
 
				+            
			
 
				+            # 构建Markdown行
			
 
				+            markdown_row = '| ' + ' | '.join(row_cells) + ' |'
			
 
				+            markdown_rows.append(markdown_row)
			
 
				+            
			
 
				+            # 在第一行后添加分隔符
			
 
				+            if i == 0:
			
 
				+                separator = '| ' + ' | '.join(['---'] * max_cols) + ' |'
			
 
				+                markdown_rows.append(separator)
			
 
				+        
			
 
				+        # 添加滚动提示
			
 
				+        if max_cols > 8:
			
 
				+            scroll_note = "\n> 📋 **提示**: 此表格列数较多，在某些视图中可能需要横向滚动查看完整内容。\n"
			
 
				+            return scroll_note + '\n'.join(markdown_rows) if markdown_rows else table_html
			
 
				         
			
 
				         return '\n'.join(markdown_rows) if markdown_rows else table_html