|
|
@@ -0,0 +1,317 @@
|
|
|
+"""
|
|
|
+HTML/Markdown 处理工具模块
|
|
|
+
|
|
|
+提供 HTML 和 Markdown 内容的处理功能:
|
|
|
+- 图片引用处理(转换为 base64)
|
|
|
+- HTML 表格转换
|
|
|
+- 表格解析
|
|
|
+"""
|
|
|
import base64
import logging
import os
import re
from html import escape, unescape
from io import StringIO
from typing import List, Optional

import pandas as pd
|
|
|
+
|
|
|
+
|
|
|
def find_image_in_multiple_locations(img_src: str, json_path: str) -> Optional[str]:
    """Locate an image file by probing several likely locations.

    Candidates are tried in this order:

    1. ``img_src`` itself, when it is an absolute path.
    2. ``img_src`` relative to the JSON file's directory, then relative to
       that directory's parent.
    3. The image's base name inside ``imgs/`` and then ``images/``, each
       looked up under the JSON directory and under its parent.
    4. The image's base name inside a directory that sits next to the JSON
       file and is named after it (without the extension).

    Args:
        img_src: Image path as written in the document.
        json_path: Path of the JSON file; relative lookups are anchored at
            its location.

    Returns:
        The first candidate that is an existing regular file, or ``None``
        when no candidate matches.
    """
    json_dir = os.path.dirname(json_path)
    parent_dir = os.path.dirname(json_dir)
    img_name = os.path.basename(img_src)
    json_stem = os.path.splitext(os.path.basename(json_path))[0]

    candidates = []
    if os.path.isabs(img_src):
        candidates.append(img_src)
    candidates.extend([
        # Relative to the JSON file / its parent directory.
        os.path.join(json_dir, img_src),
        os.path.join(parent_dir, img_src),
        # Common image folders.
        os.path.join(json_dir, 'imgs', img_name),
        os.path.join(parent_dir, 'imgs', img_name),
        os.path.join(json_dir, 'images', img_name),
        os.path.join(parent_dir, 'images', img_name),
        # Folder named after the JSON file.
        os.path.join(json_dir, json_stem, img_name),
    ])

    for candidate in candidates:
        # isfile (not exists): a directory that happens to match a candidate
        # must not shadow the real image further down the list.
        if os.path.isfile(candidate):
            return candidate

    return None
|
|
|
+
|
|
|
+
|
|
|
def process_html_images(html_content: str, json_path: str) -> str:
    """Inline local images referenced by HTML ``<img>`` tags as base64 data URLs.

    Each ``<img>`` tag whose ``src`` is a local path is resolved with
    ``find_image_in_multiple_locations`` and its ``src`` value is replaced by
    a ``data:`` URL; every other attribute of the tag is kept as written.
    Tags whose ``src`` is already a data URL or a remote URL are returned
    unchanged. When the file cannot be found or read, the tag is replaced by
    a styled ``<div>`` describing the problem.

    Args:
        html_content: HTML text to process.
        json_path: Path of the JSON file; used to resolve relative image
            paths.

    Returns:
        The HTML text with local images embedded as base64.
    """
    # Whitespace is required before ``src`` so attributes such as
    # ``data-src`` are not mistaken for the real source attribute.
    img_pattern = r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']+)["\'][^>]*>'
    # Sources that are already inline or remote are left untouched.
    passthrough_prefixes = ('data:', 'http://', 'https://', '//')

    def replace_html_image(match):
        full_tag = match.group(0)
        img_src = match.group(1)

        if img_src.lower().startswith(passthrough_prefixes):
            return full_tag

        full_img_path = find_image_in_multiple_locations(img_src, json_path)

        try:
            if full_img_path and os.path.exists(full_img_path):
                with open(full_img_path, 'rb') as img_file:
                    img_data = img_file.read()

                # Pick the MIME type from the file extension.
                ext = os.path.splitext(full_img_path)[1].lower()
                mime_type = {
                    '.png': 'image/png',
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.gif': 'image/gif',
                    '.bmp': 'image/bmp',
                    '.webp': 'image/webp'
                }.get(ext, 'image/jpeg')

                img_base64 = base64.b64encode(img_data).decode('utf-8')
                data_url = f"data:{mime_type};base64,{img_base64}"

                # Splice the data URL over the captured value only. A second
                # regex pass would be case-sensitive (missing ``SRC=``) and
                # could also rewrite look-alike attributes.
                src_start = match.start(1) - match.start(0)
                src_end = match.end(1) - match.start(0)
                return full_tag[:src_start] + data_url + full_tag[src_end:]
            else:
                # File not found: show a detailed, HTML-escaped notice.
                error_content = f"""
                <div style="
                    color: #d32f2f;
                    border: 2px dashed #d32f2f;
                    padding: 10px;
                    margin: 10px 0;
                    border-radius: 5px;
                    background-color: #ffebee;
                    text-align: center;
                ">
                    <strong>🖼️ 图片无法加载</strong><br>
                    <small>原始路径: {escape(img_src)}</small><br>
                    <small>JSON文件: {escape(os.path.basename(json_path))}</small><br>
                    <em>请检查图片文件是否存在</em>
                </div>
                """
                return error_content
        except OSError as e:
            # Reading the file failed: report the error instead of the image.
            error_content = f"""
            <div style="
                color: #f57c00;
                border: 2px dashed #f57c00;
                padding: 10px;
                margin: 10px 0;
                border-radius: 5px;
                background-color: #fff3e0;
                text-align: center;
            ">
                <strong>⚠️ 图片处理失败</strong><br>
                <small>文件: {escape(img_src)}</small><br>
                <small>错误: {escape(str(e))}</small>
            </div>
            """
            return error_content

    return re.sub(img_pattern, replace_html_image, html_content, flags=re.IGNORECASE)
|
|
|
+
|
|
|
+
|
|
|
def process_markdown_images(md_content: str, json_path: str) -> str:
    """Inline local images referenced in Markdown as base64 data URLs.

    Relative image paths are resolved against the directory of ``json_path``.
    References that already point at a data URL or a remote URL are left
    untouched. A reference whose file does not exist keeps its original
    target and gets a warning appended to its alt text; a reference whose
    file cannot be read is returned unchanged.

    Args:
        md_content: Markdown text to process.
        json_path: Path of the JSON file; used to resolve relative image
            paths.

    Returns:
        The Markdown text with local images embedded as base64.
    """
    # Matches Markdown image syntax: ![alt](path)
    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    # Targets that are already inline or remote are left untouched.
    passthrough_prefixes = ('data:', 'http://', 'https://', '//')

    def replace_image(match):
        alt_text = match.group(1)
        img_path = match.group(2)

        if img_path.lower().startswith(passthrough_prefixes):
            return match.group(0)

        if os.path.isabs(img_path):
            full_img_path = img_path
        else:
            # Relative paths are anchored at the JSON file's directory.
            full_img_path = os.path.join(os.path.dirname(json_path), img_path)

        if not os.path.isfile(full_img_path):
            # Missing file: keep the original target, flag it in the alt text.
            warned_alt = f'{alt_text} (文件不存在)'.strip()
            return f'![{warned_alt}]({img_path})'

        try:
            with open(full_img_path, 'rb') as img_file:
                img_data = img_file.read()
        except OSError:
            # Unreadable file: fall back to the original reference.
            return match.group(0)

        # Pick the MIME type from the file extension.
        ext = os.path.splitext(full_img_path)[1].lower()
        mime_type = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.bmp': 'image/bmp',
            '.webp': 'image/webp'
        }.get(ext, 'image/jpeg')

        img_base64 = base64.b64encode(img_data).decode('utf-8')
        data_url = f"data:{mime_type};base64,{img_base64}"

        return f'![{alt_text}]({data_url})'

    return re.sub(img_pattern, replace_image, md_content)
|
|
|
+
|
|
|
+
|
|
|
def process_all_images_in_content(content: str, json_path: str) -> str:
    """Embed every image reference in ``content``, HTML and Markdown alike.

    Args:
        content: Text that may contain HTML ``<img>`` tags and Markdown
            image references.
        json_path: Path of the JSON file, forwarded to both processors.

    Returns:
        The text after the HTML pass followed by the Markdown pass.
    """
    # HTML tags are handled first; the Markdown pass runs on that result.
    html_processed = process_html_images(content, json_path)
    return process_markdown_images(html_processed, json_path)
|
|
|
+
|
|
|
+
|
|
|
def convert_html_table_to_markdown(content: str, max_cell_len: Optional[int] = 30) -> str:
    """Convert every HTML ``<table>`` in ``content`` to a Markdown table.

    The first row that contains cells becomes the header row. Shorter rows
    are padded with blank cells so all rows have the same number of columns.
    Cell text has its tags removed, entities decoded, whitespace (including
    line breaks) collapsed to single spaces and ``|`` escaped, so a cell can
    never break the table layout. Tables with more than 8 columns are
    preceded by a note that horizontal scrolling may be needed.

    Args:
        content: Text containing HTML tables.
        max_cell_len: Maximum number of characters kept per cell; longer
            text is cut and suffixed with ``...``. ``None`` disables
            truncation.

    Returns:
        ``content`` with each table replaced by its Markdown form. A table
        without any row or cell is left as the original HTML.
    """
    flags = re.DOTALL | re.IGNORECASE

    def clean_cell(cell_html):
        # A <br> separates words visually, so turn it into a space before
        # the remaining tags are dropped.
        text = re.sub(r'<br\s*/?>', ' ', cell_html, flags=re.IGNORECASE)
        text = unescape(re.sub(r'<[^>]+>', '', text))
        # A raw newline inside a cell would end the Markdown row early.
        text = ' '.join(text.split())
        if max_cell_len is not None and len(text) > max_cell_len:
            text = text[:max(max_cell_len - 3, 0)] + "..."
        # Escape after truncating so an escape sequence is never cut in half.
        # Empty cells become a single space.
        return text.replace('|', '\\|') or " "

    def replace_table(match):
        table_html = match.group(0)

        parsed_rows = []
        for row_html in re.findall(r'<tr\b[^>]*>(.*?)</tr>', table_html, flags):
            cells = re.findall(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', row_html, flags)
            if cells:
                parsed_rows.append([clean_cell(cell) for cell in cells])

        if not parsed_rows:
            return table_html

        max_cols = max(len(cells) for cells in parsed_rows)

        markdown_rows = []
        for index, cells in enumerate(parsed_rows):
            padded = cells + [" "] * (max_cols - len(cells))
            markdown_rows.append('| ' + ' | '.join(padded) + ' |')
            # The separator goes right after the header row.
            if index == 0:
                markdown_rows.append('| ' + ' | '.join(['---'] * max_cols) + ' |')

        markdown_table = '\n'.join(markdown_rows)

        if max_cols > 8:
            scroll_note = "\n> 📋 **提示**: 此表格列数较多,在某些视图中可能需要横向滚动查看完整内容。\n"
            return scroll_note + markdown_table

        return markdown_table

    return re.sub(r'<table\b[^>]*>.*?</table>', replace_table, content, flags=flags)
|
|
|
+
|
|
|
+
|
|
|
def parse_html_tables(html_content: str) -> List[pd.DataFrame]:
    """Parse the tables in an HTML string into DataFrames.

    Parsing is best-effort: any failure yields an empty list rather than an
    exception. Unexpected failures are logged so that problems such as a
    missing parser backend remain diagnosable.

    Args:
        html_content: HTML text to scan for tables.

    Returns:
        One DataFrame per parsed table, or an empty list when there is
        nothing to parse or parsing fails.
    """
    # Without a <table> tag there is nothing for pandas to find.
    if not html_content or '<table' not in html_content.lower():
        return []

    try:
        tables = pd.read_html(StringIO(html_content))
    except ValueError:
        # pandas signals "no tables found" with ValueError.
        return []
    except Exception:
        logging.getLogger(__name__).warning(
            "Failed to parse HTML tables", exc_info=True
        )
        return []

    return tables or []
|
|
|
+
|