Bläddra i källkod

feat: Add output formatter v2 and PDF extraction utilities

- Implemented OutputFormatterV2 for unified output formatting, supporting multiple formats including JSON, Markdown, and HTML.
- Introduced pdf_extractor.py for extracting specified pages from PDF files with command-line interface.
- Created pdf_utils.py for PDF processing utilities, including page range parsing and text extraction.
- Developed visualization_utils.py for visualizing document processing results, including layout and OCR visualizations.
zhch158_admin 2 veckor sedan
förälder
incheckning
d810bf44fc

+ 71 - 0
ocr_utils/__init__.py

@@ -0,0 +1,71 @@
"""
OCR utility package.

Aggregates document-processing helpers, including:
- PDF processing utilities
- JSON/Markdown/HTML formatting utilities
- File handling utilities
- Financial-number normalization utilities
"""

from .pdf_utils import PDFUtils
from .json_formatters import JSONFormatters
from .markdown_generator import MarkdownGenerator
from .html_generator import HTMLGenerator
from .visualization_utils import VisualizationUtils
from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
from .pdf_extractor import extract_pdf_pages
from .normalize_financial_numbers import (
    normalize_financial_numbers,
    normalize_json_table,
    normalize_markdown_table,
    normalize_json_file
)
from .file_utils import (
    get_input_files,
    collect_pid_files,
    get_image_files_from_dir,
    get_image_files_from_list,
    get_image_files_from_csv,
    convert_pdf_to_images,
    split_files,
    create_temp_file_list
)
from .log_utils import setup_logging

# Public API of the package; keep in sync with the imports above.
__all__ = [
    # PDF utilities
    'PDFUtils',
    'extract_pdf_pages',
    # JSON formatting
    'JSONFormatters',
    # Markdown generation
    'MarkdownGenerator',
    # HTML generation
    'HTMLGenerator',
    # Visualization
    'VisualizationUtils',
    # Output formatting
    'OutputFormatterV2',
    'save_mineru_format',
    # Number normalization
    'normalize_financial_numbers',
    'normalize_json_table',
    'normalize_markdown_table',
    'normalize_json_file',
    # File utilities
    'get_input_files',
    'collect_pid_files',
    'get_image_files_from_dir',
    'get_image_files_from_list',
    'get_image_files_from_csv',
    'convert_pdf_to_images',
    'split_files',
    'create_temp_file_list',
    # Logging utilities
    'setup_logging',
]

# Package metadata
__version__ = "1.0.0"
__author__ = "zhch158"
+

+ 397 - 0
ocr_utils/file_utils.py

@@ -0,0 +1,397 @@
+"""
+文件处理工具模块
+
+提供文件处理相关功能:
+- 输入文件获取(支持文件/目录/列表/CSV)
+- PDF转图片
+- 文件列表处理
+"""
+import tempfile
+from pathlib import Path
+from typing import List, Tuple
+import json
+import traceback
+from loguru import logger
+
+try:
+    from mineru.utils.pdf_image_tools import load_images_from_pdf
+    from mineru.utils.enum_class import ImageType
+    MINERU_AVAILABLE = True
+except ImportError:
+    MINERU_AVAILABLE = False
+    load_images_from_pdf = None
+    ImageType = None
+
+
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
    """
    Split a list of file paths into ``num_splits`` roughly equal chunks.

    The first ``len(file_list) % num_splits`` chunks each receive one
    extra item, so chunk sizes differ by at most one.

    Args:
        file_list: File paths to distribute.
        num_splits: Desired number of chunks.

    Returns:
        The non-empty chunks; when ``num_splits`` is not positive, the
        whole input wrapped in a single chunk.
    """
    if num_splits <= 0:
        return [file_list]

    base_size, extra = divmod(len(file_list), num_splits)

    result: List[List[str]] = []
    offset = 0
    for index in range(num_splits):
        # The first `extra` chunks absorb the remainder, one item each.
        size = base_size + (1 if index < extra else 0)
        if size > 0:
            result.append(file_list[offset:offset + size])
            offset += size

    return result
+
+
def create_temp_file_list(file_chunk: List[str]) -> str:
    """
    Write the given file paths to a temporary ``.txt`` file, one per line.

    The file is created with ``delete=False`` so it survives after the
    handle closes; the caller is responsible for removing it.

    Args:
        file_chunk: File paths to write.

    Returns:
        Path of the temporary list file.
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        tmp.write("".join(f"{path}\n" for path in file_chunk))
        return tmp.name
+
+
+def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int | None = None) -> List[str]:
+    """
+    从目录获取图像文件列表
+    
+    Args:
+        input_dir: 输入目录
+        pattern: 文件名模式
+        max_files: 最大文件数量限制
+        
+    Returns:
+        图像文件路径列表
+    """
+    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
+    image_files = []
+    
+    for ext in image_extensions:
+        image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
+        image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
+    
+    # 去重并排序
+    image_files = sorted(list(set(str(f) for f in image_files)))
+    
+    # 限制文件数量
+    if max_files:
+        image_files = image_files[:max_files]
+    
+    return image_files
+
+
def get_image_files_from_list(file_list_path: str) -> List[str]:
    """
    Read image paths from a plain-text list file (one path per line).

    Blank lines are ignored.  Paths that do not exist on disk are
    dropped, with a warning echoing at most five of the missing entries.

    Args:
        file_list_path: Path of the text file listing images.

    Returns:
        The subset of listed paths that exist on disk.
    """
    logger.info(f"📄 Reading file list from: {file_list_path}")

    with open(file_list_path, 'r', encoding='utf-8') as handle:
        listed = [line.strip() for line in handle if line.strip()]

    # Partition into existing and missing paths in one pass.
    valid_files: List[str] = []
    missing_files: List[str] = []
    for candidate in listed:
        target = valid_files if Path(candidate).exists() else missing_files
        target.append(candidate)

    if missing_files:
        logger.warning(f"⚠️ Warning: {len(missing_files)} files not found:")
        for lost in missing_files[:5]:  # echo at most five entries
            logger.warning(f"  - {lost}")
        if len(missing_files) > 5:
            logger.warning(f"  ... and {len(missing_files) - 5} more")

    logger.info(f"✅ Found {len(valid_files)} valid files out of {len(listed)} in list")
    return valid_files
+
+
def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
    """
    Read image paths from a status CSV, keeping rows with a given status.

    Expected layout: a header row ``image_path,status`` followed by one
    row per image.  The status comparison is case-insensitive.

    Args:
        csv_file: Path of the CSV file.
        status_filter: Status value to keep (e.g. ``"fail"``).

    Returns:
        Image paths whose status equals ``status_filter``.
    """
    import csv  # local import: keeps the module's top-level deps unchanged

    logger.info(f"📄 Reading image files from CSV: {csv_file}")

    wanted = status_filter.lower()
    image_files: List[str] = []
    # BUG FIX: the old code split lines on "," by hand, which breaks on
    # quoted paths containing commas, and never actually skipped the header
    # row despite the stated format requiring it.  csv.reader parses quoting
    # correctly; next() drops the header.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header row: image_path,status
        for row in reader:
            if len(row) >= 2 and row[1].strip().lower() == wanted:
                image_files.append(row[0])

    return image_files
+
+
def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
    """
    Collect (image_path, status) pairs from a per-process result summary.

    The file is the JSON summary written by a single worker process:

        "results": [
          {
            "image_path": "....jpg",
            "processing_time": ...,
            "success": true,
            "device": "gpu:3",
            "output_json": "...",
            "output_md": "..."
          },
          ...
        ]

    Args:
        pid_output_file: Path of the worker's JSON summary file.

    Returns:
        One ``(image_path, status)`` tuple per entry, where status is
        ``"success"`` when the entry's ``success`` flag is true and
        ``"fail"`` otherwise.  Empty list if the file is missing or has
        an unexpected shape.
    """
    if not Path(pid_output_file).exists():
        logger.warning(f"⚠️ Warning: PID output file not found: {pid_output_file}")
        return []

    with open(pid_output_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if not isinstance(payload, dict) or "results" not in payload:
        logger.warning(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    # Map each worker entry to (path, "success"/"fail").
    return [
        (entry.get("image_path", ""),
         "success" if entry.get("success", False) else "fail")
        for entry in payload.get("results", [])
    ]
+
+
def convert_pdf_to_images(
    pdf_file: str,
    output_dir: str | None = None,
    dpi: int = 200,
    page_range: str | None = None
) -> List[str]:
    """
    Convert a PDF into per-page PNG images, with optional page filtering.

    Images are written to ``<output_dir or pdf dir>/<pdf_stem>/`` and
    named ``<pdf_stem>_page_<NNN>.png`` using the ORIGINAL 1-based page
    number, so filenames stay stable even when only a subset of pages is
    rendered.

    Args:
        pdf_file: PDF file path.
        output_dir: Base output directory (defaults to a folder named
            after the PDF, next to it).
        dpi: Rendering resolution.
        page_range: Page-range string such as "1-5,7,9-12", parsed by
            ``PDFUtils.parse_page_range``.

    Returns:
        Paths of the generated images; empty list on any failure
        (invalid PDF, MinerU unavailable, empty page selection, or a
        rendering error).
    """
    pdf_path = Path(pdf_file)
    if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
        logger.error(f"❌ Invalid PDF file: {pdf_path}")
        return []

    # Default to a directory named after the PDF, next to the PDF itself
    if output_dir is None:
        output_path = pdf_path.parent / f"{pdf_path.stem}"
    else:
        output_path = Path(output_dir) / f"{pdf_path.stem}"
    output_path = output_path.resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        # Render pages via MinerU's loader (optional dependency)
        if not MINERU_AVAILABLE or load_images_from_pdf is None or ImageType is None:
            logger.error("❌ MinerU components not available for PDF to image conversion")
            return []

        images, _ = load_images_from_pdf(
            pdf_path.read_bytes(),
            dpi=dpi,
            image_type=ImageType.PIL  # returns a list of dicts containing 'img_pil'
        )

        # Apply the page-range filter, if one was given
        selected_pages = None
        if page_range:
            from .pdf_utils import PDFUtils
            total_pages = len(images)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            if selected_pages:
                images = [images[i] for i in sorted(selected_pages)]
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
            else:
                logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
                return []

        image_paths = []
        # Track the ORIGINAL page indices so filtered runs keep stable filenames
        original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))

        for idx, image in enumerate(images):
            # Original page index (used only for the filename)
            original_idx = original_indices[idx] if selected_pages else idx
            # Filename uses the original page number, 1-based, zero-padded
            image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
            image_path = output_path / image_filename

            # Save the image - the loader may return a dict holding 'img_pil'
            if isinstance(image, dict):
                pil_image = image.get('img_pil')
                if pil_image is None:
                    logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
                    continue
                pil_image.save(str(image_path))
            else:
                # Otherwise assume it is a PIL Image directly
                image.save(str(image_path))
            image_paths.append(str(image_path))

        logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
        return image_paths

    except Exception as e:
        logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
        traceback.print_exc()
        return []
+
+
def get_input_files(args, page_range: str | None = None) -> List[str]:
    """
    Build the list of input image files, handling PDFs and images
    uniformly, with optional page-range filtering.

    The input type is auto-detected:
    - a file path: dispatched by suffix (PDF, image, CSV list, text list)
    - a directory: scanned for all PDF and image files
    - a CSV file: rows with status "fail" are read as the file list
    - a text file (.txt/.list): one image path per line

    Args:
        args: Parsed CLI arguments; must provide ``input`` and may
            provide ``output_dir`` and ``pdf_dpi``.
        page_range: Optional page-range string such as "1-5,7,9-12".

    Returns:
        Sorted, de-duplicated list of image file paths to process.
        NOTE(review): the final sorted(set(...)) reorders everything
        lexicographically, which can interleave pages from different
        PDFs — confirm callers do not rely on per-document ordering.
    """
    input_files = []
    input_path = Path(args.input)

    if not input_path.exists():
        logger.error(f"❌ Input path does not exist: {input_path}")
        return []

    # Detect the input type
    if input_path.is_file():
        # Single file
        if input_path.suffix.lower() == '.pdf':
            # PDF file: convert to images
            logger.info(f"📄 Processing PDF: {input_path.name}")
            pdf_images = convert_pdf_to_images(
                str(input_path),
                getattr(args, 'output_dir', None),
                dpi=getattr(args, 'pdf_dpi', 200),
                page_range=page_range  # forward the page-range filter
            )
            input_files.extend(pdf_images)
        elif input_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']:
            # Image file: add directly
            input_files.append(str(input_path))
        elif input_path.suffix.lower() == '.csv':
            # CSV file: read the file list (only rows with status "fail")
            input_files = get_image_files_from_csv(str(input_path), "fail")
        elif input_path.suffix.lower() in ['.txt', '.list']:
            # Text file: read the file list
            input_files = get_image_files_from_list(str(input_path))
        else:
            logger.warning(f"⚠️ Unsupported file type: {input_path.suffix}")

    elif input_path.is_dir():
        # Directory: scan for all PDF and image files (non-recursive)
        image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
        pdf_extensions = ['.pdf']

        raw_files = []
        for ext in image_extensions + pdf_extensions:
            raw_files.extend(list(input_path.glob(f"*{ext}")))
            raw_files.extend(list(input_path.glob(f"*{ext.upper()}")))

        # Separate PDFs from images
        pdf_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() == '.pdf']
        image_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() in image_extensions]

        # For an image directory, page_range acts as an index filter over
        # the sorted image files (each image treated as one "page")
        if page_range and image_files:
            from .pdf_utils import PDFUtils
            total_pages = len(image_files)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            if selected_pages:
                image_files = [image_files[i] for i in sorted(selected_pages)]
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")
            else:
                logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效图片")
                image_files = []

        # Process PDFs and images separately
        pdf_count = 0
        image_count = 0

        for file_path in pdf_files:
            # Convert each PDF to images
            logger.info(f"📄 Processing PDF: {file_path.name}")
            pdf_images = convert_pdf_to_images(
                str(file_path),
                getattr(args, 'output_dir', None),
                dpi=getattr(args, 'pdf_dpi', 200),
                page_range=page_range  # forward the page-range filter
            )
            input_files.extend(pdf_images)
            pdf_count += 1

        for file_path in image_files:
            # Add image files directly
            input_files.append(str(file_path))
            image_count += 1

        logger.info(f"📊 Input summary:")
        logger.info(f"  PDF files processed: {pdf_count}")
        logger.info(f"  Image files found: {image_count}")

    logger.info(f"📊 Total image files to process: {len(input_files)}")

    return sorted(list(set(str(f) for f in input_files)))
+

+ 199 - 0
ocr_utils/html_generator.py

@@ -0,0 +1,199 @@
+"""
+HTML 生成器模块
+
+提供 HTML 输出功能:
+- 表格 HTML 生成(带样式)
+- 单元格坐标展示
+"""
+import json
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+
+
class HTMLGenerator:
    """HTML generator for table outputs with inline styling."""

    @staticmethod
    def save_table_htmls(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> Path:
        """
        Save one styled, standalone HTML file per detected table.

        Naming convention:
        - PDF input: <doc>_table_1_page_001.html
        - image input (single page): <doc>_table_1.html

        Args:
            results: Processing results (expects 'pages' -> 'elements').
            output_dir: Output directory; a 'tables' subfolder is created.
            doc_name: Document name used in filenames and page titles.
            is_pdf: Whether the input was a PDF.

        Returns:
            Path of the 'tables' directory.
        """
        tables_dir = output_dir / 'tables'
        tables_dir.mkdir(exist_ok=True)

        table_count = 0
        total_pages = len(results.get('pages', []))

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            for element in page.get('elements', []):
                if element.get('type') in ['table', 'table_body']:
                    # table_count numbers tables across the whole document
                    table_count += 1
                    content = element.get('content', {})
                    html = content.get('html', '')
                    cells = content.get('cells', [])

                    if html:
                        full_html = HTMLGenerator._generate_table_html_with_styles(
                            html, cells, doc_name, page_idx, table_count
                        )

                        # Pick the filename by input type: multi-page inputs
                        # get a page suffix, single images do not
                        if is_pdf or total_pages > 1:
                            html_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx + 1:03d}.html"
                        else:
                            html_path = tables_dir / f"{doc_name}_table_{table_count}.html"

                        with open(html_path, 'w', encoding='utf-8') as f:
                            f.write(full_html)

        if table_count > 0:
            logger.info(f"📊 {table_count} tables saved to: {tables_dir}")

        return tables_dir

    @staticmethod
    def _generate_table_html_with_styles(
        table_html: str,
        cells: List[Dict],
        doc_name: str,
        page_idx: int,
        table_idx: int
    ) -> str:
        """
        Build a complete standalone HTML page for one table.

        The page embeds the table fragment, CSS hover tooltips that show
        each cell's ``data-bbox`` attribute, and a collapsible JSON dump
        of the cell coordinate data.

        Args:
            table_html: Table HTML fragment.
            cells: Cell dicts, serialized verbatim into the page.
            doc_name: Document name.
            page_idx: Zero-based page index (displayed 1-based).
            table_idx: Table sequence number.

        Returns:
            Full HTML document string.
        """
        cells_json = json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]"

        return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{doc_name} - Table {table_idx}</title>
    <style>
        body {{
            font-family: Arial, "Microsoft YaHei", sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
            border-radius: 8px;
        }}
        .meta {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #ddd;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        tr:hover {{
            background-color: #f9f9f9;
        }}
        td[data-bbox], th[data-bbox] {{
            position: relative;
        }}
        td[data-bbox]:hover::after, th[data-bbox]:hover::after {{
            content: attr(data-bbox);
            position: absolute;
            bottom: 100%;
            left: 0;
            background: #333;
            color: white;
            padding: 2px 6px;
            font-size: 10px;
            border-radius: 3px;
            white-space: nowrap;
            z-index: 100;
        }}
        .cells-info {{
            margin-top: 30px;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 5px;
        }}
        .cells-info summary {{
            cursor: pointer;
            font-weight: bold;
            color: #333;
        }}
        .cells-info pre {{
            background-color: #2d2d2d;
            color: #f8f8f2;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            font-size: 12px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="meta">
            <p><strong>Document:</strong> {doc_name}</p>
            <p><strong>Page:</strong> {page_idx + 1}</p>
            <p><strong>Table:</strong> {table_idx}</p>
            <p><strong>Cells with coordinates:</strong> {len(cells)}</p>
        </div>
        
        {table_html}
        
        <div class="cells-info">
            <details>
                <summary>📍 单元格坐标数据 (JSON)</summary>
                <pre>{cells_json}</pre>
            </details>
        </div>
    </div>
</body>
</html>"""
+

+ 396 - 0
ocr_utils/json_formatters.py

@@ -0,0 +1,396 @@
+"""
+JSON 格式化工具模块
+
+提供 JSON 输出格式化功能:
+- MinerU middle.json 格式转换
+- mineru_vllm_results_cell_bbox 格式转换
+- 表格单元格格式化
+- 金额数字标准化(全角→半角)
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+
+# 导入数字标准化工具
+from .normalize_financial_numbers import normalize_json_table
+
class JSONFormatters:
    """JSON formatting utilities for OCR/document processing results."""

    @staticmethod
    def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert processing results to MinerU's standard middle.json format.

        The output is consumed by ``vlm_union_make`` to generate Markdown.

        Args:
            results: Processing results; expects 'pages' and optionally
                'scene'.

        Returns:
            Dict in MinerU middle.json layout ('pdf_info' plus metadata).
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for page in results.get('pages', []):
            page_info = {
                'page_idx': page['page_idx'],
                # first two entries of image_shape, reversed — presumably
                # (h, w) -> [w, h]; confirm against the producer of 'image_shape'
                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            # Regular elements; header/footer-like types are routed into
            # discarded_blocks instead of para_blocks
            for element in page.get('elements', []):
                block = JSONFormatters._element_to_middle_block(element)
                if block:
                    elem_type = element.get('type', '')
                    if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
                        page_info['discarded_blocks'].append(block)
                    else:
                        page_info['para_blocks'].append(block)

            # Elements the page already carries in its own discarded list
            for element in page.get('discarded_blocks', []):
                block = JSONFormatters._element_to_middle_block(element)
                if block:
                    page_info['discarded_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Convert one element to a MinerU middle.json block.

        MinerU expects nested structures for media blocks:
        - image: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
        - table: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }

        NOTE(review): element types not matched by any branch fall through
        and are still returned as a bare block with empty 'lines'.
        """
        elem_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])
        content = element.get('content', {})

        block = {
            'type': elem_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),
            'reading_order': element.get('reading_order', 0),
            'lines': []
        }

        # Plain text types
        if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        # Table types - nested structure
        elif elem_type in ['table', 'table_body']:
            table_html = content.get('html', '')
            cells = content.get('cells', [])

            block['type'] = 'table'
            block['blocks'] = [{
                'type': 'table_body',
                'bbox': bbox,
                'angle': 0,
                'lines': [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'table',
                        'html': table_html,
                        'cells': cells
                    }]
                }]
            }]

        # Image types - nested structure
        elif elem_type in ['image', 'image_body', 'figure']:
            block['type'] = 'image'
            block['blocks'] = [{
                'type': 'image_body',
                'bbox': bbox,
                'angle': element.get('angle', 0),
                'lines': [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'image',
                        'image_path': content.get('image_path', ''),
                        'description': content.get('description', '')
                    }]
                }]
            }]

        # Equation types
        elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
            latex = content.get('latex', '')
            block['lines'] = [{
                'bbox': bbox,
                'spans': [{
                    'bbox': bbox,
                    'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
                    'content': latex
                }]
            }]

        # Captions/footnotes attached to tables or images
        elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        # Discarded types
        elif elem_type in ['abandon', 'discarded']:
            block['type'] = 'abandon'
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        return block

    @staticmethod
    def save_page_jsons(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True,
        normalize_numbers: bool = True
    ) -> List[str]:
        """
        Save one JSON file per page (mineru_vllm_results_cell_bbox format).

        Naming convention:
        - PDF input: <doc>_page_001.json
        - image input (single page): <doc>.json

        When number normalization changes the content, the
        pre-normalization version is also kept as <name>_original.json.

        Args:
            results: Processing results.
            output_dir: Output directory.
            doc_name: Document name.
            is_pdf: Whether the input was a PDF.
            normalize_numbers: Normalize full-width digits to half-width.

        Returns:
            Paths of the saved per-page JSON files.
        """
        saved_paths = []
        total_pages = len(results.get('pages', []))

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            # Pick the filename by input type (multi-page gets a page suffix)
            if is_pdf or total_pages > 1:
                page_name = f"{doc_name}_page_{page_idx + 1:03d}"
            else:
                page_name = doc_name

            # Convert elements to the mineru_vllm_results_cell_bbox format
            page_elements = []
            for element in page.get('elements', []):
                converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
                if converted:
                    page_elements.append(converted)

            # Include elements the page marked as discarded
            for element in page.get('discarded_blocks', []):
                converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
                if converted:
                    page_elements.append(converted)

            # Serialize to a JSON string
            json_content = json.dumps(page_elements, ensure_ascii=False, indent=2)

            # Normalize full-width financial digits; only write the
            # "_original" copy when normalization actually changed the text
            if normalize_numbers:
                original_content = json_content
                json_content = normalize_json_table(json_content)

                if json_content != original_content:
                    original_path = output_dir / f"{page_name}_original.json"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.debug(f"📄 Original page JSON saved: {original_path}")

            # Write the (possibly normalized) page JSON
            json_path = output_dir / f"{page_name}.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(json_content)

            saved_paths.append(str(json_path))
            logger.debug(f"📄 Page JSON saved: {json_path}")

        if saved_paths:
            logger.info(f"📄 {len(saved_paths)} page JSONs saved")

        return saved_paths

    @staticmethod
    def _element_to_cell_bbox_format(
        element: Dict[str, Any],
        page_idx: int
    ) -> Optional[Dict[str, Any]]:
        """
        Convert one element to the mineru_vllm_results_cell_bbox format.

        Returns None for element types that have no mapping.
        """
        elem_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])
        content = element.get('content', {})

        # Force bbox into a 4-element integer list
        bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]

        result = {
            'bbox': bbox,
            'page_idx': page_idx,
            'reading_order': element.get('reading_order', 0)
        }

        # Text types
        if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            result['type'] = 'text' if elem_type != 'title' else 'title'
            result['text'] = text
            if elem_type == 'title':
                result['text_level'] = element.get('level', 1)

        # Table types
        elif elem_type in ['table', 'table_body']:
            result['type'] = 'table'
            result['img_path'] = content.get('table_image_path', '')
            result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
            result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
            result['table_body'] = content.get('html', '')

            # Key field: the table_cells array
            cells = content.get('cells', [])
            if cells:
                result['table_cells'] = JSONFormatters.format_table_cells(cells)

            # Rotation and skew information, when present
            if 'table_angle' in content:
                result['image_rotation_angle'] = float(content['table_angle'])
            if 'skew_angle' in content:
                result['skew_angle'] = float(content['skew_angle'])

        # Image types
        elif elem_type in ['image', 'image_body', 'figure']:
            result['type'] = 'image'
            image_filename = content.get('image_path', '')
            result['img_path'] = f"images/{image_filename}" if image_filename else ''
            result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
            result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))

        # Equation types
        elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
            result['type'] = 'equation'
            result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
            result['text_format'] = 'latex'

        # List type
        elif elem_type == 'list':
            result['type'] = 'list'
            result['sub_type'] = 'text'
            result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []

        # Header / footer
        elif elem_type in ['header', 'footer']:
            result['type'] = elem_type
            result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)

        # Captions/footnotes attached to tables or images
        elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
            result['type'] = elem_type
            result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)

        # Discarded elements
        elif elem_type in ['discarded', 'abandon']:
            result['type'] = 'discarded'
            result['original_category'] = element.get('original_category', 'unknown')
            result['text'] = content.get('text', '') if isinstance(content, dict) else ''

        else:
            return None

        return result

    @staticmethod
    def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
        """
        Format table cells into the mineru_vllm_results_cell_bbox layout.

        Output format:
        {
            "type": "table_cell",
            "text": "cell content",
            "matched_text": "matched OCR text",
            "bbox": [x1, y1, x2, y2],
            "row": 1,
            "col": 1,
            "score": 100.0,
            "paddle_bbox_indices": [0, 1]
        }
        """
        formatted_cells = []

        for cell in cells:
            formatted_cell = {
                'type': 'table_cell',
                'text': cell.get('text', ''),
                # falls back to 'text' when no separate OCR match exists
                'matched_text': cell.get('matched_text', cell.get('text', '')),
                'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
                'row': cell.get('row', 0),
                'col': cell.get('col', 0),
                'score': float(cell.get('score', 100.0)),
                # accept both historical key spellings
                'paddle_bbox_indices': cell.get('paddle_bbox_indices',
                                                cell.get('paddle_indices', []))
            }
            formatted_cells.append(formatted_cell)

        return formatted_cells

    @staticmethod
    def _ensure_list(value) -> List:
        """Coerce a value to a list (None -> [], "" -> [], str -> [str], other -> [str(other)])."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, list):
            return value
        return [str(value)]
+

+ 35 - 0
ocr_utils/log_utils.py

@@ -0,0 +1,35 @@
+"""
+日志工具模块
+
+提供统一的日志配置功能
+"""
+import sys
+from pathlib import Path
+from loguru import logger
+
def setup_logging(log_level: str = "INFO", log_file: str | None = None):
    """
    Configure loguru sinks for the application.

    Args:
        log_level: console log level (DEBUG, INFO, WARNING, ERROR)
        log_file: optional log file path; when given, a DEBUG-level
            rotating file sink (10 MB) is added as well
    """
    # Drop loguru's default handler before installing ours.
    logger.remove()

    console_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
    )
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"

    # Console sink
    logger.add(sys.stdout, level=log_level, format=console_format)

    # Optional file sink (always DEBUG so the file keeps full detail)
    if log_file:
        logger.add(log_file, level="DEBUG", format=file_format, rotation="10 MB")
+

+ 395 - 0
ocr_utils/markdown_generator.py

@@ -0,0 +1,395 @@
+"""
+Markdown 生成器模块
+
+提供 Markdown 输出功能:
+- 完整文档 Markdown 生成
+- 按页 Markdown 生成
+- MinerU union_make 集成
+- 金额数字标准化(全角→半角)
+"""
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Tuple, Optional
+from loguru import logger
+
# Import MinerU components: make the repository root importable so the
# optional MinerU backend can be resolved when this package lives inside it.
mineru_path = Path(__file__).parents[3]
if str(mineru_path) not in sys.path:
    sys.path.insert(0, str(mineru_path))

try:
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
    from mineru.utils.enum_class import MakeMode
    MINERU_AVAILABLE = True
except ImportError:
    # MinerU is optional: fall back to the custom Markdown generator and
    # provide a minimal stand-in for MakeMode so later references resolve.
    MINERU_AVAILABLE = False
    vlm_union_make = None

    class MakeMode:
        MM_MD = 'mm_md'
        NLP_MD = 'nlp_md'

# Number-normalization helper (full-width -> half-width digits/punctuation)
from .normalize_financial_numbers import normalize_markdown_table
+
+
class MarkdownGenerator:
    """Markdown generator: renders pipeline results as Markdown documents."""

    @staticmethod
    def save_markdown(
        results: Dict[str, Any],
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        use_mineru_union: bool = False,
        normalize_numbers: bool = True
    ) -> Tuple[Path, Optional[Path]]:
        """
        Save the full-document Markdown file.

        Uses the custom implementation by default so that every element type
        (including table_caption etc.) is handled correctly. MinerU
        union_make can be opted into, but it does not emit standalone
        elements such as table_caption.

        Args:
            results: processing results
            middle_json: data in middle.json format
            output_dir: output directory
            doc_name: document name
            use_mineru_union: whether to use MinerU union_make (default False)
            normalize_numbers: whether to normalize financial numbers
                (full-width -> half-width)

        Returns:
            (Markdown file path, original-content file path or None)
        """
        md_path = output_dir / f"{doc_name}.md"
        original_path = None

        if use_mineru_union and MINERU_AVAILABLE and vlm_union_make is not None:
            try:
                img_bucket_path = "images"
                markdown_content = vlm_union_make(
                    middle_json['pdf_info'],
                    MakeMode.MM_MD,
                    img_bucket_path
                )

                if markdown_content:
                    # union_make may return a per-page list; join into one doc.
                    if isinstance(markdown_content, list):
                        markdown_content = '\n\n'.join(markdown_content)

                    header = MarkdownGenerator._generate_header(results)
                    markdown_content = header + str(markdown_content)

                    # Normalize financial numbers (full-width -> half-width)
                    if normalize_numbers:
                        original_content = markdown_content
                        markdown_content = normalize_markdown_table(markdown_content)

                        # Keep a pre-normalization copy for auditing
                        if markdown_content != original_content:
                            original_path = output_dir / f"{doc_name}_original.md"
                            with open(original_path, 'w', encoding='utf-8') as f:
                                f.write(original_content)
                            logger.info(f"📝 Original Markdown saved: {original_path}")

                    with open(md_path, 'w', encoding='utf-8') as f:
                        f.write(markdown_content)

                    logger.info(f"📝 Markdown saved (MinerU format): {md_path}")
                    return md_path, original_path

            except Exception as e:
                logger.warning(f"MinerU union_make failed: {e}, falling back to custom implementation")

        # Custom implementation: guarantees every element type is rendered
        markdown_content = MarkdownGenerator._generate_full_markdown(results)

        # Normalize financial numbers (full-width -> half-width)
        if normalize_numbers:
            original_content = markdown_content
            markdown_content = normalize_markdown_table(markdown_content)

            if markdown_content != original_content:
                original_path = output_dir / f"{doc_name}_original.md"
                with open(original_path, 'w', encoding='utf-8') as f:
                    f.write(original_content)
                logger.info(f"📝 Original Markdown saved: {original_path}")

        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        logger.info(f"📝 Markdown saved (custom format): {md_path}")
        return md_path, original_path

    @staticmethod
    def save_page_markdowns(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True,
        normalize_numbers: bool = True
    ) -> List[str]:
        """
        Save one Markdown file per page.

        Naming rules:
        - PDF input: <doc_name>_page_001.md
        - Single-image input: <doc_name>.md (skipped: full version exists)

        Args:
            results: processing results
            output_dir: output directory
            doc_name: document name
            is_pdf: whether the input was a PDF
            normalize_numbers: whether to normalize financial numbers
                (full-width -> half-width)

        Returns:
            List of saved Markdown file paths
        """
        saved_paths = []
        total_pages = len(results.get('pages', []))

        # Single-image input: skip per-page output (the full doc_name.md exists)
        if not is_pdf and total_pages == 1:
            logger.debug("📝 Single image input, skipping page markdown (full version exists)")
            return saved_paths

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            # Choose the file name based on the input type
            if is_pdf or total_pages > 1:
                page_name = f"{doc_name}_page_{page_idx + 1:03d}"
            else:
                page_name = doc_name

            # Render the single-page Markdown
            md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)

            # Normalize financial numbers
            if normalize_numbers:
                original_content = md_content
                md_content = normalize_markdown_table(md_content)

                if md_content != original_content:
                    original_path = output_dir / f"{page_name}_original.md"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.debug(f"📝 Original page Markdown saved: {original_path}")

            # Save
            md_path = output_dir / f"{page_name}.md"
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(md_content)

            saved_paths.append(str(md_path))
            logger.debug(f"📝 Page Markdown saved: {md_path}")

        if saved_paths:
            logger.info(f"📝 {len(saved_paths)} page Markdowns saved")

        return saved_paths

    @staticmethod
    def _generate_header(results: Dict[str, Any]) -> str:
        """Generate the Markdown file header (an HTML comment with metadata)."""
        return f"""<!--
scene: {results.get('scene', 'unknown')}
document: {results.get('document_path', '')}
pages: {len(results.get('pages', []))}
-->
"""

    @staticmethod
    def _generate_full_markdown(results: Dict[str, Any]) -> str:
        """
        Generate the full-document Markdown (custom implementation).

        Ensures every element type is handled, including table_caption,
        table_footnote, and friends.

        Args:
            results: processing results

        Returns:
            Markdown content string
        """
        md_lines = [
            f"<!-- ",
            f"scene: {results.get('scene', 'unknown')}",
            f"document: {results.get('document_path', '')}",
            f"pages: {len(results.get('pages', []))}",
            f"-->",
            "",
        ]

        for page in results.get('pages', []):
            # Process elements in reading order
            for element in page.get('elements', []):
                elem_type = element.get('type', '')
                content = element.get('content', {})

                if elem_type == 'title':
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    level = element.get('level', 1)
                    if text:
                        md_lines.append(f"{'#' * min(level, 6)} {text}")
                        md_lines.append("")

                elif elem_type in ['text', 'ocr_text', 'ref_text']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        md_lines.append(text)
                        md_lines.append("")

                elif elem_type in ['table', 'table_body']:
                    # Tables are emitted as raw HTML (keeps cell bbox attributes)
                    html = content.get('html', '')
                    if html:
                        md_lines.append(f"\n{html}\n")
                        md_lines.append("")

                elif elem_type in ['image', 'image_body', 'figure']:
                    img_filename = content.get('image_path', '')
                    if img_filename:
                        md_lines.append(f"![](images/{img_filename})")
                        md_lines.append("")

                elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
                    latex = content.get('latex', '')
                    if latex:
                        md_lines.append(f"$$\n{latex}\n$$")
                        md_lines.append("")

                elif elem_type in ['table_caption', 'table_footnote']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        # Bold for captions, italics for footnotes
                        if elem_type == 'table_caption':
                            md_lines.append(f"**{text}**")
                        else:
                            md_lines.append(f"*{text}*")
                        md_lines.append("")

                elif elem_type in ['image_caption', 'image_footnote']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        # Bold for captions, italics for footnotes
                        if elem_type == 'image_caption':
                            md_lines.append(f"**{text}**")
                        else:
                            md_lines.append(f"*{text}*")
                        md_lines.append("")

        return '\n'.join(md_lines)

    @staticmethod
    def _generate_page_markdown(
        page: Dict[str, Any],
        doc_name: str,
        page_idx: int
    ) -> str:
        """
        Generate the Markdown content for a single page.

        Args:
            page: page data
            doc_name: document name
            page_idx: zero-based page index

        Returns:
            Markdown content string
        """
        md_lines = [
            f"<!--",
            f"document: {doc_name}",
            f"page: {page_idx + 1}",
            f"angle: {page.get('angle', 0)}",
            f"-->",
            "",
        ]

        for element in page.get('elements', []):
            elem_type = element.get('type', '')
            content = element.get('content', {})
            bbox = element.get('bbox', [])
            reading_order = element.get('reading_order', 0)

            # Emit a positional comment before each element
            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")

            if elem_type == 'title':
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                level = element.get('level', 1)
                md_lines.append(f"{'#' * min(level, 6)} {text}")
                md_lines.append("")

            elif elem_type in ['text', 'ocr_text', 'ref_text']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    md_lines.append(text)
                    md_lines.append("")

            elif elem_type in ['table', 'table_body']:
                # Inline captions stored on the table element itself
                table_captions = content.get('table_caption', [])
                if isinstance(table_captions, str):
                    table_captions = [table_captions] if table_captions else []
                for caption in table_captions:
                    md_lines.append(f"**{caption}**")

                html = content.get('html', '')
                if html:
                    md_lines.append(f"\n{html}\n")
                md_lines.append("")

            elif elem_type in ['image', 'image_body', 'figure']:
                img_filename = content.get('image_path', '')
                if img_filename:
                    md_lines.append(f"![](images/{img_filename})")
                    md_lines.append("")

            elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
                latex = content.get('latex', '')
                if latex:
                    md_lines.append(f"$$\n{latex}\n$$")
                    md_lines.append("")

            elif elem_type in ['table_caption', 'table_footnote']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    # Bold for table captions, italics for table footnotes
                    if elem_type == 'table_caption':
                        md_lines.append(f"**{text}**")
                    else:
                        md_lines.append(f"*{text}*")
                    md_lines.append("")

            elif elem_type in ['image_caption', 'image_footnote']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    # Bold for image captions, italics for image footnotes
                    if elem_type == 'image_caption':
                        md_lines.append(f"**{text}**")
                    else:
                        md_lines.append(f"*{text}*")
                    md_lines.append("")

            elif elem_type == 'discarded':
                text = content.get('text', '') if isinstance(content, dict) else ''
                if text:
                    md_lines.append(f"<!-- [discarded: {element.get('original_category', 'unknown')}] {text} -->")
                    md_lines.append("")

        # Render discarded blocks as comments so nothing is silently lost
        for element in page.get('discarded_blocks', []):
            content = element.get('content', {})
            bbox = element.get('bbox', [])
            reading_order = element.get('reading_order', 0)
            original_category = element.get('original_category', 'unknown')

            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
            text = content.get('text', '') if isinstance(content, dict) else ''
            if text:
                md_lines.append(f"<!-- [discarded: {original_category}] {text} -->")
            else:
                md_lines.append(f"<!-- [discarded: {original_category}] (no text) -->")
            md_lines.append("")

        return '\n'.join(md_lines)
+

+ 269 - 0
ocr_utils/normalize_financial_numbers.py

@@ -0,0 +1,269 @@
+import re
+import os
+from pathlib import Path
+
def normalize_financial_numbers(text: str) -> str:
    """
    Normalize financial numbers: convert full-width characters to half-width
    and collapse stray whitespace inside digit sequences.

    Args:
        text: raw text

    Returns:
        Normalized text
    """
    if not text:
        return text

    # Full-width -> half-width character table (digits plus common punctuation)
    translation = str.maketrans({
        '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
        '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
        ',': ',',  # full-width comma
        '。': '.',  # ideographic full stop
        '.': '.',  # full-width period
        ':': ':',  # full-width colon
        ';': ';',  # full-width semicolon
        '(': '(',  # full-width left paren
        ')': ')',  # full-width right paren
        '-': '-',  # full-width minus
        '+': '+',  # full-width plus
        '%': '%',  # full-width percent
    })

    # Step 1: single-pass character substitution
    result = text.translate(translation)

    # Step 2: clean up whitespace around separators within digit sequences.
    # Matches: digits + (space? separator space? digits)* + optional decimal part
    sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'

    def _tidy_sequence(match):
        seq = match.group(1)
        # "1 , 234" -> "1,234"
        seq = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', seq)
        # "234 . 5" -> "234.5"
        seq = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', seq)
        return seq

    return re.sub(sequence_pattern, _tidy_sequence, result)
+    
def normalize_markdown_table(markdown_content: str) -> str:
    """
    Normalize numbers inside HTML tables embedded in Markdown.

    Note: line breaks of the original markdown are preserved; only the text
    inside table cells is rewritten.

    Args:
        markdown_content: Markdown content

    Returns:
        Normalized Markdown content
    """
    # bs4 is imported lazily so the module stays importable without it.
    # Fix vs. original: NavigableString is imported once here instead of
    # inside the per-cell loop, the unused `replacements` list is removed,
    # and the redundant local `import re` (re is a module-level import) is gone.
    from bs4 import BeautifulSoup, Tag
    from bs4.element import NavigableString

    # Locate complete HTML tables; everything outside them is untouched.
    table_pattern = r'(<table[^>]*>.*?</table>)'

    def normalize_table_match(match):
        """Normalize one table match while keeping its original formatting."""
        table_html = match.group(1)
        original_table_html = table_html  # kept for the no-op comparison below

        soup = BeautifulSoup(table_html, 'html.parser')

        for table in soup.find_all('table'):
            if not isinstance(table, Tag):
                continue
            for cell in table.find_all(['td', 'th']):
                if not isinstance(cell, Tag):
                    continue
                # Cheap gate: skip cells whose full text is already normalized.
                original_text = cell.get_text()
                if normalize_financial_numbers(original_text) == original_text:
                    continue
                # Replace each text node, preserving leading/trailing whitespace.
                for text_node in cell.find_all(string=True, recursive=True):
                    if isinstance(text_node, NavigableString):
                        text_str = str(text_node)
                        if text_str.strip():
                            normalized = normalize_financial_numbers(text_str.strip())
                            if normalized != text_str.strip():
                                if text_str.strip() == text_str:
                                    # Pure text node: replace directly
                                    text_node.replace_with(normalized)
                                else:
                                    # Keep the surrounding whitespace intact
                                    leading_ws = text_str[:len(text_str) - len(text_str.lstrip())]
                                    trailing_ws = text_str[len(text_str.rstrip()):]
                                    text_node.replace_with(leading_ws + normalized + trailing_ws)

        modified_html = str(soup)

        # If only whitespace/formatting differs, return the original HTML so
        # its line breaks are preserved.
        original_text_only = re.sub(r'\s+', '', original_table_html)
        modified_text_only = re.sub(r'\s+', '', modified_html)
        if original_text_only == modified_text_only:
            return original_table_html

        # Real content changed: use the re-serialized HTML
        return modified_html

    # Replace only table spans; the rest of the markdown (incl. newlines) stays
    return re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
+
def normalize_json_table(json_content: str) -> str:
    """
    Normalize table numbers inside JSON-formatted OCR results.

    Expected json_content shape::

        [
            {"category": "Table", "text": "<table>...</table>"},
            {"category": "Text", "text": "Some other text"}
        ]

    (Fix vs. original: the example above used to be a second, free-standing
    triple-quoted string — a no-op statement, not documentation — and is now
    part of the docstring.)

    Args:
        json_content: OCR results as a JSON string (an already-parsed list
            is also accepted)

    Returns:
        Normalized JSON content; the input is returned unchanged on parse
        errors or unexpected shapes.
    """
    import json

    try:
        # Accept either a JSON string or an already-parsed structure
        data = json.loads(json_content) if isinstance(json_content, str) else json_content

        # Only list-shaped payloads are handled
        if not isinstance(data, list):
            return json_content

        for item in data:
            if not isinstance(item, dict):
                continue

            # Only table items carry HTML that needs cell-level rewriting
            if item.get('category') == 'Table' and 'text' in item:
                # Imported lazily so non-table payloads work without bs4
                from bs4 import BeautifulSoup, Tag

                soup = BeautifulSoup(item['text'], 'html.parser')
                for table in soup.find_all('table'):
                    if isinstance(table, Tag):
                        for cell in table.find_all(['td', 'th']):
                            if isinstance(cell, Tag):
                                original_text = cell.get_text()
                                normalized_text = normalize_financial_numbers(original_text)
                                # Only touch cells that actually changed
                                if original_text != normalized_text:
                                    cell.string = normalized_text

                # Write the rewritten table back into the item
                item['text'] = str(soup)

            # Plain-text items could be normalized here as well if ever needed.

        # Return the normalized structure as a JSON string
        return json.dumps(data, ensure_ascii=False, indent=2)

    except json.JSONDecodeError as e:
        print(f"⚠️ JSON解析失败: {e}")
        return json_content
    except Exception as e:
        print(f"⚠️ JSON表格标准化失败: {e}")
        return json_content
+
def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
    """
    Normalize table numbers in a JSON file.

    Args:
        file_path: input JSON file path
        output_path: output file path; overwrites the input file when None

    Returns:
        The normalized JSON content
    """
    src = Path(file_path)
    dst = Path(output_path) if output_path else src

    if not src.exists():
        raise FileNotFoundError(f"找不到文件: {file_path}")

    # Load the original content
    original_content = src.read_text(encoding='utf-8')

    print(f"🔧 正在标准化JSON文件: {src.name}")

    # Normalize
    normalized_content = normalize_json_table(original_content)

    # Persist the normalized result
    dst.write_text(normalized_content, encoding='utf-8')

    # Count positional character differences (over the common prefix length)
    changes = sum(1 for o, n in zip(original_content, normalized_content) if o != n)
    if changes > 0:
        print(f"✅ 标准化了 {changes} 个字符")

        # When writing to a different path, keep a copy of the original too
        if output_path and output_path != file_path:
            original_backup = Path(output_path).parent / f"{Path(output_path).stem}_original.json"
            original_backup.write_text(original_content, encoding='utf-8')
            print(f"📄 原始版本已保存到: {original_backup}")
    else:
        print("ℹ️ 无需标准化(已是标准格式)")

    print(f"📄 标准化结果已保存到: {dst}")
    return normalized_content
+

+ 284 - 0
ocr_utils/output_formatter_v2.py

@@ -0,0 +1,284 @@
+"""
+统一输出格式化器 v2
+
+严格遵循 MinerU mineru_vllm_results_cell_bbox 格式
+
+支持:
+1. MinerU 标准 middle.json 格式(用于 union_make 生成 Markdown)
+2. mineru_vllm_results_cell_bbox 格式(每页独立 JSON)
+3. Markdown 输出(复用 MinerU union_make)
+4. Debug 模式:layout 图片、OCR 图片
+5. 表格 HTML 输出(带坐标信息)
+6. 金额数字标准化(全角→半角转换)
+
+模块结构:
+- json_formatters.py: JSON 格式化工具
+- markdown_generator.py: Markdown 生成器
+- html_generator.py: HTML 生成器
+- visualization_utils.py: 可视化工具
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+
+# 导入子模块
+from .json_formatters import JSONFormatters
+from .markdown_generator import MarkdownGenerator
+from .html_generator import HTMLGenerator
+from .visualization_utils import VisualizationUtils
+
+# 导入数字标准化工具
+from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
+
+
class OutputFormatterV2:
    """
    Unified output formatter.

    Strictly follows the MinerU mineru_vllm_results_cell_bbox format:
    - middle.json: MinerU standard format, used to generate Markdown
    - page_xxx.json: one JSON per page, including table_cells
    - Markdown: with bbox comments
    - Tables: HTML with data-bbox attributes

    Naming rules:
    - PDF input: <name>_page_001.* (numbered per page)
    - Image input: <name>.* (no page suffix)
    """

    # Color maps (re-exported for use by other modules)
    COLOR_MAP = VisualizationUtils.COLOR_MAP
    OCR_BOX_COLOR = VisualizationUtils.OCR_BOX_COLOR
    CELL_BOX_COLOR = VisualizationUtils.CELL_BOX_COLOR

    def __init__(self, output_dir: str):
        """
        Initialize the formatter.

        Args:
            output_dir: output directory (created if missing)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def is_pdf_input(results: Dict[str, Any]) -> bool:
        """
        Decide whether the input was a PDF.

        Args:
            results: processing results

        Returns:
            True if the input is a PDF, otherwise False
        """
        doc_path = results.get('document_path', '')
        if doc_path:
            return Path(doc_path).suffix.lower() == '.pdf'

        # No document_path: fall back to the metadata hint
        input_type = results.get('metadata', {}).get('input_type', '')
        return input_type == 'pdf'

    @staticmethod
    def get_page_name(doc_name: str, page_idx: int, is_pdf: bool, total_pages: int = 1) -> str:
        """
        Build the per-page base name.

        Args:
            doc_name: document name
            page_idx: zero-based page index
            is_pdf: whether the input is a PDF
            total_pages: total number of pages

        Returns:
            Page name (without extension)
        """
        if is_pdf or total_pages > 1:
            # PDF or multi-page input: append a page suffix
            return f"{doc_name}_page_{page_idx + 1:03d}"
        else:
            # Single image: no page suffix
            return doc_name

    def save_results(
        self,
        results: Dict[str, Any],
        output_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Persist the processing results.

        Naming rules:
        - PDF input: <name>_page_001.* (numbered per page)
        - Image input: <name>.* (no page suffix)

        Args:
            results: processing results
            output_config: output options, including:
                - create_subdir: create a per-document subdirectory (default False)
                - ... see save_mineru_format for the remaining keys

        Returns:
            Mapping of output kinds to file paths
        """
        output_paths: Dict[str, Any] = {
            'images': [],
            'json_pages': [],
        }

        # Per-document output directory
        doc_name = Path(results['document_path']).stem

        # Optional subdirectory (default: write straight into output_dir)
        create_subdir = output_config.get('create_subdir', False)
        if create_subdir:
            doc_output_dir = self.output_dir / doc_name
        else:
            doc_output_dir = self.output_dir
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # Determine the input type
        is_pdf = self.is_pdf_input(results)
        total_pages = len(results.get('pages', []))

        # Create the images subdirectory
        images_dir = doc_output_dir / 'images'
        images_dir.mkdir(exist_ok=True)

        # 1. Save image elements first (sets image_path on elements)
        image_paths = VisualizationUtils.save_image_elements(
            results, images_dir, doc_name, is_pdf=is_pdf
        )
        if image_paths:
            output_paths['images'] = image_paths

        # 2. Convert to the MinerU middle.json structure
        middle_json = JSONFormatters.convert_to_middle_json(results)

        # 3. Save middle.json
        if output_config.get('save_json', True):
            json_path = doc_output_dir / f"{doc_name}_middle.json"
            json_content = json.dumps(middle_json, ensure_ascii=False, indent=2)

            # Normalize financial numbers (full-width -> half-width)
            normalize_numbers = output_config.get('normalize_numbers', True)
            if normalize_numbers:
                original_content = json_content
                json_content = normalize_json_table(json_content)

                # Keep a pre-normalization copy if anything changed
                if json_content != original_content:
                    original_path = doc_output_dir / f"{doc_name}_middle_original.json"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.info(f"📄 Original middle JSON saved: {original_path}")
                    output_paths['middle_json_original'] = str(original_path)

            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(json_content)
            output_paths['middle_json'] = str(json_path)
            logger.info(f"📄 Middle JSON saved: {json_path}")

        # 4. Save per-page mineru_vllm_results_cell_bbox JSON files
        if output_config.get('save_page_json', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            page_json_paths = JSONFormatters.save_page_jsons(
                results, doc_output_dir, doc_name, is_pdf=is_pdf,
                normalize_numbers=normalize_numbers
            )
            output_paths['json_pages'] = page_json_paths

        # 5. Save the full-document Markdown
        if output_config.get('save_markdown', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            md_path, original_md_path = MarkdownGenerator.save_markdown(
                results, middle_json, doc_output_dir, doc_name,
                normalize_numbers=normalize_numbers
            )
            output_paths['markdown'] = str(md_path)
            if original_md_path:
                output_paths['markdown_original'] = str(original_md_path)

        # 5.5 Save per-page Markdown files
        if output_config.get('save_page_markdown', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            page_md_paths = MarkdownGenerator.save_page_markdowns(
                results, doc_output_dir, doc_name, is_pdf=is_pdf,
                normalize_numbers=normalize_numbers
            )
            output_paths['markdown_pages'] = page_md_paths

        # 6. Save table HTML files
        if output_config.get('save_html', True):
            html_dir = HTMLGenerator.save_table_htmls(
                results, doc_output_dir, doc_name, is_pdf=is_pdf
            )
            output_paths['table_htmls'] = str(html_dir)

        # 7. Debug mode: save visualization images
        if output_config.get('save_layout_image', False):
            layout_paths = VisualizationUtils.save_layout_images(
                results, doc_output_dir, doc_name,
                draw_type_label=output_config.get('draw_type_label', True),
                draw_bbox_number=output_config.get('draw_bbox_number', True),
                is_pdf=is_pdf
            )
            output_paths['layout_images'] = layout_paths

        if output_config.get('save_ocr_image', False):
            ocr_paths = VisualizationUtils.save_ocr_images(
                results, doc_output_dir, doc_name, is_pdf=is_pdf
            )
            output_paths['ocr_images'] = ocr_paths

        logger.info(f"✅ All results saved to: {doc_output_dir}")
        return output_paths
+
+
+# ==================== 便捷函数 ====================
+
def save_mineru_format(
    results: Dict[str, Any],
    output_dir: str,
    output_config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Convenience wrapper: persist pipeline results in MinerU format.

    Args:
        results: pipeline processing results
        output_dir: output directory
        output_config: output options; supported keys:
            - create_subdir: create a per-document subdirectory (default False)
            - save_json: save middle.json
            - save_page_json: save per-page JSON
            - save_markdown: save the full Markdown
            - save_page_markdown: save per-page Markdown
            - save_html: save table HTML
            - save_layout_image: save layout visualizations
            - save_ocr_image: save OCR visualizations
            - normalize_numbers: normalize financial numbers (full->half width)

    Returns:
        Mapping of output kinds to file paths
    """
    if output_config is None:
        # Defaults: write straight into output_dir, produce every textual
        # artifact, skip debug images, and normalize numbers.
        output_config = {
            'create_subdir': False,
            'save_json': True,
            'save_page_json': True,
            'save_markdown': True,
            'save_page_markdown': True,
            'save_html': True,
            'save_layout_image': False,
            'save_ocr_image': False,
            'normalize_numbers': True,
        }

    return OutputFormatterV2(output_dir).save_results(results, output_config)
+

+ 223 - 0
ocr_utils/pdf_extractor.py

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""
+PDF页面提取工具
+
+从PDF文件中提取指定页面并保存为新PDF文件。
+
+使用方法:
+    python pdf_extractor.py input.pdf --pages "1-5,7,9-12" --output output.pdf
+    python pdf_extractor.py input.pdf --pages "1-" --output output.pdf  # 提取第1页到最后
+    python pdf_extractor.py input.pdf --pages "-10" --output output.pdf  # 提取前10页
+"""
+import argparse
+import sys
+from pathlib import Path
+from typing import List
+import io
+
+try:
+    import pypdfium2 as pdfium
+    PDFIUM_AVAILABLE = True
+except ImportError:
+    PDFIUM_AVAILABLE = False
+    pdfium = None
+
+from loguru import logger
+from .pdf_utils import PDFUtils
+
+
def extract_pdf_pages(
    input_pdf_path: Path,
    page_indices: List[int],
    output_pdf_path: Path
) -> bool:
    """
    Extract the given pages from a PDF and save them as a new PDF.

    Args:
        input_pdf_path: Path to the source PDF file.
        page_indices: 0-based page indices to extract (deduplicated and
            sorted internally; out-of-range indices are skipped with a warning).
        output_pdf_path: Path of the PDF file to write.

    Returns:
        True on success, False otherwise (errors are logged, not raised).
    """
    if not PDFIUM_AVAILABLE:
        logger.error("❌ pypdfium2 未安装,请先安装: pip install pypdfium2")
        return False

    if not input_pdf_path.exists():
        logger.error(f"❌ 输入文件不存在: {input_pdf_path}")
        return False

    if not input_pdf_path.suffix.lower() == '.pdf':
        logger.error(f"❌ 输入文件不是PDF格式: {input_pdf_path}")
        return False

    pdf = None
    output_pdf = None
    try:
        # Load the source PDF from bytes.
        with open(input_pdf_path, 'rb') as f:
            pdf_bytes = f.read()
        pdf = pdfium.PdfDocument(pdf_bytes)
        total_pages = len(pdf)

        if total_pages == 0:
            logger.error("❌ PDF文件为空")
            return False

        # Validate page indices: deduplicate, sort, drop out-of-range entries.
        valid_indices = []
        for idx in sorted(set(page_indices)):
            if 0 <= idx < total_pages:
                valid_indices.append(idx)
            else:
                logger.warning(f"⚠️  页面索引 {idx + 1} 超出范围(总页数: {total_pages}),已跳过")

        if not valid_indices:
            logger.error("❌ 没有有效的页面可提取")
            return False

        # Build the output document by importing pages one at a time so a
        # single failing page does not abort the whole extraction.
        output_pdf = pdfium.PdfDocument.new()
        success_count = 0
        for page_idx in valid_indices:
            try:
                output_pdf.import_pages(pdf, pages=[page_idx])
                success_count += 1
            except Exception as e:
                logger.warning(f"⚠️  导入第 {page_idx + 1} 页失败: {e},已跳过")
                continue

        if success_count == 0:
            logger.error("❌ 没有成功导入任何页面")
            return False

        output_pdf_path.parent.mkdir(parents=True, exist_ok=True)

        # Serialize to an in-memory buffer first, then write atomically-ish.
        output_buffer = io.BytesIO()
        output_pdf.save(output_buffer)
        with open(output_pdf_path, 'wb') as f:
            f.write(output_buffer.getvalue())

        logger.info(f"✅ 成功提取 {success_count} 页到: {output_pdf_path}")
        logger.info(f"   提取的页面: {', '.join([str(idx + 1) for idx in valid_indices])}")
        return True

    except Exception as e:
        logger.error(f"❌ 提取PDF页面时出错: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return False
    finally:
        # Fix: previously the documents were closed only on a few branches and
        # leaked on exception paths; always release pdfium resources here.
        if output_pdf is not None:
            output_pdf.close()
        if pdf is not None:
            pdf.close()
+
+
def main():
    """Command-line entry point: parse args, resolve paths, run extraction."""
    parser = argparse.ArgumentParser(
        description='从PDF文件中提取指定页面并保存为新PDF文件',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 提取第1-5页和第7页
  python pdf_extractor.py --input input.pdf --pages "1-5,7" --output output.pdf
  
  # 提取第1页到最后
  python pdf_extractor.py --input input.pdf --pages "1-" --output output.pdf
  
  # 提取前10页
  python pdf_extractor.py --input input.pdf --pages "-10" --output output.pdf
  
  # 提取单页
  python pdf_extractor.py --input input.pdf --pages "3" --output output.pdf
        """
    )

    parser.add_argument(
        '--input', '-i',
        type=str,
        # Fix: was optional — omitting it crashed later with Path(None)
        # (TypeError) instead of a proper argparse usage error.
        required=True,
        help='输入PDF文件路径'
    )

    parser.add_argument(
        '--pages', '-p',
        type=str,
        required=True,
        help='要提取的页面范围,支持格式: "1-5,7,9-12", "1-", "-10", "3"'
    )

    parser.add_argument(
        '--output', '-o',
        type=str,
        required=True,
        help='输出PDF文件路径'
    )

    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    output_path = Path(args.output).resolve()

    # If the output path is a directory (or lacks a .pdf suffix), derive a
    # file name from the input file's stem.
    if output_path.is_dir() or not output_path.suffix.lower() == '.pdf':
        output_path = output_path / f"{input_path.stem}_extracted.pdf"

    if not PDFIUM_AVAILABLE:
        logger.error("❌ pypdfium2 未安装,请先安装: pip install pypdfium2")
        sys.exit(1)

    # Open the PDF once up front to learn the total page count, which
    # parse_page_range needs to resolve open-ended ranges like "1-".
    try:
        with open(input_path, 'rb') as f:
            pdf_bytes = f.read()
        pdf = pdfium.PdfDocument(pdf_bytes)
        total_pages = len(pdf)
        pdf.close()
    except Exception as e:
        logger.error(f"❌ 无法读取PDF文件: {e}")
        sys.exit(1)

    page_set = PDFUtils.parse_page_range(args.pages, total_pages)
    page_indices = sorted(list(page_set))

    if not page_indices:
        logger.error(f"❌ 页面范围 '{args.pages}' 没有匹配到任何有效页面(总页数: {total_pages})")
        sys.exit(1)

    logger.info(f"📋 PDF总页数: {total_pages}")
    logger.info(f"📋 要提取的页面: {args.pages} → {len(page_indices)} 页")

    success = extract_pdf_pages(input_path, page_indices, output_path)

    if success:
        logger.info(f"✅ 提取完成!输出文件: {output_path}")
        sys.exit(0)
    else:
        logger.error("❌ 提取失败")
        sys.exit(1)
+
+
if __name__ == '__main__':
    # main() always terminates via sys.exit() itself, so this outer
    # sys.exit(main()) is normally never reached with a value; it is a
    # harmless fallback (exit code 0) if main() ever returns.
    sys.exit(main())
+

+ 294 - 0
ocr_utils/pdf_utils.py

@@ -0,0 +1,294 @@
+"""
+PDF处理工具模块
+
+提供PDF相关处理功能:
+- PDF加载与分类
+- PDF文本提取
+- 跨页表格合并
+- 页面范围解析与过滤
+"""
+from typing import Dict, List, Any, Optional, Tuple, Set
+from pathlib import Path
+from PIL import Image
+from loguru import logger
+import re
+
# Import MinerU components; degrade gracefully when they are missing.
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.pdf_image_tools import load_images_from_pdf
    from mineru.utils.enum_class import ImageType
    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
    MINERU_AVAILABLE = True
except ImportError:
    # Fix: the original `except ImportError: raise ImportError(...)` made
    # MINERU_AVAILABLE unreachable as False, so every downstream
    # `if not MINERU_AVAILABLE` / `pdf_get_page_text is None` check was dead
    # and importing this module without MinerU crashed even for image-only
    # use. Mirror the optional-dependency pattern used by pdf_extractor.py:
    # set the flag and stub the names; PDF-specific entry points raise later.
    MINERU_AVAILABLE = False
    pdf_classify = None
    load_images_from_pdf = None
    ImageType = None
    pdf_get_page_text = None
+
class PDFUtils:
    """PDF processing utilities.

    Provides:
    - page-range parsing and filtering
    - document loading with OCR/text-layer classification
    - direct text extraction from a PDF's text layer
    - (placeholder) cross-page table merging
    """

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Parse a page-range string into a set of 0-based page indices.

        Supported formats (page numbers are 1-based):
        - "1-5"        -> {0, 1, 2, 3, 4}
        - "3"          -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-"         -> from page 1 to the last page
        - "-5"         -> from page 1 to page 5

        Args:
            page_range: Range string (1-based page numbers); None or empty
                selects all pages.
            total_pages: Total number of pages in the document.

        Returns:
            Set of selected page indices (0-based). Out-of-range pages are
            clamped/dropped; malformed parts are skipped with a warning.
        """
        if not page_range or not page_range.strip():
            return set(range(total_pages))

        pages = set()
        parts = page_range.replace(' ', '').split(',')

        for part in parts:
            part = part.strip()
            if not part:
                continue

            if '-' in part:
                # Range form: "a-b", "a-", or "-b".
                match = re.match(r'^(\d*)-(\d*)$', part)
                if match:
                    start_str, end_str = match.groups()
                    start = int(start_str) if start_str else 1
                    end = int(end_str) if end_str else total_pages

                    # Convert to a 0-based half-open range clamped to bounds.
                    start = max(0, start - 1)
                    end = min(total_pages, end)

                    pages.update(range(start, end))
                else:
                    # Fix: malformed ranges (e.g. "a-b") were silently dropped
                    # while malformed single pages logged a warning — warn here
                    # too for consistency.
                    logger.warning(f"Invalid page range: {part}")
            else:
                # Single page form: "3".
                try:
                    page_num = int(part)
                    if 1 <= page_num <= total_pages:
                        pages.add(page_num - 1)  # convert to 0-based index
                except ValueError:
                    logger.warning(f"Invalid page number: {part}")

        return pages

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None
    ) -> Tuple[List[Dict], str, Optional[Any]]:
        """
        Load a document and classify it, with optional page-range filtering.

        Args:
            document_path: Path to a PDF file, a single image, or a directory
                of images.
            dpi: Rendering DPI for PDF pages.
            page_range: Range string such as "1-5,7,9-12".
                - PDF: by page number (1-based)
                - image directory: by position after sorting file names (1-based)

        Returns:
            (images_list, pdf_type, pdf_doc)
            - images_list: one dict per page with keys
              {'img_pil': PIL.Image, 'scale': float, 'source_path': str,
               'page_idx': int, and 'page_name' for image inputs}
            - pdf_type: 'ocr' or 'txt'
            - pdf_doc: the PDF document object (PDF input only, else None)

        Raises:
            RuntimeError: if a PDF is given but MinerU components are unavailable.
            ValueError: for unsupported file extensions.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # default to OCR mode
        all_images = []

        if document_path.is_dir():
            # Directory input: collect image files in sorted name order.
            image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
            image_files = sorted([
                f for f in document_path.iterdir() 
                if f.suffix.lower() in image_extensions
            ])

            total_pages = len(image_files)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                img = Image.open(img_file)
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                all_images.append({
                    'img_pil': img,
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,  # original position index
                    'page_name': img_file.stem  # file name without extension
                })

            pdf_type = 'ocr'  # image directories always use OCR mode

        elif document_path.suffix.lower() == '.pdf':
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            with open(document_path, 'rb') as f:
                pdf_bytes = f.read()

            # Decide between text-layer extraction ('txt') and OCR ('ocr').
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # Render pages to PIL images.
            images_list, pdf_doc = load_images_from_pdf(
                pdf_bytes, 
                dpi=dpi,
                image_type=ImageType.PIL
            )

            total_pages = len(images_list)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to the nominal render scale when absent.
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx  # original page index
                })

        elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
            # Single image input.
            img = Image.open(document_path)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            all_images.append({
                'img_pil': img,
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })
            pdf_type = 'ocr'

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """
        Extract text directly from the PDF text layer (via MinerU / pypdfium2).

        Args:
            pdf_doc: pypdfium2 PdfDocument object.
            page_idx: Page index.
            bbox: Target region bbox in image coordinates.
            scale: Image-to-PDF scale factor (image coords / scale = PDF coords).

        Returns:
            (text, success) — success is False when extraction fails or yields
            no text; errors are logged at debug level, never raised.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            logger.debug("MinerU pdf_text_tool not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Convert the image-space bbox to PDF coordinates.
            pdf_bbox = [
                bbox[0] / scale,
                bbox[1] / scale,
                bbox[2] / scale,
                bbox[3] / scale
            ]

            # Page text structure as produced by MinerU's pdf_text_tool.
            page_dict = pdf_get_page_text(page)

            # Collect text from lines whose bbox overlaps the target region.
            text_parts = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    line_bbox = line.get('bbox')
                    if line_bbox and hasattr(line_bbox, 'bbox'):
                        line_bbox = line_bbox.bbox  # pdftext BBox object
                    elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
                        line_bbox = list(line_bbox)
                    else:
                        continue

                    if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
                        for span in line.get('spans', []):
                            span_text = span.get('text', '')
                            if span_text:
                                text_parts.append(span_text)

            text = ' '.join(text_parts)
            return text.strip(), bool(text.strip())

        except Exception as e:
            import traceback
            logger.debug(f"PDF text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Return True when two [x0, y0, x1, y1] boxes overlap (edges touching counts)."""
        if len(bbox1) < 4 or len(bbox2) < 4:
            return False

        x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
        x1_2, y1_2, x2_2, y2_2 = bbox2[:4]

        # Separated horizontally or vertically -> no overlap.
        if x2_1 < x1_2 or x2_2 < x1_1:
            return False
        if y2_1 < y1_2 or y2_2 < y1_1:
            return False

        return True

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge tables that span page boundaries.

        TODO: not implemented yet — currently a pass-through. A future
        implementation can follow MinerU's cross_page_table_merge.

        Args:
            results: Processing results dictionary.

        Returns:
            The results, unchanged for now.
        """
        # TODO: implement cross-page table merging
        return results
+

+ 436 - 0
ocr_utils/visualization_utils.py

@@ -0,0 +1,436 @@
+"""
+可视化工具模块
+
+提供文档处理结果的可视化功能:
+- Layout 布局可视化
+- OCR 结果可视化
+- 图片元素保存
+"""
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import cv2
+from loguru import logger
+
+
class VisualizationUtils:
    """Visualization helpers for document-processing results.

    Provides:
    - layout visualization (colored boxes per element type)
    - OCR result visualization (text boxes, table cells, discarded blocks)
    - saving of extracted image elements to disk
    """

    # Color map (kept in sync with MinerU BlockType / EnhancedDocPipeline categories).
    COLOR_MAP = {
        # Text-like elements (TEXT_CATEGORIES)
        'title': (102, 102, 255),           # blue
        'text': (153, 0, 76),               # dark red
        'ocr_text': (153, 0, 76),           # dark red (same as text)
        'low_score_text': (200, 100, 100),  # light red
        'header': (128, 128, 128),          # gray
        'footer': (128, 128, 128),          # gray
        'page_number': (160, 160, 160),     # light gray
        'ref_text': (180, 180, 180),        # light gray
        'aside_text': (180, 180, 180),      # light gray
        'page_footnote': (200, 200, 200),   # light gray
        
        # Table-related elements
        'table': (204, 204, 0),             # yellow
        'table_body': (204, 204, 0),        # yellow
        'table_caption': (255, 255, 102),   # light yellow
        'table_footnote': (229, 255, 204),  # pale yellow-green
        
        # Image-related elements
        'image': (153, 255, 51),            # green
        'image_body': (153, 255, 51),       # green
        'figure': (153, 255, 51),           # green
        'image_caption': (102, 178, 255),   # light blue
        'image_footnote': (255, 178, 102),  # orange
        
        # Formula elements
        'interline_equation': (0, 255, 0),  # bright green
        'inline_equation': (0, 200, 0),     # green
        'equation': (0, 220, 0),            # green
        'interline_equation_yolo': (0, 180, 0),
        'interline_equation_number': (0, 160, 0),
        
        # Code elements
        'code': (102, 0, 204),              # purple
        'code_body': (102, 0, 204),         # purple
        'code_caption': (153, 51, 255),     # light purple
        'algorithm': (128, 0, 255),         # purple
        
        # List elements
        'list': (40, 169, 92),              # teal green
        'index': (60, 180, 100),            # teal green
        
        # Discarded elements
        'abandon': (100, 100, 100),         # dark gray
        'discarded': (100, 100, 100),       # dark gray
        
        # Errors
        'error': (255, 0, 0),               # red
    }
    
    # OCR box colors
    OCR_BOX_COLOR = (0, 255, 0)      # green
    CELL_BOX_COLOR = (255, 165, 0)   # orange
    DISCARD_COLOR = (128, 128, 128)  # gray
    
    @staticmethod
    def save_image_elements(
        results: Dict[str, Any],
        images_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save extracted image elements to disk and rewrite their content paths.

        Naming convention:
        - PDF input: <doc>_page_001_image_1.png
        - single-image input: <doc>_image_1.png

        Side effect: on each saved element, content['image_path'] is set to
        the file name and the in-memory 'image_data' is removed.

        Args:
            results: Processing results (expects results['pages'][i]['elements']).
            images_dir: Directory to write images into (must exist).
            doc_name: Document name used as the file-name prefix.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        saved_paths = []
        image_count = 0
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            
            for element in page.get('elements', []):
                if element.get('type') in ['image', 'image_body', 'figure']:
                    content = element.get('content', {})
                    image_data = content.get('image_data')
                    
                    if image_data is not None:
                        image_count += 1
                        
                        # Multi-page inputs get a page component in the name.
                        if is_pdf or total_pages > 1:
                            image_filename = f"{doc_name}_page_{page_idx + 1}_image_{image_count}.png"
                        else:
                            image_filename = f"{doc_name}_image_{image_count}.png"
                        
                        image_path = images_dir / image_filename
                        
                        try:
                            if isinstance(image_data, np.ndarray):
                                # NOTE(review): cv2.imwrite assumes BGR channel
                                # order — confirm image_data is BGR here,
                                # otherwise saved colors are swapped.
                                cv2.imwrite(str(image_path), image_data)
                            else:
                                Image.fromarray(image_data).save(image_path)
                            
                            # Replace raw pixels with the relative file name.
                            content['image_path'] = image_filename
                            content.pop('image_data', None)
                            
                            saved_paths.append(str(image_path))
                            logger.debug(f"🖼️ Image saved: {image_path}")
                        except Exception as e:
                            logger.warning(f"Failed to save image: {e}")
        
        if image_count > 0:
            logger.info(f"🖼️ {image_count} images saved to: {images_dir}")
        
        return saved_paths
    
    @staticmethod
    def save_layout_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save layout visualization images (one per page).

        Naming convention:
        - PDF input: <doc>_page_001_layout.png
        - single-image input: <doc>_layout.png

        Args:
            results: Processing results.
            output_dir: Output directory (must exist).
            doc_name: Document name used as the file-name prefix.
            draw_type_label: Draw the element-type label in each box.
            draw_bbox_number: Draw the element's ordinal number in each box.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        layout_paths = []
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            # Prefer the untouched original image; fall back to the processed one.
            processed_image = page.get('original_image')
            if processed_image is None:
                processed_image = page.get('processed_image')
            
            if processed_image is None:
                logger.warning(f"Page {page_idx}: No image data found for layout visualization")
                continue
            
            if isinstance(processed_image, np.ndarray):
                image = Image.fromarray(processed_image).convert('RGB')
            elif isinstance(processed_image, Image.Image):
                image = processed_image.convert('RGB')
            else:
                continue
            
            draw = ImageDraw.Draw(image, 'RGBA')
            font = VisualizationUtils._get_font(14)
            
            # Draw regular elements: translucent fill + border + labels.
            for idx, element in enumerate(page.get('elements', []), 1):
                elem_type = element.get('type', '')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                color = VisualizationUtils.COLOR_MAP.get(elem_type, (255, 0, 0))
                
                # Translucent fill via alpha compositing; the composite creates
                # a new image, so the Draw handle must be recreated afterwards.
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*color, 50))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # Border
                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
                
                # Type label (top-left corner of the box)
                if draw_type_label:
                    label = elem_type.replace('_', ' ').title()
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=color)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
                
                # Ordinal number (top-right corner of the box)
                if draw_bbox_number:
                    number_text = str(idx)
                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
                    draw.rectangle(bbox_number, fill=(255, 0, 0))
                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
            
            # Draw discarded elements in a muted gray style.
            for idx, element in enumerate(page.get('discarded_blocks', []), 1):
                original_category = element.get('original_category', 'unknown')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                
                # Translucent gray fill
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*VisualizationUtils.DISCARD_COLOR, 30))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # Gray border
                draw.rectangle([x0, y0, x1, y1], outline=VisualizationUtils.DISCARD_COLOR, width=1)
                
                # Type label, prefixed with "D:" for discarded
                if draw_type_label:
                    label = f"D:{original_category}"
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=VisualizationUtils.DISCARD_COLOR)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
            
            # Multi-page inputs get a page component in the name.
            if is_pdf or total_pages > 1:
                layout_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_layout.png"
            else:
                layout_path = output_dir / f"{doc_name}_layout.png"
            
            image.save(layout_path)
            layout_paths.append(str(layout_path))
            logger.info(f"🖼️ Layout image saved: {layout_path}")
        
        return layout_paths
    
    @staticmethod
    def save_ocr_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save OCR visualization images (one per page).

        Naming convention:
        - PDF input: <doc>_page_001_ocr.png
        - single-image input: <doc>_ocr.png

        Args:
            results: Processing results.
            output_dir: Output directory (must exist).
            doc_name: Document name used as the file-name prefix.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        ocr_paths = []
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            # Prefer the untouched original image; fall back to the processed one.
            processed_image = page.get('original_image')
            if processed_image is None:
                processed_image = page.get('processed_image')
            
            if processed_image is None:
                logger.warning(f"Page {page_idx}: No image data found for OCR visualization")
                continue
            
            if isinstance(processed_image, np.ndarray):
                image = Image.fromarray(processed_image).convert('RGB')
            elif isinstance(processed_image, Image.Image):
                image = processed_image.convert('RGB')
            else:
                continue
            
            draw = ImageDraw.Draw(image)
            font = VisualizationUtils._get_font(10)
            
            for element in page.get('elements', []):
                content = element.get('content', {})
                
                # Per-line OCR text boxes
                ocr_details = content.get('ocr_details', [])
                for ocr_item in ocr_details:
                    ocr_bbox = ocr_item.get('bbox', [])
                    if ocr_bbox:
                        VisualizationUtils._draw_polygon(
                            draw, ocr_bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
                        )
                
                # Table cells, annotated with a short text preview
                cells = content.get('cells', [])
                for cell in cells:
                    cell_bbox = cell.get('bbox', [])
                    if cell_bbox and len(cell_bbox) >= 4:
                        x0, y0, x1, y1 = map(int, cell_bbox[:4])
                        draw.rectangle(
                            [x0, y0, x1, y1], 
                            outline=VisualizationUtils.CELL_BOX_COLOR, 
                            width=2
                        )
                        
                        cell_text = cell.get('text', '')[:10]  # preview only
                        if cell_text:
                            draw.text(
                                (x0 + 2, y0 + 2), 
                                cell_text, 
                                fill=VisualizationUtils.CELL_BOX_COLOR, 
                                font=font
                            )
                
                # Raw OCR boxes
                ocr_boxes = content.get('ocr_boxes', [])
                for ocr_box in ocr_boxes:
                    bbox = ocr_box.get('bbox', [])
                    if bbox:
                        VisualizationUtils._draw_polygon(
                            draw, bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
                        )
            
            # OCR boxes of discarded elements, drawn in gray.
            for element in page.get('discarded_blocks', []):
                bbox = element.get('bbox', [0, 0, 0, 0])
                content = element.get('content', {})
                
                if len(bbox) >= 4:
                    x0, y0, x1, y1 = map(int, bbox[:4])
                    draw.rectangle(
                        [x0, y0, x1, y1], 
                        outline=VisualizationUtils.DISCARD_COLOR, 
                        width=1
                    )
                    
                    ocr_details = content.get('ocr_details', [])
                    for ocr_item in ocr_details:
                        ocr_bbox = ocr_item.get('bbox', [])
                        if ocr_bbox:
                            VisualizationUtils._draw_polygon(
                                draw, ocr_bbox, VisualizationUtils.DISCARD_COLOR, width=1
                            )
            
            # Multi-page inputs get a page component in the name.
            if is_pdf or total_pages > 1:
                ocr_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_ocr.png"
            else:
                ocr_path = output_dir / f"{doc_name}_ocr.png"
            
            image.save(ocr_path)
            ocr_paths.append(str(ocr_path))
            logger.info(f"🖼️ OCR image saved: {ocr_path}")
        
        return ocr_paths
    
    @staticmethod
    def _draw_polygon(
        draw: ImageDraw.Draw,
        bbox: List,
        color: Tuple[int, int, int],
        width: int = 1
    ):
        """
        Draw either a polygon (list of points) or an axis-aligned rectangle.

        Args:
            draw: ImageDraw object to draw on.
            bbox: Either a list of (x, y) points (closed automatically) or a
                flat [x0, y0, x1, y1] rectangle.
            color: RGB outline color.
            width: Line width.
        """
        if isinstance(bbox[0], (list, tuple)):
            # Polygon: close the outline by repeating the first point.
            points = [(int(p[0]), int(p[1])) for p in bbox]
            points.append(points[0])
            draw.line(points, fill=color, width=width)
        elif len(bbox) >= 4:
            x0, y0, x1, y1 = map(int, bbox[:4])
            draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    
    @staticmethod
    def _get_font(size: int) -> ImageFont.FreeTypeFont:
        """
        Load a TrueType font, trying common macOS/Linux paths, with a
        bitmap-default fallback.

        Args:
            size: Font size in points.

        Returns:
            A usable font object (FreeType font or PIL's built-in default).
        """
        font_paths = [
            "/System/Library/Fonts/Helvetica.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
        ]
        
        for font_path in font_paths:
            try:
                return ImageFont.truetype(font_path, size)
            except OSError:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. ImageFont.truetype raises
                # OSError when the font file cannot be opened or read.
                continue
        
        return ImageFont.load_default()
+