Bläddra i källkod

feat: Add output formatter v2 and PDF extraction utilities

- Implemented OutputFormatterV2 for unified output formatting, supporting multiple formats including JSON, Markdown, and HTML.
- Introduced pdf_extractor.py for extracting specified pages from PDF files with command-line interface.
- Created pdf_utils.py for PDF processing utilities, including page range parsing and text extraction.
- Developed visualization_utils.py for visualizing document processing results, including layout and OCR visualizations.
zhch158_admin 2 veckor sedan
förälder
incheckning
d810bf44fc

+ 71 - 0
ocr_utils/__init__.py

@@ -0,0 +1,71 @@
"""
OCR utility package.

Aggregates document-processing helpers, including:
- PDF processing utilities
- JSON/Markdown/HTML formatting utilities
- File handling utilities
- Financial-number normalization utilities
"""

from .pdf_utils import PDFUtils
from .json_formatters import JSONFormatters
from .markdown_generator import MarkdownGenerator
from .html_generator import HTMLGenerator
from .visualization_utils import VisualizationUtils
from .output_formatter_v2 import OutputFormatterV2, save_mineru_format
from .pdf_extractor import extract_pdf_pages
from .normalize_financial_numbers import (
    normalize_financial_numbers,
    normalize_json_table,
    normalize_markdown_table,
    normalize_json_file
)
from .file_utils import (
    get_input_files,
    collect_pid_files,
    get_image_files_from_dir,
    get_image_files_from_list,
    get_image_files_from_csv,
    convert_pdf_to_images,
    split_files,
    create_temp_file_list
)
from .log_utils import setup_logging

# Public API of the package; keep in sync with the imports above.
__all__ = [
    # PDF utilities
    'PDFUtils',
    'extract_pdf_pages',
    # JSON formatting
    'JSONFormatters',
    # Markdown generation
    'MarkdownGenerator',
    # HTML generation
    'HTMLGenerator',
    # Visualization
    'VisualizationUtils',
    # Output formatting
    'OutputFormatterV2',
    'save_mineru_format',
    # Number normalization
    'normalize_financial_numbers',
    'normalize_json_table',
    'normalize_markdown_table',
    'normalize_json_file',
    # File utilities
    'get_input_files',
    'collect_pid_files',
    'get_image_files_from_dir',
    'get_image_files_from_list',
    'get_image_files_from_csv',
    'convert_pdf_to_images',
    'split_files',
    'create_temp_file_list',
    # Logging utilities
    'setup_logging',
]

# Package metadata
__version__ = "1.0.0"
__author__ = "zhch158"
+

+ 397 - 0
ocr_utils/file_utils.py

@@ -0,0 +1,397 @@
+"""
+文件处理工具模块
+
+提供文件处理相关功能:
+- 输入文件获取(支持文件/目录/列表/CSV)
+- PDF转图片
+- 文件列表处理
+"""
+import tempfile
+from pathlib import Path
+from typing import List, Tuple
+import json
+import traceback
+from loguru import logger
+
+try:
+    from mineru.utils.pdf_image_tools import load_images_from_pdf
+    from mineru.utils.enum_class import ImageType
+    MINERU_AVAILABLE = True
+except ImportError:
+    MINERU_AVAILABLE = False
+    load_images_from_pdf = None
+    ImageType = None
+
+
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
    """
    Split a list of file paths into ``num_splits`` roughly equal chunks.

    The first ``len(file_list) % num_splits`` chunks each receive one
    extra item, so chunk sizes differ by at most one.

    Args:
        file_list: File paths to distribute.
        num_splits: Desired number of chunks.

    Returns:
        The non-empty chunks; when ``num_splits`` is not positive, the
        whole input wrapped in a single chunk.
    """
    if num_splits <= 0:
        return [file_list]

    base_size, extra = divmod(len(file_list), num_splits)

    result: List[List[str]] = []
    offset = 0
    for index in range(num_splits):
        # The first `extra` chunks absorb the remainder, one item each.
        size = base_size + (1 if index < extra else 0)
        if size > 0:
            result.append(file_list[offset:offset + size])
            offset += size

    return result
+
+
def create_temp_file_list(file_chunk: List[str]) -> str:
    """
    Write the given file paths to a temporary ``.txt`` file, one per line.

    The file is created with ``delete=False`` so it survives after the
    handle closes; the caller is responsible for removing it.

    Args:
        file_chunk: File paths to write.

    Returns:
        Path of the temporary list file.
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        tmp.write("".join(f"{path}\n" for path in file_chunk))
        return tmp.name
+
+
+def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int | None = None) -> List[str]:
+    """
+    从目录获取图像文件列表
+    
+    Args:
+        input_dir: 输入目录
+        pattern: 文件名模式
+        max_files: 最大文件数量限制
+        
+    Returns:
+        图像文件路径列表
+    """
+    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
+    image_files = []
+    
+    for ext in image_extensions:
+        image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
+        image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
+    
+    # 去重并排序
+    image_files = sorted(list(set(str(f) for f in image_files)))
+    
+    # 限制文件数量
+    if max_files:
+        image_files = image_files[:max_files]
+    
+    return image_files
+
+
def get_image_files_from_list(file_list_path: str) -> List[str]:
    """
    Read image paths from a plain-text list file (one path per line).

    Blank lines are ignored.  Paths that do not exist on disk are
    dropped, with a warning echoing at most five of the missing entries.

    Args:
        file_list_path: Path of the text file listing images.

    Returns:
        The subset of listed paths that exist on disk.
    """
    logger.info(f"📄 Reading file list from: {file_list_path}")

    with open(file_list_path, 'r', encoding='utf-8') as handle:
        listed = [line.strip() for line in handle if line.strip()]

    # Partition into existing and missing paths in one pass.
    valid_files: List[str] = []
    missing_files: List[str] = []
    for candidate in listed:
        target = valid_files if Path(candidate).exists() else missing_files
        target.append(candidate)

    if missing_files:
        logger.warning(f"⚠️ Warning: {len(missing_files)} files not found:")
        for lost in missing_files[:5]:  # echo at most five entries
            logger.warning(f"  - {lost}")
        if len(missing_files) > 5:
            logger.warning(f"  ... and {len(missing_files) - 5} more")

    logger.info(f"✅ Found {len(valid_files)} valid files out of {len(listed)} in list")
    return valid_files
+
+
def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
    """
    Read image paths from a status CSV, keeping rows with a given status.

    Expected layout: a header row ``image_path,status`` followed by one
    row per image.  The status comparison is case-insensitive.

    Args:
        csv_file: Path of the CSV file.
        status_filter: Status value to keep (e.g. ``"fail"``).

    Returns:
        Image paths whose status equals ``status_filter``.
    """
    import csv  # local import: keeps the module's top-level deps unchanged

    logger.info(f"📄 Reading image files from CSV: {csv_file}")

    wanted = status_filter.lower()
    image_files: List[str] = []
    # BUG FIX: the old code split lines on "," by hand, which breaks on
    # quoted paths containing commas, and never actually skipped the header
    # row despite the stated format requiring it.  csv.reader parses quoting
    # correctly; next() drops the header.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header row: image_path,status
        for row in reader:
            if len(row) >= 2 and row[1].strip().lower() == wanted:
                image_files.append(row[0])

    return image_files
+
+
def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
    """
    Collect (image_path, status) pairs from a per-process result summary.

    The file is the JSON summary written by a single worker process:

        "results": [
          {
            "image_path": "....jpg",
            "processing_time": ...,
            "success": true,
            "device": "gpu:3",
            "output_json": "...",
            "output_md": "..."
          },
          ...
        ]

    Args:
        pid_output_file: Path of the worker's JSON summary file.

    Returns:
        One ``(image_path, status)`` tuple per entry, where status is
        ``"success"`` when the entry's ``success`` flag is true and
        ``"fail"`` otherwise.  Empty list if the file is missing or has
        an unexpected shape.
    """
    if not Path(pid_output_file).exists():
        logger.warning(f"⚠️ Warning: PID output file not found: {pid_output_file}")
        return []

    with open(pid_output_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if not isinstance(payload, dict) or "results" not in payload:
        logger.warning(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    # Map each worker entry to (path, "success"/"fail").
    return [
        (entry.get("image_path", ""),
         "success" if entry.get("success", False) else "fail")
        for entry in payload.get("results", [])
    ]
+
+
def convert_pdf_to_images(
    pdf_file: str,
    output_dir: str | None = None,
    dpi: int = 200,
    page_range: str | None = None
) -> List[str]:
    """
    Convert a PDF into per-page PNG images, with optional page filtering.

    Images are written to ``<output_dir or pdf dir>/<pdf_stem>/`` and
    named ``<pdf_stem>_page_<NNN>.png`` using the ORIGINAL 1-based page
    number, so filenames stay stable even when only a subset of pages is
    rendered.

    Args:
        pdf_file: PDF file path.
        output_dir: Base output directory (defaults to a folder named
            after the PDF, next to it).
        dpi: Rendering resolution.
        page_range: Page-range string such as "1-5,7,9-12", parsed by
            ``PDFUtils.parse_page_range``.

    Returns:
        Paths of the generated images; empty list on any failure
        (invalid PDF, MinerU unavailable, empty page selection, or a
        rendering error).
    """
    pdf_path = Path(pdf_file)
    if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
        logger.error(f"❌ Invalid PDF file: {pdf_path}")
        return []

    # Default to a directory named after the PDF, next to the PDF itself
    if output_dir is None:
        output_path = pdf_path.parent / f"{pdf_path.stem}"
    else:
        output_path = Path(output_dir) / f"{pdf_path.stem}"
    output_path = output_path.resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        # Render pages via MinerU's loader (optional dependency)
        if not MINERU_AVAILABLE or load_images_from_pdf is None or ImageType is None:
            logger.error("❌ MinerU components not available for PDF to image conversion")
            return []

        images, _ = load_images_from_pdf(
            pdf_path.read_bytes(),
            dpi=dpi,
            image_type=ImageType.PIL  # returns a list of dicts containing 'img_pil'
        )

        # Apply the page-range filter, if one was given
        selected_pages = None
        if page_range:
            from .pdf_utils import PDFUtils
            total_pages = len(images)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            if selected_pages:
                images = [images[i] for i in sorted(selected_pages)]
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
            else:
                logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
                return []

        image_paths = []
        # Track the ORIGINAL page indices so filtered runs keep stable filenames
        original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))

        for idx, image in enumerate(images):
            # Original page index (used only for the filename)
            original_idx = original_indices[idx] if selected_pages else idx
            # Filename uses the original page number, 1-based, zero-padded
            image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
            image_path = output_path / image_filename

            # Save the image - the loader may return a dict holding 'img_pil'
            if isinstance(image, dict):
                pil_image = image.get('img_pil')
                if pil_image is None:
                    logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
                    continue
                pil_image.save(str(image_path))
            else:
                # Otherwise assume it is a PIL Image directly
                image.save(str(image_path))
            image_paths.append(str(image_path))

        logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
        return image_paths

    except Exception as e:
        logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
        traceback.print_exc()
        return []
+
+
def get_input_files(args, page_range: str | None = None) -> List[str]:
    """
    Build the list of input image files, handling PDFs and images
    uniformly, with optional page-range filtering.

    The input type is auto-detected:
    - a file path: dispatched by suffix (PDF, image, CSV list, text list)
    - a directory: scanned for all PDF and image files
    - a CSV file: rows with status "fail" are read as the file list
    - a text file (.txt/.list): one image path per line

    Args:
        args: Parsed CLI arguments; must provide ``input`` and may
            provide ``output_dir`` and ``pdf_dpi``.
        page_range: Optional page-range string such as "1-5,7,9-12".

    Returns:
        Sorted, de-duplicated list of image file paths to process.
        NOTE(review): the final sorted(set(...)) reorders everything
        lexicographically, which can interleave pages from different
        PDFs — confirm callers do not rely on per-document ordering.
    """
    input_files = []
    input_path = Path(args.input)

    if not input_path.exists():
        logger.error(f"❌ Input path does not exist: {input_path}")
        return []

    # Detect the input type
    if input_path.is_file():
        # Single file
        if input_path.suffix.lower() == '.pdf':
            # PDF file: convert to images
            logger.info(f"📄 Processing PDF: {input_path.name}")
            pdf_images = convert_pdf_to_images(
                str(input_path),
                getattr(args, 'output_dir', None),
                dpi=getattr(args, 'pdf_dpi', 200),
                page_range=page_range  # forward the page-range filter
            )
            input_files.extend(pdf_images)
        elif input_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']:
            # Image file: add directly
            input_files.append(str(input_path))
        elif input_path.suffix.lower() == '.csv':
            # CSV file: read the file list (only rows with status "fail")
            input_files = get_image_files_from_csv(str(input_path), "fail")
        elif input_path.suffix.lower() in ['.txt', '.list']:
            # Text file: read the file list
            input_files = get_image_files_from_list(str(input_path))
        else:
            logger.warning(f"⚠️ Unsupported file type: {input_path.suffix}")

    elif input_path.is_dir():
        # Directory: scan for all PDF and image files (non-recursive)
        image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
        pdf_extensions = ['.pdf']

        raw_files = []
        for ext in image_extensions + pdf_extensions:
            raw_files.extend(list(input_path.glob(f"*{ext}")))
            raw_files.extend(list(input_path.glob(f"*{ext.upper()}")))

        # Separate PDFs from images
        pdf_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() == '.pdf']
        image_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() in image_extensions]

        # For an image directory, page_range acts as an index filter over
        # the sorted image files (each image treated as one "page")
        if page_range and image_files:
            from .pdf_utils import PDFUtils
            total_pages = len(image_files)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            if selected_pages:
                image_files = [image_files[i] for i in sorted(selected_pages)]
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")
            else:
                logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效图片")
                image_files = []

        # Process PDFs and images separately
        pdf_count = 0
        image_count = 0

        for file_path in pdf_files:
            # Convert each PDF to images
            logger.info(f"📄 Processing PDF: {file_path.name}")
            pdf_images = convert_pdf_to_images(
                str(file_path),
                getattr(args, 'output_dir', None),
                dpi=getattr(args, 'pdf_dpi', 200),
                page_range=page_range  # forward the page-range filter
            )
            input_files.extend(pdf_images)
            pdf_count += 1

        for file_path in image_files:
            # Add image files directly
            input_files.append(str(file_path))
            image_count += 1

        logger.info(f"📊 Input summary:")
        logger.info(f"  PDF files processed: {pdf_count}")
        logger.info(f"  Image files found: {image_count}")

    logger.info(f"📊 Total image files to process: {len(input_files)}")

    return sorted(list(set(str(f) for f in input_files)))
+

+ 199 - 0
ocr_utils/html_generator.py

@@ -0,0 +1,199 @@
+"""
+HTML 生成器模块
+
+提供 HTML 输出功能:
+- 表格 HTML 生成(带样式)
+- 单元格坐标展示
+"""
+import json
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+
+
class HTMLGenerator:
    """HTML generator for table outputs with inline styling."""

    @staticmethod
    def save_table_htmls(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> Path:
        """
        Save one styled, standalone HTML file per detected table.

        Naming convention:
        - PDF input: <doc>_table_1_page_001.html
        - image input (single page): <doc>_table_1.html

        Args:
            results: Processing results (expects 'pages' -> 'elements').
            output_dir: Output directory; a 'tables' subfolder is created.
            doc_name: Document name used in filenames and page titles.
            is_pdf: Whether the input was a PDF.

        Returns:
            Path of the 'tables' directory.
        """
        tables_dir = output_dir / 'tables'
        tables_dir.mkdir(exist_ok=True)

        table_count = 0
        total_pages = len(results.get('pages', []))

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            for element in page.get('elements', []):
                if element.get('type') in ['table', 'table_body']:
                    # table_count numbers tables across the whole document
                    table_count += 1
                    content = element.get('content', {})
                    html = content.get('html', '')
                    cells = content.get('cells', [])

                    if html:
                        full_html = HTMLGenerator._generate_table_html_with_styles(
                            html, cells, doc_name, page_idx, table_count
                        )

                        # Pick the filename by input type: multi-page inputs
                        # get a page suffix, single images do not
                        if is_pdf or total_pages > 1:
                            html_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx + 1:03d}.html"
                        else:
                            html_path = tables_dir / f"{doc_name}_table_{table_count}.html"

                        with open(html_path, 'w', encoding='utf-8') as f:
                            f.write(full_html)

        if table_count > 0:
            logger.info(f"📊 {table_count} tables saved to: {tables_dir}")

        return tables_dir

    @staticmethod
    def _generate_table_html_with_styles(
        table_html: str,
        cells: List[Dict],
        doc_name: str,
        page_idx: int,
        table_idx: int
    ) -> str:
        """
        Build a complete standalone HTML page for one table.

        The page embeds the table fragment, CSS hover tooltips that show
        each cell's ``data-bbox`` attribute, and a collapsible JSON dump
        of the cell coordinate data.

        Args:
            table_html: Table HTML fragment.
            cells: Cell dicts, serialized verbatim into the page.
            doc_name: Document name.
            page_idx: Zero-based page index (displayed 1-based).
            table_idx: Table sequence number.

        Returns:
            Full HTML document string.
        """
        cells_json = json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]"

        return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{doc_name} - Table {table_idx}</title>
    <style>
        body {{
            font-family: Arial, "Microsoft YaHei", sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
            border-radius: 8px;
        }}
        .meta {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #ddd;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        tr:hover {{
            background-color: #f9f9f9;
        }}
        td[data-bbox], th[data-bbox] {{
            position: relative;
        }}
        td[data-bbox]:hover::after, th[data-bbox]:hover::after {{
            content: attr(data-bbox);
            position: absolute;
            bottom: 100%;
            left: 0;
            background: #333;
            color: white;
            padding: 2px 6px;
            font-size: 10px;
            border-radius: 3px;
            white-space: nowrap;
            z-index: 100;
        }}
        .cells-info {{
            margin-top: 30px;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 5px;
        }}
        .cells-info summary {{
            cursor: pointer;
            font-weight: bold;
            color: #333;
        }}
        .cells-info pre {{
            background-color: #2d2d2d;
            color: #f8f8f2;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            font-size: 12px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="meta">
            <p><strong>Document:</strong> {doc_name}</p>
            <p><strong>Page:</strong> {page_idx + 1}</p>
            <p><strong>Table:</strong> {table_idx}</p>
            <p><strong>Cells with coordinates:</strong> {len(cells)}</p>
        </div>
        
        {table_html}
        
        <div class="cells-info">
            <details>
                <summary>📍 单元格坐标数据 (JSON)</summary>
                <pre>{cells_json}</pre>
            </details>
        </div>
    </div>
</body>
</html>"""
+

+ 396 - 0
ocr_utils/json_formatters.py

@@ -0,0 +1,396 @@
+"""
+JSON 格式化工具模块
+
+提供 JSON 输出格式化功能:
+- MinerU middle.json 格式转换
+- mineru_vllm_results_cell_bbox 格式转换
+- 表格单元格格式化
+- 金额数字标准化(全角→半角)
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+
+# 导入数字标准化工具
+from .normalize_financial_numbers import normalize_json_table
+
class JSONFormatters:
    """JSON formatting utilities for OCR/document processing results."""

    @staticmethod
    def convert_to_middle_json(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert processing results to MinerU's standard middle.json format.

        The output is consumed by ``vlm_union_make`` to generate Markdown.

        Args:
            results: Processing results; expects 'pages' and optionally
                'scene'.

        Returns:
            Dict in MinerU middle.json layout ('pdf_info' plus metadata).
        """
        middle_json = {
            "pdf_info": [],
            "_backend": "vlm",
            "_scene": results.get('scene', 'unknown'),
            "_version_name": "2.5.0"
        }

        for page in results.get('pages', []):
            page_info = {
                'page_idx': page['page_idx'],
                # first two entries of image_shape, reversed — presumably
                # (h, w) -> [w, h]; confirm against the producer of 'image_shape'
                'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]),
                'angle': page.get('angle', 0),
                'para_blocks': [],
                'discarded_blocks': []
            }

            # Regular elements; header/footer-like types are routed into
            # discarded_blocks instead of para_blocks
            for element in page.get('elements', []):
                block = JSONFormatters._element_to_middle_block(element)
                if block:
                    elem_type = element.get('type', '')
                    if elem_type in ['header', 'footer', 'page_number', 'aside_text', 'abandon', 'discarded']:
                        page_info['discarded_blocks'].append(block)
                    else:
                        page_info['para_blocks'].append(block)

            # Elements the page already carries in its own discarded list
            for element in page.get('discarded_blocks', []):
                block = JSONFormatters._element_to_middle_block(element)
                if block:
                    page_info['discarded_blocks'].append(block)

            middle_json['pdf_info'].append(page_info)

        return middle_json

    @staticmethod
    def _element_to_middle_block(element: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Convert one element to a MinerU middle.json block.

        MinerU expects nested structures for media blocks:
        - image: { type: "image", blocks: [{ type: "image_body", lines: [...] }] }
        - table: { type: "table", blocks: [{ type: "table_body", lines: [...] }] }

        NOTE(review): element types not matched by any branch fall through
        and are still returned as a bare block with empty 'lines'.
        """
        elem_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])
        content = element.get('content', {})

        block = {
            'type': elem_type,
            'bbox': bbox,
            'angle': element.get('angle', 0),
            'reading_order': element.get('reading_order', 0),
            'lines': []
        }

        # Plain text types
        if elem_type in ['text', 'title', 'ref_text', 'header', 'footer', 'ocr_text']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        # Table types - nested structure
        elif elem_type in ['table', 'table_body']:
            table_html = content.get('html', '')
            cells = content.get('cells', [])

            block['type'] = 'table'
            block['blocks'] = [{
                'type': 'table_body',
                'bbox': bbox,
                'angle': 0,
                'lines': [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'table',
                        'html': table_html,
                        'cells': cells
                    }]
                }]
            }]

        # Image types - nested structure
        elif elem_type in ['image', 'image_body', 'figure']:
            block['type'] = 'image'
            block['blocks'] = [{
                'type': 'image_body',
                'bbox': bbox,
                'angle': element.get('angle', 0),
                'lines': [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'image',
                        'image_path': content.get('image_path', ''),
                        'description': content.get('description', '')
                    }]
                }]
            }]

        # Equation types
        elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
            latex = content.get('latex', '')
            block['lines'] = [{
                'bbox': bbox,
                'spans': [{
                    'bbox': bbox,
                    'type': 'interline_equation' if 'interline' in elem_type else 'inline_equation',
                    'content': latex
                }]
            }]

        # Captions/footnotes attached to tables or images
        elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        # Discarded types
        elif elem_type in ['abandon', 'discarded']:
            block['type'] = 'abandon'
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            if text:
                block['lines'] = [{
                    'bbox': bbox,
                    'spans': [{
                        'bbox': bbox,
                        'type': 'text',
                        'content': text
                    }]
                }]

        return block

    @staticmethod
    def save_page_jsons(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True,
        normalize_numbers: bool = True
    ) -> List[str]:
        """
        Save one JSON file per page (mineru_vllm_results_cell_bbox format).

        Naming convention:
        - PDF input: <doc>_page_001.json
        - image input (single page): <doc>.json

        When number normalization changes the content, the
        pre-normalization version is also kept as <name>_original.json.

        Args:
            results: Processing results.
            output_dir: Output directory.
            doc_name: Document name.
            is_pdf: Whether the input was a PDF.
            normalize_numbers: Normalize full-width digits to half-width.

        Returns:
            Paths of the saved per-page JSON files.
        """
        saved_paths = []
        total_pages = len(results.get('pages', []))

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            # Pick the filename by input type (multi-page gets a page suffix)
            if is_pdf or total_pages > 1:
                page_name = f"{doc_name}_page_{page_idx + 1:03d}"
            else:
                page_name = doc_name

            # Convert elements to the mineru_vllm_results_cell_bbox format
            page_elements = []
            for element in page.get('elements', []):
                converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
                if converted:
                    page_elements.append(converted)

            # Include elements the page marked as discarded
            for element in page.get('discarded_blocks', []):
                converted = JSONFormatters._element_to_cell_bbox_format(element, page_idx)
                if converted:
                    page_elements.append(converted)

            # Serialize to a JSON string
            json_content = json.dumps(page_elements, ensure_ascii=False, indent=2)

            # Normalize full-width financial digits; only write the
            # "_original" copy when normalization actually changed the text
            if normalize_numbers:
                original_content = json_content
                json_content = normalize_json_table(json_content)

                if json_content != original_content:
                    original_path = output_dir / f"{page_name}_original.json"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.debug(f"📄 Original page JSON saved: {original_path}")

            # Write the (possibly normalized) page JSON
            json_path = output_dir / f"{page_name}.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(json_content)

            saved_paths.append(str(json_path))
            logger.debug(f"📄 Page JSON saved: {json_path}")

        if saved_paths:
            logger.info(f"📄 {len(saved_paths)} page JSONs saved")

        return saved_paths

    @staticmethod
    def _element_to_cell_bbox_format(
        element: Dict[str, Any],
        page_idx: int
    ) -> Optional[Dict[str, Any]]:
        """
        Convert one element to the mineru_vllm_results_cell_bbox format.

        Returns None for element types that have no mapping.
        """
        elem_type = element.get('type', '')
        bbox = element.get('bbox', [0, 0, 0, 0])
        content = element.get('content', {})

        # Force bbox into a 4-element integer list
        bbox = [int(x) for x in bbox[:4]] if bbox else [0, 0, 0, 0]

        result = {
            'bbox': bbox,
            'page_idx': page_idx,
            'reading_order': element.get('reading_order', 0)
        }

        # Text types
        if elem_type in ['text', 'title', 'ref_text', 'ocr_text']:
            text = content.get('text', '') if isinstance(content, dict) else str(content)
            result['type'] = 'text' if elem_type != 'title' else 'title'
            result['text'] = text
            if elem_type == 'title':
                result['text_level'] = element.get('level', 1)

        # Table types
        elif elem_type in ['table', 'table_body']:
            result['type'] = 'table'
            result['img_path'] = content.get('table_image_path', '')
            result['table_caption'] = JSONFormatters._ensure_list(content.get('table_caption', []))
            result['table_footnote'] = JSONFormatters._ensure_list(content.get('table_footnote', []))
            result['table_body'] = content.get('html', '')

            # Key field: the table_cells array
            cells = content.get('cells', [])
            if cells:
                result['table_cells'] = JSONFormatters.format_table_cells(cells)

            # Rotation and skew information, when present
            if 'table_angle' in content:
                result['image_rotation_angle'] = float(content['table_angle'])
            if 'skew_angle' in content:
                result['skew_angle'] = float(content['skew_angle'])

        # Image types
        elif elem_type in ['image', 'image_body', 'figure']:
            result['type'] = 'image'
            image_filename = content.get('image_path', '')
            result['img_path'] = f"images/{image_filename}" if image_filename else ''
            result['image_caption'] = JSONFormatters._ensure_list(content.get('caption', []))
            result['image_footnote'] = JSONFormatters._ensure_list(content.get('footnote', []))

        # Equation types
        elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
            result['type'] = 'equation'
            result['text'] = content.get('latex', '') if isinstance(content, dict) else ''
            result['text_format'] = 'latex'

        # List type
        elif elem_type == 'list':
            result['type'] = 'list'
            result['sub_type'] = 'text'
            result['list_items'] = content.get('list_items', []) if isinstance(content, dict) else []

        # Header / footer
        elif elem_type in ['header', 'footer']:
            result['type'] = elem_type
            result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)

        # Captions/footnotes attached to tables or images
        elif elem_type in ['table_caption', 'table_footnote', 'image_caption', 'image_footnote']:
            result['type'] = elem_type
            result['text'] = content.get('text', '') if isinstance(content, dict) else str(content)

        # Discarded elements
        elif elem_type in ['discarded', 'abandon']:
            result['type'] = 'discarded'
            result['original_category'] = element.get('original_category', 'unknown')
            result['text'] = content.get('text', '') if isinstance(content, dict) else ''

        else:
            return None

        return result

    @staticmethod
    def format_table_cells(cells: List[Dict]) -> List[Dict[str, Any]]:
        """
        Format table cells into the mineru_vllm_results_cell_bbox layout.

        Output format:
        {
            "type": "table_cell",
            "text": "cell content",
            "matched_text": "matched OCR text",
            "bbox": [x1, y1, x2, y2],
            "row": 1,
            "col": 1,
            "score": 100.0,
            "paddle_bbox_indices": [0, 1]
        }
        """
        formatted_cells = []

        for cell in cells:
            formatted_cell = {
                'type': 'table_cell',
                'text': cell.get('text', ''),
                # falls back to 'text' when no separate OCR match exists
                'matched_text': cell.get('matched_text', cell.get('text', '')),
                'bbox': [float(x) for x in cell.get('bbox', [0, 0, 0, 0])[:4]],
                'row': cell.get('row', 0),
                'col': cell.get('col', 0),
                'score': float(cell.get('score', 100.0)),
                # accept both historical key spellings
                'paddle_bbox_indices': cell.get('paddle_bbox_indices',
                                                cell.get('paddle_indices', []))
            }
            formatted_cells.append(formatted_cell)

        return formatted_cells

    @staticmethod
    def _ensure_list(value) -> List:
        """Coerce a value to a list (None -> [], "" -> [], str -> [str], other -> [str(other)])."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, list):
            return value
        return [str(value)]
+

+ 35 - 0
ocr_utils/log_utils.py

@@ -0,0 +1,35 @@
+"""
+日志工具模块
+
+提供统一的日志配置功能
+"""
+import sys
+from pathlib import Path
+from loguru import logger
+
def setup_logging(log_level: str = "INFO", log_file: str | None = None):
    """
    Configure loguru sinks for the application.

    Args:
        log_level: console log level (DEBUG, INFO, WARNING, ERROR)
        log_file: optional log file path; when given, a DEBUG-level
            rotating file sink (10 MB) is added as well
    """
    # Drop loguru's default handler before installing ours.
    logger.remove()

    console_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
    )
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"

    # Console sink
    logger.add(sys.stdout, level=log_level, format=console_format)

    # Optional file sink (always DEBUG so the file keeps full detail)
    if log_file:
        logger.add(log_file, level="DEBUG", format=file_format, rotation="10 MB")
+

+ 395 - 0
ocr_utils/markdown_generator.py

@@ -0,0 +1,395 @@
+"""
+Markdown 生成器模块
+
+提供 Markdown 输出功能:
+- 完整文档 Markdown 生成
+- 按页 Markdown 生成
+- MinerU union_make 集成
+- 金额数字标准化(全角→半角)
+"""
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Tuple, Optional
+from loguru import logger
+
# Import MinerU components: make the repository root importable so the
# optional MinerU backend can be resolved when this package lives inside it.
mineru_path = Path(__file__).parents[3]
if str(mineru_path) not in sys.path:
    sys.path.insert(0, str(mineru_path))

try:
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
    from mineru.utils.enum_class import MakeMode
    MINERU_AVAILABLE = True
except ImportError:
    # MinerU is optional: fall back to the custom Markdown generator and
    # provide a minimal stand-in for MakeMode so later references resolve.
    MINERU_AVAILABLE = False
    vlm_union_make = None

    class MakeMode:
        MM_MD = 'mm_md'
        NLP_MD = 'nlp_md'

# Number-normalization helper (full-width -> half-width digits/punctuation)
from .normalize_financial_numbers import normalize_markdown_table
+
+
class MarkdownGenerator:
    """Markdown generator: renders pipeline results as Markdown documents."""

    @staticmethod
    def save_markdown(
        results: Dict[str, Any],
        middle_json: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        use_mineru_union: bool = False,
        normalize_numbers: bool = True
    ) -> Tuple[Path, Optional[Path]]:
        """
        Save the full-document Markdown file.

        Uses the custom implementation by default so that every element type
        (including table_caption etc.) is handled correctly. MinerU
        union_make can be opted into, but it does not emit standalone
        elements such as table_caption.

        Args:
            results: processing results
            middle_json: data in middle.json format
            output_dir: output directory
            doc_name: document name
            use_mineru_union: whether to use MinerU union_make (default False)
            normalize_numbers: whether to normalize financial numbers
                (full-width -> half-width)

        Returns:
            (Markdown file path, original-content file path or None)
        """
        md_path = output_dir / f"{doc_name}.md"
        original_path = None

        if use_mineru_union and MINERU_AVAILABLE and vlm_union_make is not None:
            try:
                img_bucket_path = "images"
                markdown_content = vlm_union_make(
                    middle_json['pdf_info'],
                    MakeMode.MM_MD,
                    img_bucket_path
                )

                if markdown_content:
                    # union_make may return a per-page list; join into one doc.
                    if isinstance(markdown_content, list):
                        markdown_content = '\n\n'.join(markdown_content)

                    header = MarkdownGenerator._generate_header(results)
                    markdown_content = header + str(markdown_content)

                    # Normalize financial numbers (full-width -> half-width)
                    if normalize_numbers:
                        original_content = markdown_content
                        markdown_content = normalize_markdown_table(markdown_content)

                        # Keep a pre-normalization copy for auditing
                        if markdown_content != original_content:
                            original_path = output_dir / f"{doc_name}_original.md"
                            with open(original_path, 'w', encoding='utf-8') as f:
                                f.write(original_content)
                            logger.info(f"📝 Original Markdown saved: {original_path}")

                    with open(md_path, 'w', encoding='utf-8') as f:
                        f.write(markdown_content)

                    logger.info(f"📝 Markdown saved (MinerU format): {md_path}")
                    return md_path, original_path

            except Exception as e:
                logger.warning(f"MinerU union_make failed: {e}, falling back to custom implementation")

        # Custom implementation: guarantees every element type is rendered
        markdown_content = MarkdownGenerator._generate_full_markdown(results)

        # Normalize financial numbers (full-width -> half-width)
        if normalize_numbers:
            original_content = markdown_content
            markdown_content = normalize_markdown_table(markdown_content)

            if markdown_content != original_content:
                original_path = output_dir / f"{doc_name}_original.md"
                with open(original_path, 'w', encoding='utf-8') as f:
                    f.write(original_content)
                logger.info(f"📝 Original Markdown saved: {original_path}")

        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        logger.info(f"📝 Markdown saved (custom format): {md_path}")
        return md_path, original_path

    @staticmethod
    def save_page_markdowns(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True,
        normalize_numbers: bool = True
    ) -> List[str]:
        """
        Save one Markdown file per page.

        Naming rules:
        - PDF input: <doc_name>_page_001.md
        - Single-image input: <doc_name>.md (skipped: full version exists)

        Args:
            results: processing results
            output_dir: output directory
            doc_name: document name
            is_pdf: whether the input was a PDF
            normalize_numbers: whether to normalize financial numbers
                (full-width -> half-width)

        Returns:
            List of saved Markdown file paths
        """
        saved_paths = []
        total_pages = len(results.get('pages', []))

        # Single-image input: skip per-page output (the full doc_name.md exists)
        if not is_pdf and total_pages == 1:
            logger.debug("📝 Single image input, skipping page markdown (full version exists)")
            return saved_paths

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            # Choose the file name based on the input type
            if is_pdf or total_pages > 1:
                page_name = f"{doc_name}_page_{page_idx + 1:03d}"
            else:
                page_name = doc_name

            # Render the single-page Markdown
            md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)

            # Normalize financial numbers
            if normalize_numbers:
                original_content = md_content
                md_content = normalize_markdown_table(md_content)

                if md_content != original_content:
                    original_path = output_dir / f"{page_name}_original.md"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.debug(f"📝 Original page Markdown saved: {original_path}")

            # Save
            md_path = output_dir / f"{page_name}.md"
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(md_content)

            saved_paths.append(str(md_path))
            logger.debug(f"📝 Page Markdown saved: {md_path}")

        if saved_paths:
            logger.info(f"📝 {len(saved_paths)} page Markdowns saved")

        return saved_paths

    @staticmethod
    def _generate_header(results: Dict[str, Any]) -> str:
        """Generate the Markdown file header (an HTML comment with metadata)."""
        return f"""<!--
scene: {results.get('scene', 'unknown')}
document: {results.get('document_path', '')}
pages: {len(results.get('pages', []))}
-->
"""

    @staticmethod
    def _generate_full_markdown(results: Dict[str, Any]) -> str:
        """
        Generate the full-document Markdown (custom implementation).

        Ensures every element type is handled, including table_caption,
        table_footnote, and friends.

        Args:
            results: processing results

        Returns:
            Markdown content string
        """
        md_lines = [
            f"<!-- ",
            f"scene: {results.get('scene', 'unknown')}",
            f"document: {results.get('document_path', '')}",
            f"pages: {len(results.get('pages', []))}",
            f"-->",
            "",
        ]

        for page in results.get('pages', []):
            # Process elements in reading order
            for element in page.get('elements', []):
                elem_type = element.get('type', '')
                content = element.get('content', {})

                if elem_type == 'title':
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    level = element.get('level', 1)
                    if text:
                        md_lines.append(f"{'#' * min(level, 6)} {text}")
                        md_lines.append("")

                elif elem_type in ['text', 'ocr_text', 'ref_text']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        md_lines.append(text)
                        md_lines.append("")

                elif elem_type in ['table', 'table_body']:
                    # Tables are emitted as raw HTML (keeps cell bbox attributes)
                    html = content.get('html', '')
                    if html:
                        md_lines.append(f"\n{html}\n")
                        md_lines.append("")

                elif elem_type in ['image', 'image_body', 'figure']:
                    img_filename = content.get('image_path', '')
                    if img_filename:
                        md_lines.append(f"![](images/{img_filename})")
                        md_lines.append("")

                elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
                    latex = content.get('latex', '')
                    if latex:
                        md_lines.append(f"$$\n{latex}\n$$")
                        md_lines.append("")

                elif elem_type in ['table_caption', 'table_footnote']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        # Bold for captions, italics for footnotes
                        if elem_type == 'table_caption':
                            md_lines.append(f"**{text}**")
                        else:
                            md_lines.append(f"*{text}*")
                        md_lines.append("")

                elif elem_type in ['image_caption', 'image_footnote']:
                    text = content.get('text', '') if isinstance(content, dict) else str(content)
                    if text:
                        # Bold for captions, italics for footnotes
                        if elem_type == 'image_caption':
                            md_lines.append(f"**{text}**")
                        else:
                            md_lines.append(f"*{text}*")
                        md_lines.append("")

        return '\n'.join(md_lines)

    @staticmethod
    def _generate_page_markdown(
        page: Dict[str, Any],
        doc_name: str,
        page_idx: int
    ) -> str:
        """
        Generate the Markdown content for a single page.

        Args:
            page: page data
            doc_name: document name
            page_idx: zero-based page index

        Returns:
            Markdown content string
        """
        md_lines = [
            f"<!--",
            f"document: {doc_name}",
            f"page: {page_idx + 1}",
            f"angle: {page.get('angle', 0)}",
            f"-->",
            "",
        ]

        for element in page.get('elements', []):
            elem_type = element.get('type', '')
            content = element.get('content', {})
            bbox = element.get('bbox', [])
            reading_order = element.get('reading_order', 0)

            # Emit a positional comment before each element
            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")

            if elem_type == 'title':
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                level = element.get('level', 1)
                md_lines.append(f"{'#' * min(level, 6)} {text}")
                md_lines.append("")

            elif elem_type in ['text', 'ocr_text', 'ref_text']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    md_lines.append(text)
                    md_lines.append("")

            elif elem_type in ['table', 'table_body']:
                # Inline captions stored on the table element itself
                table_captions = content.get('table_caption', [])
                if isinstance(table_captions, str):
                    table_captions = [table_captions] if table_captions else []
                for caption in table_captions:
                    md_lines.append(f"**{caption}**")

                html = content.get('html', '')
                if html:
                    md_lines.append(f"\n{html}\n")
                md_lines.append("")

            elif elem_type in ['image', 'image_body', 'figure']:
                img_filename = content.get('image_path', '')
                if img_filename:
                    md_lines.append(f"![](images/{img_filename})")
                    md_lines.append("")

            elif elem_type in ['interline_equation', 'inline_equation', 'equation']:
                latex = content.get('latex', '')
                if latex:
                    md_lines.append(f"$$\n{latex}\n$$")
                    md_lines.append("")

            elif elem_type in ['table_caption', 'table_footnote']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    # Bold for table captions, italics for table footnotes
                    if elem_type == 'table_caption':
                        md_lines.append(f"**{text}**")
                    else:
                        md_lines.append(f"*{text}*")
                    md_lines.append("")

            elif elem_type in ['image_caption', 'image_footnote']:
                text = content.get('text', '') if isinstance(content, dict) else str(content)
                if text:
                    # Bold for image captions, italics for image footnotes
                    if elem_type == 'image_caption':
                        md_lines.append(f"**{text}**")
                    else:
                        md_lines.append(f"*{text}*")
                    md_lines.append("")

            elif elem_type == 'discarded':
                text = content.get('text', '') if isinstance(content, dict) else ''
                if text:
                    md_lines.append(f"<!-- [discarded: {element.get('original_category', 'unknown')}] {text} -->")
                    md_lines.append("")

        # Render discarded blocks as comments so nothing is silently lost
        for element in page.get('discarded_blocks', []):
            content = element.get('content', {})
            bbox = element.get('bbox', [])
            reading_order = element.get('reading_order', 0)
            original_category = element.get('original_category', 'unknown')

            md_lines.append(f"<!-- reading_order: {reading_order}, bbox: {bbox} -->")
            text = content.get('text', '') if isinstance(content, dict) else ''
            if text:
                md_lines.append(f"<!-- [discarded: {original_category}] {text} -->")
            else:
                md_lines.append(f"<!-- [discarded: {original_category}] (no text) -->")
            md_lines.append("")

        return '\n'.join(md_lines)
+

+ 269 - 0
ocr_utils/normalize_financial_numbers.py

@@ -0,0 +1,269 @@
+import re
+import os
+from pathlib import Path
+
def normalize_financial_numbers(text: str) -> str:
    """
    Normalize financial numbers: convert full-width characters to half-width
    and collapse stray whitespace inside digit sequences.

    Args:
        text: raw text

    Returns:
        Normalized text
    """
    if not text:
        return text

    # Full-width -> half-width character table (digits plus common punctuation)
    translation = str.maketrans({
        '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
        '5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
        ',': ',',  # full-width comma
        '。': '.',  # ideographic full stop
        '.': '.',  # full-width period
        ':': ':',  # full-width colon
        ';': ';',  # full-width semicolon
        '(': '(',  # full-width left paren
        ')': ')',  # full-width right paren
        '-': '-',  # full-width minus
        '+': '+',  # full-width plus
        '%': '%',  # full-width percent
    })

    # Step 1: single-pass character substitution
    result = text.translate(translation)

    # Step 2: clean up whitespace around separators within digit sequences.
    # Matches: digits + (space? separator space? digits)* + optional decimal part
    sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'

    def _tidy_sequence(match):
        seq = match.group(1)
        # "1 , 234" -> "1,234"
        seq = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', seq)
        # "234 . 5" -> "234.5"
        seq = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', seq)
        return seq

    return re.sub(sequence_pattern, _tidy_sequence, result)
+    
def normalize_markdown_table(markdown_content: str) -> str:
    """
    Normalize numbers inside HTML tables embedded in Markdown.

    Note: line breaks of the original markdown are preserved; only the text
    inside table cells is rewritten.

    Args:
        markdown_content: Markdown content

    Returns:
        Normalized Markdown content
    """
    # bs4 is imported lazily so the module stays importable without it.
    # Fix vs. original: NavigableString is imported once here instead of
    # inside the per-cell loop, the unused `replacements` list is removed,
    # and the redundant local `import re` (re is a module-level import) is gone.
    from bs4 import BeautifulSoup, Tag
    from bs4.element import NavigableString

    # Locate complete HTML tables; everything outside them is untouched.
    table_pattern = r'(<table[^>]*>.*?</table>)'

    def normalize_table_match(match):
        """Normalize one table match while keeping its original formatting."""
        table_html = match.group(1)
        original_table_html = table_html  # kept for the no-op comparison below

        soup = BeautifulSoup(table_html, 'html.parser')

        for table in soup.find_all('table'):
            if not isinstance(table, Tag):
                continue
            for cell in table.find_all(['td', 'th']):
                if not isinstance(cell, Tag):
                    continue
                # Cheap gate: skip cells whose full text is already normalized.
                original_text = cell.get_text()
                if normalize_financial_numbers(original_text) == original_text:
                    continue
                # Replace each text node, preserving leading/trailing whitespace.
                for text_node in cell.find_all(string=True, recursive=True):
                    if isinstance(text_node, NavigableString):
                        text_str = str(text_node)
                        if text_str.strip():
                            normalized = normalize_financial_numbers(text_str.strip())
                            if normalized != text_str.strip():
                                if text_str.strip() == text_str:
                                    # Pure text node: replace directly
                                    text_node.replace_with(normalized)
                                else:
                                    # Keep the surrounding whitespace intact
                                    leading_ws = text_str[:len(text_str) - len(text_str.lstrip())]
                                    trailing_ws = text_str[len(text_str.rstrip()):]
                                    text_node.replace_with(leading_ws + normalized + trailing_ws)

        modified_html = str(soup)

        # If only whitespace/formatting differs, return the original HTML so
        # its line breaks are preserved.
        original_text_only = re.sub(r'\s+', '', original_table_html)
        modified_text_only = re.sub(r'\s+', '', modified_html)
        if original_text_only == modified_text_only:
            return original_table_html

        # Real content changed: use the re-serialized HTML
        return modified_html

    # Replace only table spans; the rest of the markdown (incl. newlines) stays
    return re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
+
def normalize_json_table(json_content: str) -> str:
    """
    Normalize table numbers inside JSON-formatted OCR results.

    Expected json_content shape::

        [
            {"category": "Table", "text": "<table>...</table>"},
            {"category": "Text", "text": "Some other text"}
        ]

    (Fix vs. original: the example above used to be a second, free-standing
    triple-quoted string — a no-op statement, not documentation — and is now
    part of the docstring.)

    Args:
        json_content: OCR results as a JSON string (an already-parsed list
            is also accepted)

    Returns:
        Normalized JSON content; the input is returned unchanged on parse
        errors or unexpected shapes.
    """
    import json

    try:
        # Accept either a JSON string or an already-parsed structure
        data = json.loads(json_content) if isinstance(json_content, str) else json_content

        # Only list-shaped payloads are handled
        if not isinstance(data, list):
            return json_content

        for item in data:
            if not isinstance(item, dict):
                continue

            # Only table items carry HTML that needs cell-level rewriting
            if item.get('category') == 'Table' and 'text' in item:
                # Imported lazily so non-table payloads work without bs4
                from bs4 import BeautifulSoup, Tag

                soup = BeautifulSoup(item['text'], 'html.parser')
                for table in soup.find_all('table'):
                    if isinstance(table, Tag):
                        for cell in table.find_all(['td', 'th']):
                            if isinstance(cell, Tag):
                                original_text = cell.get_text()
                                normalized_text = normalize_financial_numbers(original_text)
                                # Only touch cells that actually changed
                                if original_text != normalized_text:
                                    cell.string = normalized_text

                # Write the rewritten table back into the item
                item['text'] = str(soup)

            # Plain-text items could be normalized here as well if ever needed.

        # Return the normalized structure as a JSON string
        return json.dumps(data, ensure_ascii=False, indent=2)

    except json.JSONDecodeError as e:
        print(f"⚠️ JSON解析失败: {e}")
        return json_content
    except Exception as e:
        print(f"⚠️ JSON表格标准化失败: {e}")
        return json_content
+
def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
    """
    Normalize table numbers in a JSON file.

    Args:
        file_path: input JSON file path
        output_path: output file path; overwrites the input file when None

    Returns:
        The normalized JSON content
    """
    src = Path(file_path)
    dst = Path(output_path) if output_path else src

    if not src.exists():
        raise FileNotFoundError(f"找不到文件: {file_path}")

    # Load the original content
    original_content = src.read_text(encoding='utf-8')

    print(f"🔧 正在标准化JSON文件: {src.name}")

    # Normalize
    normalized_content = normalize_json_table(original_content)

    # Persist the normalized result
    dst.write_text(normalized_content, encoding='utf-8')

    # Count positional character differences (over the common prefix length)
    changes = sum(1 for o, n in zip(original_content, normalized_content) if o != n)
    if changes > 0:
        print(f"✅ 标准化了 {changes} 个字符")

        # When writing to a different path, keep a copy of the original too
        if output_path and output_path != file_path:
            original_backup = Path(output_path).parent / f"{Path(output_path).stem}_original.json"
            original_backup.write_text(original_content, encoding='utf-8')
            print(f"📄 原始版本已保存到: {original_backup}")
    else:
        print("ℹ️ 无需标准化(已是标准格式)")

    print(f"📄 标准化结果已保存到: {dst}")
    return normalized_content
+

+ 284 - 0
ocr_utils/output_formatter_v2.py

@@ -0,0 +1,284 @@
+"""
+统一输出格式化器 v2
+
+严格遵循 MinerU mineru_vllm_results_cell_bbox 格式
+
+支持:
+1. MinerU 标准 middle.json 格式(用于 union_make 生成 Markdown)
+2. mineru_vllm_results_cell_bbox 格式(每页独立 JSON)
+3. Markdown 输出(复用 MinerU union_make)
+4. Debug 模式:layout 图片、OCR 图片
+5. 表格 HTML 输出(带坐标信息)
+6. 金额数字标准化(全角→半角转换)
+
+模块结构:
+- json_formatters.py: JSON 格式化工具
+- markdown_generator.py: Markdown 生成器
+- html_generator.py: HTML 生成器
+- visualization_utils.py: 可视化工具
+"""
+import json
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+
+# 导入子模块
+from .json_formatters import JSONFormatters
+from .markdown_generator import MarkdownGenerator
+from .html_generator import HTMLGenerator
+from .visualization_utils import VisualizationUtils
+
+# 导入数字标准化工具
+from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
+
+
class OutputFormatterV2:
    """
    Unified output formatter.

    Strictly follows the MinerU mineru_vllm_results_cell_bbox format:
    - middle.json: MinerU standard format, used to generate Markdown
    - page_xxx.json: one JSON per page, including table_cells
    - Markdown: with bbox comments
    - Tables: HTML with data-bbox attributes

    Naming rules:
    - PDF input: <name>_page_001.* (numbered per page)
    - Image input: <name>.* (no page suffix)
    """

    # Color maps (re-exported for use by other modules)
    COLOR_MAP = VisualizationUtils.COLOR_MAP
    OCR_BOX_COLOR = VisualizationUtils.OCR_BOX_COLOR
    CELL_BOX_COLOR = VisualizationUtils.CELL_BOX_COLOR

    def __init__(self, output_dir: str):
        """
        Initialize the formatter.

        Args:
            output_dir: output directory (created if missing)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def is_pdf_input(results: Dict[str, Any]) -> bool:
        """
        Decide whether the input was a PDF.

        Args:
            results: processing results

        Returns:
            True if the input is a PDF, otherwise False
        """
        doc_path = results.get('document_path', '')
        if doc_path:
            return Path(doc_path).suffix.lower() == '.pdf'

        # No document_path: fall back to the metadata hint
        input_type = results.get('metadata', {}).get('input_type', '')
        return input_type == 'pdf'

    @staticmethod
    def get_page_name(doc_name: str, page_idx: int, is_pdf: bool, total_pages: int = 1) -> str:
        """
        Build the per-page base name.

        Args:
            doc_name: document name
            page_idx: zero-based page index
            is_pdf: whether the input is a PDF
            total_pages: total number of pages

        Returns:
            Page name (without extension)
        """
        if is_pdf or total_pages > 1:
            # PDF or multi-page input: append a page suffix
            return f"{doc_name}_page_{page_idx + 1:03d}"
        else:
            # Single image: no page suffix
            return doc_name

    def save_results(
        self,
        results: Dict[str, Any],
        output_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Persist the processing results.

        Naming rules:
        - PDF input: <name>_page_001.* (numbered per page)
        - Image input: <name>.* (no page suffix)

        Args:
            results: processing results
            output_config: output options, including:
                - create_subdir: create a per-document subdirectory (default False)
                - ... see save_mineru_format for the remaining keys

        Returns:
            Mapping of output kinds to file paths
        """
        output_paths: Dict[str, Any] = {
            'images': [],
            'json_pages': [],
        }

        # Per-document output directory
        doc_name = Path(results['document_path']).stem

        # Optional subdirectory (default: write straight into output_dir)
        create_subdir = output_config.get('create_subdir', False)
        if create_subdir:
            doc_output_dir = self.output_dir / doc_name
        else:
            doc_output_dir = self.output_dir
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        # Determine the input type
        is_pdf = self.is_pdf_input(results)
        total_pages = len(results.get('pages', []))

        # Create the images subdirectory
        images_dir = doc_output_dir / 'images'
        images_dir.mkdir(exist_ok=True)

        # 1. Save image elements first (sets image_path on elements)
        image_paths = VisualizationUtils.save_image_elements(
            results, images_dir, doc_name, is_pdf=is_pdf
        )
        if image_paths:
            output_paths['images'] = image_paths

        # 2. Convert to the MinerU middle.json structure
        middle_json = JSONFormatters.convert_to_middle_json(results)

        # 3. Save middle.json
        if output_config.get('save_json', True):
            json_path = doc_output_dir / f"{doc_name}_middle.json"
            json_content = json.dumps(middle_json, ensure_ascii=False, indent=2)

            # Normalize financial numbers (full-width -> half-width)
            normalize_numbers = output_config.get('normalize_numbers', True)
            if normalize_numbers:
                original_content = json_content
                json_content = normalize_json_table(json_content)

                # Keep a pre-normalization copy if anything changed
                if json_content != original_content:
                    original_path = doc_output_dir / f"{doc_name}_middle_original.json"
                    with open(original_path, 'w', encoding='utf-8') as f:
                        f.write(original_content)
                    logger.info(f"📄 Original middle JSON saved: {original_path}")
                    output_paths['middle_json_original'] = str(original_path)

            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(json_content)
            output_paths['middle_json'] = str(json_path)
            logger.info(f"📄 Middle JSON saved: {json_path}")

        # 4. Save per-page mineru_vllm_results_cell_bbox JSON files
        if output_config.get('save_page_json', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            page_json_paths = JSONFormatters.save_page_jsons(
                results, doc_output_dir, doc_name, is_pdf=is_pdf,
                normalize_numbers=normalize_numbers
            )
            output_paths['json_pages'] = page_json_paths

        # 5. Save the full-document Markdown
        if output_config.get('save_markdown', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            md_path, original_md_path = MarkdownGenerator.save_markdown(
                results, middle_json, doc_output_dir, doc_name,
                normalize_numbers=normalize_numbers
            )
            output_paths['markdown'] = str(md_path)
            if original_md_path:
                output_paths['markdown_original'] = str(original_md_path)

        # 5.5 Save per-page Markdown files
        if output_config.get('save_page_markdown', True):
            normalize_numbers = output_config.get('normalize_numbers', True)
            page_md_paths = MarkdownGenerator.save_page_markdowns(
                results, doc_output_dir, doc_name, is_pdf=is_pdf,
                normalize_numbers=normalize_numbers
            )
            output_paths['markdown_pages'] = page_md_paths

        # 6. Save table HTML files
        if output_config.get('save_html', True):
            html_dir = HTMLGenerator.save_table_htmls(
                results, doc_output_dir, doc_name, is_pdf=is_pdf
            )
            output_paths['table_htmls'] = str(html_dir)

        # 7. Debug mode: save visualization images
        if output_config.get('save_layout_image', False):
            layout_paths = VisualizationUtils.save_layout_images(
                results, doc_output_dir, doc_name,
                draw_type_label=output_config.get('draw_type_label', True),
                draw_bbox_number=output_config.get('draw_bbox_number', True),
                is_pdf=is_pdf
            )
            output_paths['layout_images'] = layout_paths

        if output_config.get('save_ocr_image', False):
            ocr_paths = VisualizationUtils.save_ocr_images(
                results, doc_output_dir, doc_name, is_pdf=is_pdf
            )
            output_paths['ocr_images'] = ocr_paths

        logger.info(f"✅ All results saved to: {doc_output_dir}")
        return output_paths
+
+
+# ==================== 便捷函数 ====================
+
def save_mineru_format(
    results: Dict[str, Any],
    output_dir: str,
    output_config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Convenience wrapper: persist pipeline results in MinerU format.

    Args:
        results: pipeline processing results
        output_dir: output directory
        output_config: output options; supported keys:
            - create_subdir: create a per-document subdirectory (default False)
            - save_json: save middle.json
            - save_page_json: save per-page JSON
            - save_markdown: save the full Markdown
            - save_page_markdown: save per-page Markdown
            - save_html: save table HTML
            - save_layout_image: save layout visualizations
            - save_ocr_image: save OCR visualizations
            - normalize_numbers: normalize financial numbers (full->half width)

    Returns:
        Mapping of output kinds to file paths
    """
    if output_config is None:
        # Defaults: write straight into output_dir, produce every textual
        # artifact, skip debug images, and normalize numbers.
        output_config = {
            'create_subdir': False,
            'save_json': True,
            'save_page_json': True,
            'save_markdown': True,
            'save_page_markdown': True,
            'save_html': True,
            'save_layout_image': False,
            'save_ocr_image': False,
            'normalize_numbers': True,
        }

    return OutputFormatterV2(output_dir).save_results(results, output_config)
+

+ 223 - 0
ocr_utils/pdf_extractor.py

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+"""
+PDF页面提取工具
+
+从PDF文件中提取指定页面并保存为新PDF文件。
+
+使用方法:
+    python pdf_extractor.py input.pdf --pages "1-5,7,9-12" --output output.pdf
+    python pdf_extractor.py input.pdf --pages "1-" --output output.pdf  # 提取第1页到最后
+    python pdf_extractor.py input.pdf --pages "-10" --output output.pdf  # 提取前10页
+"""
+import argparse
+import sys
+from pathlib import Path
+from typing import List
+import io
+
+try:
+    import pypdfium2 as pdfium
+    PDFIUM_AVAILABLE = True
+except ImportError:
+    PDFIUM_AVAILABLE = False
+    pdfium = None
+
+from loguru import logger
+from .pdf_utils import PDFUtils
+
+
def extract_pdf_pages(
    input_pdf_path: Path,
    page_indices: List[int],
    output_pdf_path: Path
) -> bool:
    """
    Extract the given pages from a PDF and save them as a new PDF.

    Args:
        input_pdf_path: Path to the source PDF file.
        page_indices: 0-based page indices to extract (deduplicated and
            sorted internally; out-of-range indices are skipped with a warning).
        output_pdf_path: Path of the PDF file to write.

    Returns:
        True on success, False otherwise (errors are logged, not raised).
    """
    if not PDFIUM_AVAILABLE:
        logger.error("❌ pypdfium2 未安装,请先安装: pip install pypdfium2")
        return False

    if not input_pdf_path.exists():
        logger.error(f"❌ 输入文件不存在: {input_pdf_path}")
        return False

    if not input_pdf_path.suffix.lower() == '.pdf':
        logger.error(f"❌ 输入文件不是PDF格式: {input_pdf_path}")
        return False

    pdf = None
    output_pdf = None
    try:
        # Load the source PDF from bytes.
        with open(input_pdf_path, 'rb') as f:
            pdf_bytes = f.read()
        pdf = pdfium.PdfDocument(pdf_bytes)
        total_pages = len(pdf)

        if total_pages == 0:
            logger.error("❌ PDF文件为空")
            return False

        # Validate page indices: deduplicate, sort, drop out-of-range entries.
        valid_indices = []
        for idx in sorted(set(page_indices)):
            if 0 <= idx < total_pages:
                valid_indices.append(idx)
            else:
                logger.warning(f"⚠️  页面索引 {idx + 1} 超出范围(总页数: {total_pages}),已跳过")

        if not valid_indices:
            logger.error("❌ 没有有效的页面可提取")
            return False

        # Build the output document by importing pages one at a time so a
        # single failing page does not abort the whole extraction.
        output_pdf = pdfium.PdfDocument.new()
        success_count = 0
        for page_idx in valid_indices:
            try:
                output_pdf.import_pages(pdf, pages=[page_idx])
                success_count += 1
            except Exception as e:
                logger.warning(f"⚠️  导入第 {page_idx + 1} 页失败: {e},已跳过")
                continue

        if success_count == 0:
            logger.error("❌ 没有成功导入任何页面")
            return False

        output_pdf_path.parent.mkdir(parents=True, exist_ok=True)

        # Serialize to an in-memory buffer first, then write atomically-ish.
        output_buffer = io.BytesIO()
        output_pdf.save(output_buffer)
        with open(output_pdf_path, 'wb') as f:
            f.write(output_buffer.getvalue())

        logger.info(f"✅ 成功提取 {success_count} 页到: {output_pdf_path}")
        logger.info(f"   提取的页面: {', '.join([str(idx + 1) for idx in valid_indices])}")
        return True

    except Exception as e:
        logger.error(f"❌ 提取PDF页面时出错: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return False
    finally:
        # Fix: previously the documents were closed only on a few branches and
        # leaked on exception paths; always release pdfium resources here.
        if output_pdf is not None:
            output_pdf.close()
        if pdf is not None:
            pdf.close()
+
+
def main():
    """Command-line entry point: parse args, resolve paths, run extraction."""
    parser = argparse.ArgumentParser(
        description='从PDF文件中提取指定页面并保存为新PDF文件',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 提取第1-5页和第7页
  python pdf_extractor.py --input input.pdf --pages "1-5,7" --output output.pdf
  
  # 提取第1页到最后
  python pdf_extractor.py --input input.pdf --pages "1-" --output output.pdf
  
  # 提取前10页
  python pdf_extractor.py --input input.pdf --pages "-10" --output output.pdf
  
  # 提取单页
  python pdf_extractor.py --input input.pdf --pages "3" --output output.pdf
        """
    )

    parser.add_argument(
        '--input', '-i',
        type=str,
        # Fix: was optional — omitting it crashed later with Path(None)
        # (TypeError) instead of a proper argparse usage error.
        required=True,
        help='输入PDF文件路径'
    )

    parser.add_argument(
        '--pages', '-p',
        type=str,
        required=True,
        help='要提取的页面范围,支持格式: "1-5,7,9-12", "1-", "-10", "3"'
    )

    parser.add_argument(
        '--output', '-o',
        type=str,
        required=True,
        help='输出PDF文件路径'
    )

    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    output_path = Path(args.output).resolve()

    # If the output path is a directory (or lacks a .pdf suffix), derive a
    # file name from the input file's stem.
    if output_path.is_dir() or not output_path.suffix.lower() == '.pdf':
        output_path = output_path / f"{input_path.stem}_extracted.pdf"

    if not PDFIUM_AVAILABLE:
        logger.error("❌ pypdfium2 未安装,请先安装: pip install pypdfium2")
        sys.exit(1)

    # Open the PDF once up front to learn the total page count, which
    # parse_page_range needs to resolve open-ended ranges like "1-".
    try:
        with open(input_path, 'rb') as f:
            pdf_bytes = f.read()
        pdf = pdfium.PdfDocument(pdf_bytes)
        total_pages = len(pdf)
        pdf.close()
    except Exception as e:
        logger.error(f"❌ 无法读取PDF文件: {e}")
        sys.exit(1)

    page_set = PDFUtils.parse_page_range(args.pages, total_pages)
    page_indices = sorted(list(page_set))

    if not page_indices:
        logger.error(f"❌ 页面范围 '{args.pages}' 没有匹配到任何有效页面(总页数: {total_pages})")
        sys.exit(1)

    logger.info(f"📋 PDF总页数: {total_pages}")
    logger.info(f"📋 要提取的页面: {args.pages} → {len(page_indices)} 页")

    success = extract_pdf_pages(input_path, page_indices, output_path)

    if success:
        logger.info(f"✅ 提取完成!输出文件: {output_path}")
        sys.exit(0)
    else:
        logger.error("❌ 提取失败")
        sys.exit(1)
+
+
if __name__ == '__main__':
    # main() always terminates via sys.exit() itself, so this outer
    # sys.exit(main()) is normally never reached with a value; it is a
    # harmless fallback (exit code 0) if main() ever returns.
    sys.exit(main())
+

+ 294 - 0
ocr_utils/pdf_utils.py

@@ -0,0 +1,294 @@
+"""
+PDF处理工具模块
+
+提供PDF相关处理功能:
+- PDF加载与分类
+- PDF文本提取
+- 跨页表格合并
+- 页面范围解析与过滤
+"""
+from typing import Dict, List, Any, Optional, Tuple, Set
+from pathlib import Path
+from PIL import Image
+from loguru import logger
+import re
+
# Import MinerU components; degrade gracefully when they are missing.
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.pdf_image_tools import load_images_from_pdf
    from mineru.utils.enum_class import ImageType
    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
    MINERU_AVAILABLE = True
except ImportError:
    # Fix: the original `except ImportError: raise ImportError(...)` made
    # MINERU_AVAILABLE unreachable as False, so every downstream
    # `if not MINERU_AVAILABLE` / `pdf_get_page_text is None` check was dead
    # and importing this module without MinerU crashed even for image-only
    # use. Mirror the optional-dependency pattern used by pdf_extractor.py:
    # set the flag and stub the names; PDF-specific entry points raise later.
    MINERU_AVAILABLE = False
    pdf_classify = None
    load_images_from_pdf = None
    ImageType = None
    pdf_get_page_text = None
+
class PDFUtils:
    """PDF processing utilities.

    Provides:
    - page-range parsing and filtering
    - document loading with OCR/text-layer classification
    - direct text extraction from a PDF's text layer
    - (placeholder) cross-page table merging
    """

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Parse a page-range string into a set of 0-based page indices.

        Supported formats (page numbers are 1-based):
        - "1-5"        -> {0, 1, 2, 3, 4}
        - "3"          -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-"         -> from page 1 to the last page
        - "-5"         -> from page 1 to page 5

        Args:
            page_range: Range string (1-based page numbers); None or empty
                selects all pages.
            total_pages: Total number of pages in the document.

        Returns:
            Set of selected page indices (0-based). Out-of-range pages are
            clamped/dropped; malformed parts are skipped with a warning.
        """
        if not page_range or not page_range.strip():
            return set(range(total_pages))

        pages = set()
        parts = page_range.replace(' ', '').split(',')

        for part in parts:
            part = part.strip()
            if not part:
                continue

            if '-' in part:
                # Range form: "a-b", "a-", or "-b".
                match = re.match(r'^(\d*)-(\d*)$', part)
                if match:
                    start_str, end_str = match.groups()
                    start = int(start_str) if start_str else 1
                    end = int(end_str) if end_str else total_pages

                    # Convert to a 0-based half-open range clamped to bounds.
                    start = max(0, start - 1)
                    end = min(total_pages, end)

                    pages.update(range(start, end))
                else:
                    # Fix: malformed ranges (e.g. "a-b") were silently dropped
                    # while malformed single pages logged a warning — warn here
                    # too for consistency.
                    logger.warning(f"Invalid page range: {part}")
            else:
                # Single page form: "3".
                try:
                    page_num = int(part)
                    if 1 <= page_num <= total_pages:
                        pages.add(page_num - 1)  # convert to 0-based index
                except ValueError:
                    logger.warning(f"Invalid page number: {part}")

        return pages

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None
    ) -> Tuple[List[Dict], str, Optional[Any]]:
        """
        Load a document and classify it, with optional page-range filtering.

        Args:
            document_path: Path to a PDF file, a single image, or a directory
                of images.
            dpi: Rendering DPI for PDF pages.
            page_range: Range string such as "1-5,7,9-12".
                - PDF: by page number (1-based)
                - image directory: by position after sorting file names (1-based)

        Returns:
            (images_list, pdf_type, pdf_doc)
            - images_list: one dict per page with keys
              {'img_pil': PIL.Image, 'scale': float, 'source_path': str,
               'page_idx': int, and 'page_name' for image inputs}
            - pdf_type: 'ocr' or 'txt'
            - pdf_doc: the PDF document object (PDF input only, else None)

        Raises:
            RuntimeError: if a PDF is given but MinerU components are unavailable.
            ValueError: for unsupported file extensions.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # default to OCR mode
        all_images = []

        if document_path.is_dir():
            # Directory input: collect image files in sorted name order.
            image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
            image_files = sorted([
                f for f in document_path.iterdir() 
                if f.suffix.lower() in image_extensions
            ])

            total_pages = len(image_files)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                img = Image.open(img_file)
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                all_images.append({
                    'img_pil': img,
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,  # original position index
                    'page_name': img_file.stem  # file name without extension
                })

            pdf_type = 'ocr'  # image directories always use OCR mode

        elif document_path.suffix.lower() == '.pdf':
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            with open(document_path, 'rb') as f:
                pdf_bytes = f.read()

            # Decide between text-layer extraction ('txt') and OCR ('ocr').
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # Render pages to PIL images.
            images_list, pdf_doc = load_images_from_pdf(
                pdf_bytes, 
                dpi=dpi,
                image_type=ImageType.PIL
            )

            total_pages = len(images_list)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to the nominal render scale when absent.
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx  # original page index
                })

        elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
            # Single image input.
            img = Image.open(document_path)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            all_images.append({
                'img_pil': img,
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })
            pdf_type = 'ocr'

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """
        Extract text directly from the PDF text layer (via MinerU / pypdfium2).

        Args:
            pdf_doc: pypdfium2 PdfDocument object.
            page_idx: Page index.
            bbox: Target region bbox in image coordinates.
            scale: Image-to-PDF scale factor (image coords / scale = PDF coords).

        Returns:
            (text, success) — success is False when extraction fails or yields
            no text; errors are logged at debug level, never raised.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            logger.debug("MinerU pdf_text_tool not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Convert the image-space bbox to PDF coordinates.
            pdf_bbox = [
                bbox[0] / scale,
                bbox[1] / scale,
                bbox[2] / scale,
                bbox[3] / scale
            ]

            # Page text structure as produced by MinerU's pdf_text_tool.
            page_dict = pdf_get_page_text(page)

            # Collect text from lines whose bbox overlaps the target region.
            text_parts = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    line_bbox = line.get('bbox')
                    if line_bbox and hasattr(line_bbox, 'bbox'):
                        line_bbox = line_bbox.bbox  # pdftext BBox object
                    elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
                        line_bbox = list(line_bbox)
                    else:
                        continue

                    if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
                        for span in line.get('spans', []):
                            span_text = span.get('text', '')
                            if span_text:
                                text_parts.append(span_text)

            text = ' '.join(text_parts)
            return text.strip(), bool(text.strip())

        except Exception as e:
            import traceback
            logger.debug(f"PDF text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Return True when two [x0, y0, x1, y1] boxes overlap (edges touching counts)."""
        if len(bbox1) < 4 or len(bbox2) < 4:
            return False

        x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
        x1_2, y1_2, x2_2, y2_2 = bbox2[:4]

        # Separated horizontally or vertically -> no overlap.
        if x2_1 < x1_2 or x2_2 < x1_1:
            return False
        if y2_1 < y1_2 or y2_2 < y1_1:
            return False

        return True

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge tables that span page boundaries.

        TODO: not implemented yet — currently a pass-through. A future
        implementation can follow MinerU's cross_page_table_merge.

        Args:
            results: Processing results dictionary.

        Returns:
            The results, unchanged for now.
        """
        # TODO: implement cross-page table merging
        return results
+

+ 436 - 0
ocr_utils/visualization_utils.py

@@ -0,0 +1,436 @@
+"""
+可视化工具模块
+
+提供文档处理结果的可视化功能:
+- Layout 布局可视化
+- OCR 结果可视化
+- 图片元素保存
+"""
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import cv2
+from loguru import logger
+
+
class VisualizationUtils:
    """Visualization helpers for document-processing results.

    Provides:
    - layout visualization (colored boxes per element type)
    - OCR result visualization (text boxes, table cells, discarded blocks)
    - saving of extracted image elements to disk
    """

    # Color map (kept in sync with MinerU BlockType / EnhancedDocPipeline categories).
    COLOR_MAP = {
        # Text-like elements (TEXT_CATEGORIES)
        'title': (102, 102, 255),           # blue
        'text': (153, 0, 76),               # dark red
        'ocr_text': (153, 0, 76),           # dark red (same as text)
        'low_score_text': (200, 100, 100),  # light red
        'header': (128, 128, 128),          # gray
        'footer': (128, 128, 128),          # gray
        'page_number': (160, 160, 160),     # light gray
        'ref_text': (180, 180, 180),        # light gray
        'aside_text': (180, 180, 180),      # light gray
        'page_footnote': (200, 200, 200),   # light gray
        
        # Table-related elements
        'table': (204, 204, 0),             # yellow
        'table_body': (204, 204, 0),        # yellow
        'table_caption': (255, 255, 102),   # light yellow
        'table_footnote': (229, 255, 204),  # pale yellow-green
        
        # Image-related elements
        'image': (153, 255, 51),            # green
        'image_body': (153, 255, 51),       # green
        'figure': (153, 255, 51),           # green
        'image_caption': (102, 178, 255),   # light blue
        'image_footnote': (255, 178, 102),  # orange
        
        # Formula elements
        'interline_equation': (0, 255, 0),  # bright green
        'inline_equation': (0, 200, 0),     # green
        'equation': (0, 220, 0),            # green
        'interline_equation_yolo': (0, 180, 0),
        'interline_equation_number': (0, 160, 0),
        
        # Code elements
        'code': (102, 0, 204),              # purple
        'code_body': (102, 0, 204),         # purple
        'code_caption': (153, 51, 255),     # light purple
        'algorithm': (128, 0, 255),         # purple
        
        # List elements
        'list': (40, 169, 92),              # teal green
        'index': (60, 180, 100),            # teal green
        
        # Discarded elements
        'abandon': (100, 100, 100),         # dark gray
        'discarded': (100, 100, 100),       # dark gray
        
        # Errors
        'error': (255, 0, 0),               # red
    }
    
    # OCR box colors
    OCR_BOX_COLOR = (0, 255, 0)      # green
    CELL_BOX_COLOR = (255, 165, 0)   # orange
    DISCARD_COLOR = (128, 128, 128)  # gray
    
    @staticmethod
    def save_image_elements(
        results: Dict[str, Any],
        images_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save extracted image elements to disk and rewrite their content paths.

        Naming convention:
        - PDF input: <doc>_page_001_image_1.png
        - single-image input: <doc>_image_1.png

        Side effect: on each saved element, content['image_path'] is set to
        the file name and the in-memory 'image_data' is removed.

        Args:
            results: Processing results (expects results['pages'][i]['elements']).
            images_dir: Directory to write images into (must exist).
            doc_name: Document name used as the file-name prefix.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        saved_paths = []
        image_count = 0
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            
            for element in page.get('elements', []):
                if element.get('type') in ['image', 'image_body', 'figure']:
                    content = element.get('content', {})
                    image_data = content.get('image_data')
                    
                    if image_data is not None:
                        image_count += 1
                        
                        # Multi-page inputs get a page component in the name.
                        if is_pdf or total_pages > 1:
                            image_filename = f"{doc_name}_page_{page_idx + 1}_image_{image_count}.png"
                        else:
                            image_filename = f"{doc_name}_image_{image_count}.png"
                        
                        image_path = images_dir / image_filename
                        
                        try:
                            if isinstance(image_data, np.ndarray):
                                # NOTE(review): cv2.imwrite assumes BGR channel
                                # order — confirm image_data is BGR here,
                                # otherwise saved colors are swapped.
                                cv2.imwrite(str(image_path), image_data)
                            else:
                                Image.fromarray(image_data).save(image_path)
                            
                            # Replace raw pixels with the relative file name.
                            content['image_path'] = image_filename
                            content.pop('image_data', None)
                            
                            saved_paths.append(str(image_path))
                            logger.debug(f"🖼️ Image saved: {image_path}")
                        except Exception as e:
                            logger.warning(f"Failed to save image: {e}")
        
        if image_count > 0:
            logger.info(f"🖼️ {image_count} images saved to: {images_dir}")
        
        return saved_paths
    
    @staticmethod
    def save_layout_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        draw_type_label: bool = True,
        draw_bbox_number: bool = True,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save layout visualization images (one per page).

        Naming convention:
        - PDF input: <doc>_page_001_layout.png
        - single-image input: <doc>_layout.png

        Args:
            results: Processing results.
            output_dir: Output directory (must exist).
            doc_name: Document name used as the file-name prefix.
            draw_type_label: Draw the element-type label in each box.
            draw_bbox_number: Draw the element's ordinal number in each box.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        layout_paths = []
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            # Prefer the untouched original image; fall back to the processed one.
            processed_image = page.get('original_image')
            if processed_image is None:
                processed_image = page.get('processed_image')
            
            if processed_image is None:
                logger.warning(f"Page {page_idx}: No image data found for layout visualization")
                continue
            
            if isinstance(processed_image, np.ndarray):
                image = Image.fromarray(processed_image).convert('RGB')
            elif isinstance(processed_image, Image.Image):
                image = processed_image.convert('RGB')
            else:
                continue
            
            draw = ImageDraw.Draw(image, 'RGBA')
            font = VisualizationUtils._get_font(14)
            
            # Draw regular elements: translucent fill + border + labels.
            for idx, element in enumerate(page.get('elements', []), 1):
                elem_type = element.get('type', '')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                color = VisualizationUtils.COLOR_MAP.get(elem_type, (255, 0, 0))
                
                # Translucent fill via alpha compositing; the composite creates
                # a new image, so the Draw handle must be recreated afterwards.
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*color, 50))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # Border
                draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
                
                # Type label (top-left corner of the box)
                if draw_type_label:
                    label = elem_type.replace('_', ' ').title()
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=color)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
                
                # Ordinal number (top-right corner of the box)
                if draw_bbox_number:
                    number_text = str(idx)
                    bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
                    draw.rectangle(bbox_number, fill=(255, 0, 0))
                    draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
            
            # Draw discarded elements in a muted gray style.
            for idx, element in enumerate(page.get('discarded_blocks', []), 1):
                original_category = element.get('original_category', 'unknown')
                bbox = element.get('bbox', [0, 0, 0, 0])
                
                if len(bbox) < 4:
                    continue
                
                x0, y0, x1, y1 = map(int, bbox[:4])
                
                # Translucent gray fill
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                overlay_draw.rectangle([x0, y0, x1, y1], fill=(*VisualizationUtils.DISCARD_COLOR, 30))
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                draw = ImageDraw.Draw(image)
                
                # Gray border
                draw.rectangle([x0, y0, x1, y1], outline=VisualizationUtils.DISCARD_COLOR, width=1)
                
                # Type label, prefixed with "D:" for discarded
                if draw_type_label:
                    label = f"D:{original_category}"
                    bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
                    draw.rectangle(bbox_label, fill=VisualizationUtils.DISCARD_COLOR)
                    draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
            
            # Multi-page inputs get a page component in the name.
            if is_pdf or total_pages > 1:
                layout_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_layout.png"
            else:
                layout_path = output_dir / f"{doc_name}_layout.png"
            
            image.save(layout_path)
            layout_paths.append(str(layout_path))
            logger.info(f"🖼️ Layout image saved: {layout_path}")
        
        return layout_paths
    
    @staticmethod
    def save_ocr_images(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str,
        is_pdf: bool = True
    ) -> List[str]:
        """
        Save OCR visualization images (one per page).

        Naming convention:
        - PDF input: <doc>_page_001_ocr.png
        - single-image input: <doc>_ocr.png

        Args:
            results: Processing results.
            output_dir: Output directory (must exist).
            doc_name: Document name used as the file-name prefix.
            is_pdf: Whether the input was a PDF.

        Returns:
            List of saved image file paths.
        """
        ocr_paths = []
        total_pages = len(results.get('pages', []))
        
        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)
            # Prefer the untouched original image; fall back to the processed one.
            processed_image = page.get('original_image')
            if processed_image is None:
                processed_image = page.get('processed_image')
            
            if processed_image is None:
                logger.warning(f"Page {page_idx}: No image data found for OCR visualization")
                continue
            
            if isinstance(processed_image, np.ndarray):
                image = Image.fromarray(processed_image).convert('RGB')
            elif isinstance(processed_image, Image.Image):
                image = processed_image.convert('RGB')
            else:
                continue
            
            draw = ImageDraw.Draw(image)
            font = VisualizationUtils._get_font(10)
            
            for element in page.get('elements', []):
                content = element.get('content', {})
                
                # Per-line OCR text boxes
                ocr_details = content.get('ocr_details', [])
                for ocr_item in ocr_details:
                    ocr_bbox = ocr_item.get('bbox', [])
                    if ocr_bbox:
                        VisualizationUtils._draw_polygon(
                            draw, ocr_bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
                        )
                
                # Table cells, annotated with a short text preview
                cells = content.get('cells', [])
                for cell in cells:
                    cell_bbox = cell.get('bbox', [])
                    if cell_bbox and len(cell_bbox) >= 4:
                        x0, y0, x1, y1 = map(int, cell_bbox[:4])
                        draw.rectangle(
                            [x0, y0, x1, y1], 
                            outline=VisualizationUtils.CELL_BOX_COLOR, 
                            width=2
                        )
                        
                        cell_text = cell.get('text', '')[:10]  # preview only
                        if cell_text:
                            draw.text(
                                (x0 + 2, y0 + 2), 
                                cell_text, 
                                fill=VisualizationUtils.CELL_BOX_COLOR, 
                                font=font
                            )
                
                # Raw OCR boxes
                ocr_boxes = content.get('ocr_boxes', [])
                for ocr_box in ocr_boxes:
                    bbox = ocr_box.get('bbox', [])
                    if bbox:
                        VisualizationUtils._draw_polygon(
                            draw, bbox, VisualizationUtils.OCR_BOX_COLOR, width=1
                        )
            
            # OCR boxes of discarded elements, drawn in gray.
            for element in page.get('discarded_blocks', []):
                bbox = element.get('bbox', [0, 0, 0, 0])
                content = element.get('content', {})
                
                if len(bbox) >= 4:
                    x0, y0, x1, y1 = map(int, bbox[:4])
                    draw.rectangle(
                        [x0, y0, x1, y1], 
                        outline=VisualizationUtils.DISCARD_COLOR, 
                        width=1
                    )
                    
                    ocr_details = content.get('ocr_details', [])
                    for ocr_item in ocr_details:
                        ocr_bbox = ocr_item.get('bbox', [])
                        if ocr_bbox:
                            VisualizationUtils._draw_polygon(
                                draw, ocr_bbox, VisualizationUtils.DISCARD_COLOR, width=1
                            )
            
            # Multi-page inputs get a page component in the name.
            if is_pdf or total_pages > 1:
                ocr_path = output_dir / f"{doc_name}_page_{page_idx + 1:03d}_ocr.png"
            else:
                ocr_path = output_dir / f"{doc_name}_ocr.png"
            
            image.save(ocr_path)
            ocr_paths.append(str(ocr_path))
            logger.info(f"🖼️ OCR image saved: {ocr_path}")
        
        return ocr_paths
    
    @staticmethod
    def _draw_polygon(
        draw: ImageDraw.Draw,
        bbox: List,
        color: Tuple[int, int, int],
        width: int = 1
    ):
        """
        Draw either a polygon (list of points) or an axis-aligned rectangle.

        Args:
            draw: ImageDraw object to draw on.
            bbox: Either a list of (x, y) points (closed automatically) or a
                flat [x0, y0, x1, y1] rectangle.
            color: RGB outline color.
            width: Line width.
        """
        if isinstance(bbox[0], (list, tuple)):
            # Polygon: close the outline by repeating the first point.
            points = [(int(p[0]), int(p[1])) for p in bbox]
            points.append(points[0])
            draw.line(points, fill=color, width=width)
        elif len(bbox) >= 4:
            x0, y0, x1, y1 = map(int, bbox[:4])
            draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    
    @staticmethod
    def _get_font(size: int) -> ImageFont.FreeTypeFont:
        """
        Load a TrueType font, trying common macOS/Linux paths, with a
        bitmap-default fallback.

        Args:
            size: Font size in points.

        Returns:
            A usable font object (FreeType font or PIL's built-in default).
        """
        font_paths = [
            "/System/Library/Fonts/Helvetica.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
        ]
        
        for font_path in font_paths:
            try:
                return ImageFont.truetype(font_path, size)
            except OSError:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. ImageFont.truetype raises
                # OSError when the font file cannot be opened or read.
                continue
        
        return ImageFont.load_default()
+