SHA1
--- a/ocr_tools/universal_doc_parser/config/bank_statement_wired_unet.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_wired_unet.yaml
@@ -57,13 +57,14 @@ table_recognition_wired:
 
				 
			
 
				 output:
			
 
				   create_subdir: false
			
 
				+  save_pdf_images: true
			
 
				   save_json: true
			
 
				   save_page_json: true
			
 
				   save_markdown: true
			
 
				   save_page_markdown: true
			
 
				   save_html: true
			
 
				-  save_layout_image: false
			
 
				-  save_ocr_image: false
			
 
				+  save_layout_image: true
			
 
				+  save_ocr_image: true
			
 
				   draw_type_label: true
			
 
				   draw_bbox_number: true
			
 
				   save_enhanced_json: true
			
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
@@ -201,7 +201,11 @@ class EnhancedDocPipeline:
 
				             处理结果字典
			
 
				         """
			
 
				         doc_path = Path(document_path)
			
 
				+        doc_name = doc_path.stem
			
 
				         
			
 
				+        # 判断输入类型
			
 
				+        is_pdf = doc_path.suffix.lower() == '.pdf'        
			
 
				+
			
 
				         results = {
			
 
				             'scene': self.scene_name,
			
 
				             'document_path': str(doc_path),
			
@@ -219,6 +223,11 @@ class EnhancedDocPipeline:
 
				             results['metadata']['pdf_type'] = pdf_type
			
 
				             results['metadata']['page_count'] = len(images)
			
 
				             results['metadata']['page_range'] = page_range
			
 
				+
			
 
				+            # 保存文档元数据
			
 
				+            self.doc_name = doc_name
			
 
				+            self.is_pdf = is_pdf
			
 
				+            self.total_pages = len(images)            
			
 
				             
			
 
				             logger.info(f"📄 Loaded {len(images)} pages, type: {pdf_type}")
			
 
				             
			
@@ -227,6 +236,12 @@ class EnhancedDocPipeline:
 
				                 # 使用原始页码索引（支持页面范围过滤）
			
 
				                 page_idx = image_dict.get('page_idx', idx)
			
 
				                 page_name = image_dict.get('page_name', f'page_{page_idx + 1:03d}')
			
 
				+                # 如果定义将pdf输出图片，则保存
			
 
				+                if is_pdf and self.config.get('output', {}).get('save_pdf_images', False):
			
 
				+                    image_path = Path(output_dir).resolve() / f"{doc_name}" / f"{doc_name}_page_{page_idx + 1:03d}.png"
			
 
				+                    image_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+                    image_dict.get('img_pil').save(image_path)
			
 
				+                
			
 
				                 logger.info(f"🔍 Processing page {idx + 1}/{len(images)} (original index: {page_idx})")
			
 
				                 
			
 
				                 page_result = self._process_single_page(
			
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py
@@ -148,6 +148,11 @@ class StreamingDocPipeline(EnhancedDocPipeline):
 
				             for idx, image_dict in enumerate(images):
			
 
				                 page_idx = image_dict.get('page_idx', idx)
			
 
				                 page_name = image_dict.get('page_name', f'page_{page_idx + 1:03d}')
			
 
				+                # 如果定义将pdf输出图片，则保存
			
 
				+                if is_pdf and output_config.get('save_pdf_images', False):
			
 
				+                    image_path = self.output_dir / f"{doc_name}" / f"{doc_name}_page_{page_idx + 1:03d}.png"
			
 
				+                    image_path.parent.mkdir(parents=True, exist_ok=True)
			
 
				+                    image_dict.get('img_pil').save(image_path)
			
 
				                 
			
 
				                 logger.info(f"🔍 Processing page {idx + 1}/{len(images)} (original index: {page_idx})")
			
 
				                 
			
--- a/ocr_utils/compare_pdf_renderers.py
+++ b/ocr_utils/compare_pdf_renderers.py
@@ -0,0 +1,261 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+对比分析 fitz (PyMuPDF) 和 pypdfium2 渲染 PDF 的差异
			
 
				+用于诊断 UNet 表格识别结果不一致的问题
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+from PIL import Image
			
 
				+import numpy as np
			
 
				+import cv2
			
 
				+
			
 
				+# 图片路径
			
 
				+FITZ_IMAGE = Path("/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png")
			
 
				+PYPDFIUM2_IMAGE = Path("/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/output/2023年度报告母公司/bank_statement_wired_unet/2023年度报告母公司/2023年度报告母公司_page_003.png")
			
 
				+
			
 
				+def analyze_image(image_path: Path, label: str):
			
 
				+    """分析图像的详细属性"""
			
 
				+    print(f"\n{'='*70}")
			
 
				+    print(f"分析: {label}")
			
 
				+    print(f"{'='*70}")
			
 
				+    
			
 
				+    if not image_path.exists():
			
 
				+        print(f"❌ 文件不存在: {image_path}")
			
 
				+        return None, None, None
			
 
				+    
			
 
				+    # 文件大小
			
 
				+    file_size = os.path.getsize(image_path) / 1024  # KB
			
 
				+    print(f"📦 文件大小: {file_size:.2f} KB")
			
 
				+    
			
 
				+    # PIL 加载
			
 
				+    img_pil = Image.open(image_path)
			
 
				+    print(f"📐 图像尺寸: {img_pil.size[0]} × {img_pil.size[1]} (宽×高)")
			
 
				+    print(f"🎨 颜色模式: {img_pil.mode}")
			
 
				+    print(f"📊 格式: {img_pil.format}")
			
 
				+    
			
 
				+    # 获取 DPI 信息
			
 
				+    dpi = img_pil.info.get('dpi', 'N/A')
			
 
				+    print(f"🔍 DPI 信息: {dpi}")
			
 
				+    
			
 
				+    # OpenCV 加载
			
 
				+    img_cv = cv2.imread(str(image_path))
			
 
				+    if img_cv is not None:
			
 
				+        print(f"📏 OpenCV 尺寸: {img_cv.shape[0]} × {img_cv.shape[1]} × {img_cv.shape[2]} (高×宽×通道)")
			
 
				+        print(f"📈 数据类型: {img_cv.dtype}")
			
 
				+        
			
 
				+        # 统计信息
			
 
				+        print(f"🔢 像素值范围: [{img_cv.min()}, {img_cv.max()}]")
			
 
				+        print(f"📊 平均值: {img_cv.mean():.2f}")
			
 
				+        print(f"📊 标准差: {img_cv.std():.2f}")
			
 
				+        
			
 
				+        # 检查是否有纯黑/纯白区域
			
 
				+        black_pixels = np.all(img_cv == 0, axis=-1).sum()
			
 
				+        white_pixels = np.all(img_cv == 255, axis=-1).sum()
			
 
				+        total_pixels = img_cv.shape[0] * img_cv.shape[1]
			
 
				+        print(f"⚫ 纯黑像素: {black_pixels} ({black_pixels/total_pixels*100:.2f}%)")
			
 
				+        print(f"⚪ 纯白像素: {white_pixels} ({white_pixels/total_pixels*100:.2f}%)")
			
 
				+    
			
 
				+    # NumPy 数组
			
 
				+    img_np = np.array(img_pil)
			
 
				+    print(f"🧮 NumPy shape: {img_np.shape}")
			
 
				+    print(f"🧮 NumPy dtype: {img_np.dtype}")
			
 
				+    
			
 
				+    return img_pil, img_cv, img_np
			
 
				+
			
 
				+def compare_images(img1_pil: Image.Image, img2_pil: Image.Image, label1: str, label2: str):
			
 
				+    """对比两张图像的差异"""
			
 
				+    print(f"\n{'='*70}")
			
 
				+    print(f"对比: {label1} vs {label2}")
			
 
				+    print(f"{'='*70}")
			
 
				+    
			
 
				+    # 尺寸对比
			
 
				+    if img1_pil.size == img2_pil.size:
			
 
				+        print(f"✅ 尺寸一致: {img1_pil.size[0]} × {img1_pil.size[1]}")
			
 
				+    else:
			
 
				+        print(f"❌ 尺寸不一致:")
			
 
				+        print(f"   {label1}: {img1_pil.size[0]} × {img1_pil.size[1]}")
			
 
				+        print(f"   {label2}: {img2_pil.size[0]} × {img2_pil.size[1]}")
			
 
				+        print(f"\n⚠️ 尺寸不同，无法进行像素级对比")
			
 
				+        return
			
 
				+    
			
 
				+    # 转换为 numpy 数组
			
 
				+    arr1 = np.array(img1_pil)
			
 
				+    arr2 = np.array(img2_pil)
			
 
				+    
			
 
				+    # 像素差异
			
 
				+    diff = np.abs(arr1.astype(np.float32) - arr2.astype(np.float32))
			
 
				+    
			
 
				+    print(f"\n📊 像素差异统计:")
			
 
				+    print(f"  最大差异: {diff.max():.2f} (0-255 范围)")
			
 
				+    print(f"  平均差异: {diff.mean():.2f}")
			
 
				+    print(f"  中位数差异: {np.median(diff):.2f}")
			
 
				+    print(f"  差异标准差: {diff.std():.2f}")
			
 
				+    
			
 
				+    # 相同像素百分比
			
 
				+    identical_pixels = np.all(arr1 == arr2, axis=-1).sum()
			
 
				+    total_pixels = arr1.shape[0] * arr1.shape[1]
			
 
				+    identical_ratio = identical_pixels / total_pixels * 100
			
 
				+    print(f"\n✓ 完全相同的像素: {identical_pixels:,} / {total_pixels:,} ({identical_ratio:.2f}%)")
			
 
				+    
			
 
				+    # 差异分布
			
 
				+    diff_1px = np.sum(np.any(diff <= 1, axis=-1))
			
 
				+    diff_5px = np.sum(np.any(diff <= 5, axis=-1))
			
 
				+    diff_10px = np.sum(np.any(diff <= 10, axis=-1))
			
 
				+    print(f"\n📈 差异分布 (有差异的像素):")
			
 
				+    print(f"  ≤ 1 灰度级: {diff_1px:,} ({diff_1px/total_pixels*100:.2f}%)")
			
 
				+    print(f"  ≤ 5 灰度级: {diff_5px:,} ({diff_5px/total_pixels*100:.2f}%)")
			
 
				+    print(f"  ≤ 10 灰度级: {diff_10px:,} ({diff_10px/total_pixels*100:.2f}%)")
			
 
				+    print(f"  > 10 灰度级: {(total_pixels - diff_10px):,} ({(total_pixels - diff_10px)/total_pixels*100:.2f}%)")
			
 
				+    
			
 
				+    # 颜色通道差异
			
 
				+    if len(arr1.shape) == 3:
			
 
				+        print(f"\n🎨 各颜色通道差异:")
			
 
				+        for i, channel in enumerate(['红色 (R)', '绿色 (G)', '蓝色 (B)']):
			
 
				+            channel_diff = np.abs(arr1[:,:,i].astype(np.float32) - arr2[:,:,i].astype(np.float32))
			
 
				+            print(f"  {channel}: 平均 {channel_diff.mean():.2f}, 最大 {channel_diff.max():.0f}")
			
 
				+    
			
 
				+    # 生成差异热图
			
 
				+    diff_map = diff.mean(axis=-1) if len(diff.shape) == 3 else diff
			
 
				+    max_diff_loc = np.unravel_index(diff_map.argmax(), diff_map.shape)
			
 
				+    print(f"\n🔥 最大差异位置:")
			
 
				+    print(f"  坐标: (y={max_diff_loc[0]}, x={max_diff_loc[1]})")
			
 
				+    print(f"  差异值: {diff_map[max_diff_loc]:.2f}")
			
 
				+    print(f"  {label1} 像素值: {arr1[max_diff_loc]}")
			
 
				+    print(f"  {label2} 像素值: {arr2[max_diff_loc]}")
			
 
				+    
			
 
				+    # SSIM 结构相似性
			
 
				+    try:
			
 
				+        from skimage.metrics import structural_similarity as ssim
			
 
				+        # 转换为灰度
			
 
				+        gray1 = cv2.cvtColor(arr1, cv2.COLOR_RGB2GRAY) if len(arr1.shape) == 3 else arr1
			
 
				+        gray2 = cv2.cvtColor(arr2, cv2.COLOR_RGB2GRAY) if len(arr2.shape) == 3 else arr2
			
 
				+        ssim_value = ssim(gray1, gray2)
			
 
				+        print(f"\n📏 SSIM 结构相似性: {ssim_value:.6f}")
			
 
				+        print(f"   (1.0 = 完全相同, >0.95 = 几乎相同, <0.9 = 有明显差异)")
			
 
				+    except ImportError:
			
 
				+        print(f"\n⚠️ 未安装 scikit-image，跳过 SSIM 计算")
			
 
				+        print(f"   安装: pip install scikit-image")
			
 
				+    
			
 
				+    # 保存差异图
			
 
				+    output_dir = Path(__file__).parent / "analysis_output"
			
 
				+    output_dir.mkdir(exist_ok=True)
			
 
				+    
			
 
				+    # 差异热图 (归一化到 0-255)
			
 
				+    diff_visual = (diff_map / diff_map.max() * 255).astype(np.uint8) if diff_map.max() > 0 else diff_map.astype(np.uint8)
			
 
				+    diff_colored = cv2.applyColorMap(diff_visual, cv2.COLORMAP_JET)
			
 
				+    cv2.imwrite(str(output_dir / "diff_heatmap.png"), diff_colored)
			
 
				+    
			
 
				+    # 保存原始差异图（未归一化）
			
 
				+    diff_raw = diff_map.astype(np.uint8)
			
 
				+    cv2.imwrite(str(output_dir / "diff_raw.png"), diff_raw)
			
 
				+    
			
 
				+    # 保存二值化差异（差异 > 5 的区域）
			
 
				+    diff_binary = (diff_map > 5).astype(np.uint8) * 255
			
 
				+    cv2.imwrite(str(output_dir / "diff_binary_5px.png"), diff_binary)
			
 
				+    
			
 
				+    print(f"\n💾 差异图已保存到: {output_dir}")
			
 
				+    print(f"   - diff_heatmap.png (彩色热图)")
			
 
				+    print(f"   - diff_raw.png (原始差异)")
			
 
				+    print(f"   - diff_binary_5px.png (差异>5的区域)")
			
 
				+
			
 
				+def analyze_rendering_differences():
			
 
				+    """分析渲染差异的根本原因"""
			
 
				+    print(f"\n{'='*70}")
			
 
				+    print("🔬 渲染差异根本原因分析")
			
 
				+    print(f"{'='*70}")
			
 
				+    
			
 
				+    print("""
			
 
				+## 主要差异来源：
			
 
				+
			
 
				+### 1. 抗锯齿算法 (Anti-aliasing)
			
 
				+   • PyMuPDF (fitz): 使用 MuPDF 渲染引擎，默认启用抗锯齿
			
 
				+   • pypdfium2: 使用 PDFium 渲染引擎（Chrome PDF 引擎）
			
 
				+   
			
 
				+   影响: 边缘平滑度不同，细线条的像素值会有 1-3 灰度级差异
			
 
				+
			
 
				+### 2. 颜色空间处理
			
 
				+   • PyMuPDF: MuPDF 内部颜色管理
			
 
				+   • pypdfium2: Chromium 颜色管理系统
			
 
				+   
			
 
				+   影响: RGB 值可能有 1-2 个灰度级的系统性偏差
			
 
				+
			
 
				+### 3. 字体渲染引擎
			
 
				+   • PyMuPDF: FreeType 字体渲染
			
 
				+   • pypdfium2: PDFium/Skia 字体渲染
			
 
				+   
			
 
				+   影响: 文字边缘、字形细节略有不同，影响 OCR 识别
			
 
				+
			
 
				+### 4. DPI 缩放算法
			
 
				+   • PyMuPDF: fitz.Matrix() 矩阵变换
			
 
				+   • pypdfium2: bitmap.render(scale=) 缩放
			
 
				+   
			
 
				+   影响: 插值算法不同，导致边缘像素值差异
			
 
				+
			
 
				+### 5. 尺寸限制策略
			
 
				+   • PyMuPDF: >4500px → 降为 72 DPI
			
 
				+   • pypdfium2: >3500px → 动态调整 scale
			
 
				+   
			
 
				+   影响: 大尺寸 PDF 可能产生不同分辨率的图像
			
 
				+
			
 
				+## 对 UNet 表格识别的影响：
			
 
				+
			
 
				+### 直接影响：
			
 
				+✗ 线条边缘抗锯齿差异 → UNet 检测线条位置有 1-2 像素偏移
			
 
				+✗ 文字清晰度差异 → 影响单元格文本区域识别
			
 
				+✗ 整体对比度差异 → 影响表格线检测阈值
			
 
				+
			
 
				+### 建议解决方案：
			
 
				+1. 统一渲染引擎: 全部使用 pypdfium2 (更稳定、更快)
			
 
				+2. 保存调试图像: 保存 UNet 输入图像以便排查
			
 
				+3. 调整检测阈值: 考虑渲染差异，适当放宽容差
			
 
				+4. 使用相同测试数据: 确保 test 和 production 使用同一渲染方法
			
 
				+""")
			
 
				+
			
 
				+def main():
			
 
				+    print("="*70)
			
 
				+    print("PDF 渲染引擎对比分析工具")
			
 
				+    print("fitz (PyMuPDF) vs pypdfium2")
			
 
				+    print("="*70)
			
 
				+    
			
 
				+    # 检查文件是否存在
			
 
				+    if not FITZ_IMAGE.exists():
			
 
				+        print(f"\n❌ fitz 图像不存在: {FITZ_IMAGE}")
			
 
				+        print(f"   请确保已使用 fitz 渲染 PDF 并保存图像")
			
 
				+        return 1
			
 
				+    
			
 
				+    if not PYPDFIUM2_IMAGE.exists():
			
 
				+        print(f"\n❌ pypdfium2 图像不存在: {PYPDFIUM2_IMAGE}")
			
 
				+        print(f"   请运行 pipeline 生成输出图像")
			
 
				+        return 1
			
 
				+    
			
 
				+    # 分析两张图片
			
 
				+    print("\n" + "🔍 第一步: 分析各自的图像属性")
			
 
				+    fitz_pil, fitz_cv, fitz_np = analyze_image(FITZ_IMAGE, "PyMuPDF (fitz)")
			
 
				+    
			
 
				+    if fitz_pil is None:
			
 
				+        return 1
			
 
				+    
			
 
				+    pypdfium2_pil, pypdfium2_cv, pypdfium2_np = analyze_image(PYPDFIUM2_IMAGE, "pypdfium2")
			
 
				+    
			
 
				+    if pypdfium2_pil is None:
			
 
				+        return 1
			
 
				+    
			
 
				+    # 对比差异
			
 
				+    print("\n" + "📊 第二步: 对比两张图像的差异")
			
 
				+    compare_images(fitz_pil, pypdfium2_pil, "PyMuPDF", "pypdfium2")
			
 
				+    
			
 
				+    # 分析根本原因
			
 
				+    print("\n" + "💡 第三步: 分析差异的根本原因")
			
 
				+    analyze_rendering_differences()
			
 
				+    
			
 
				+    print(f"\n{'='*70}")
			
 
				+    print("✅ 分析完成")
			
 
				+    print(f"{'='*70}")
			
 
				+    print(f"\n查看输出目录: {Path(__file__).parent / 'analysis_output'}")
			
 
				+    
			
 
				+    return 0
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    sys.exit(main())
			
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -124,10 +124,11 @@ class PDFUtils:
 
				             logger.info(f"📋 PDF classified as: {pdf_type}")
			
 
				             
			
 
				             # 加载图像
			
 
				-            images_list, pdf_doc = load_images_from_pdf(
			
 
				+            images_list, pdf_doc = load_images_from_pdf_unified(
			
 
				                 pdf_bytes, 
			
 
				                 dpi=dpi,
			
 
				-                image_type=ImageType.PIL
			
 
				+                image_type=ImageType.PIL,
			
 
				+                renderer='fitz'
			
 
				             )
			
 
				             
			
 
				             # 解析页面范围
			
@@ -266,3 +267,226 @@ class PDFUtils:
 
				         # TODO: 实现跨页表格合并逻辑
			
 
				         return results
			
 
				 
			
 
				+
			
 
				+# ============================================================================
			
 
				+# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
			
 
				+# ============================================================================
			
 
				+
			
 
				+def load_images_from_pdf_unified(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int = 200,
			
 
				+    start_page_id: int = 0,
			
 
				+    end_page_id: Optional[int] = None,
			
 
				+    image_type: str = "PIL",
			
 
				+    renderer: str = "pypdfium2",
			
 
				+    timeout: Optional[int] = None,
			
 
				+    threads: int = 4,
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """
			
 
				+    从 PDF 加载图像，支持两种渲染引擎
			
 
				+    
			
 
				+    Args:
			
 
				+        pdf_bytes: PDF 文件的字节数据
			
 
				+        dpi: 渲染 DPI，默认 200
			
 
				+        start_page_id: 起始页码（0-based），默认 0
			
 
				+        end_page_id: 结束页码（0-based，包含），默认 None（处理到最后）
			
 
				+        image_type: 返回图像类型，"PIL" 或 "BASE64"
			
 
				+        renderer: 渲染引擎选择
			
 
				+            - "pypdfium2": 使用 MinerU 标准的 pypdfium2（推荐）
			
 
				+              * 优势: Chrome PDFium 引擎，多进程加速，更好的细节保留
			
 
				+              * 尺寸限制: 3500px，超过则动态调整 scale
			
 
				+            - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
			
 
				+              * 优势: MuPDF 引擎，简单直接，无需额外依赖
			
 
				+              * 尺寸限制: 4500px，超过则降到 72 DPI
			
 
				+        timeout: 超时时间（秒），仅 pypdfium2 支持
			
 
				+        threads: 进程数，仅 pypdfium2 支持多进程加速（Windows 下自动禁用）
			
 
				+        
			
 
				+    Returns:
			
 
				+        (images_list, pdf_doc)
			
 
				+        - images_list: 图像列表，每个元素为 {'img_pil': PIL.Image, 'scale': float}
			
 
				+                      或 {'img_base64': str, 'scale': float}（取决于 image_type）
			
 
				+        - pdf_doc: PDF 文档对象（pypdfium2.PdfDocument 或 fitz.Document）
			
 
				+        
			
 
				+    Raises:
			
 
				+        ImportError: 如果选择的渲染引擎不可用
			
 
				+        ValueError: 如果参数无效
			
 
				+        TimeoutError: 如果转换超时（仅 pypdfium2）
			
 
				+    
			
 
				+    渲染引擎对比:
			
 
				+        ┌─────────────┬──────────────┬──────────────┐
			
 
				+        │   特性      │  pypdfium2   │    fitz      │
			
 
				+        ├─────────────┼──────────────┼──────────────┤
			
 
				+        │ 渲染引擎    │ Chrome PDFium│ MuPDF        │
			
 
				+        │ 多进程加速  │ ✅ (非Windows)│ ❌           │
			
 
				+        │ 超时控制    │ ✅           │ ❌           │
			
 
				+        │ 尺寸限制    │ 3500px       │ 4500px       │
			
 
				+        │ 超限处理    │ 动态调整scale│ 降到72 DPI   │
			
 
				+        │ 细节保留    │ 更好         │ 良好         │
			
 
				+        │ MinerU标准  │ ✅           │ ❌           │
			
 
				+        └─────────────┴──────────────┴──────────────┘
			
 
				+    
			
 
				+    示例:
			
 
				+        # 使用 pypdfium2（推荐，MinerU 标准）
			
 
				+        images, doc = load_images_from_pdf_unified(
			
 
				+            pdf_bytes, 
			
 
				+            dpi=200, 
			
 
				+            renderer="pypdfium2",
			
 
				+            threads=4
			
 
				+        )
			
 
				+        
			
 
				+        # 使用 PyMuPDF (fitz)
			
 
				+        images, doc = load_images_from_pdf_unified(
			
 
				+            pdf_bytes, 
			
 
				+            dpi=200, 
			
 
				+            renderer="fitz"
			
 
				+        )
			
 
				+        
			
 
				+        # 访问图像
			
 
				+        for img_dict in images:
			
 
				+            pil_image = img_dict['img_pil']
			
 
				+            scale = img_dict['scale']
			
 
				+            # 处理图像...
			
 
				+    
			
 
				+    注意事项:
			
 
				+        1. pypdfium2 在生产环境中更推荐，因为它是 MinerU 的标准实现
			
 
				+        2. 两种渲染引擎可能产生略有不同的图像（SSIM ≈ 0.945）
			
 
				+        3. 建议在同一项目中保持使用同一渲染引擎，避免不一致
			
 
				+        4. 如果需要与现有测试图像对比，使用相同的渲染引擎
			
 
				+    """
			
 
				+    renderer = renderer.lower()
			
 
				+    
			
 
				+    if renderer in ["pypdfium2", "pdfium"]:
			
 
				+        return _load_images_pypdfium2(
			
 
				+            pdf_bytes, dpi, start_page_id, end_page_id, 
			
 
				+            image_type, timeout, threads
			
 
				+        )
			
 
				+    elif renderer in ["fitz", "pymupdf", "mupdf"]:
			
 
				+        return _load_images_fitz(
			
 
				+            pdf_bytes, dpi, start_page_id, end_page_id, image_type
			
 
				+        )
			
 
				+    else:
			
 
				+        raise ValueError(
			
 
				+            f"不支持的渲染引擎: {renderer}. "
			
 
				+            f"请使用 'pypdfium2' 或 'fitz'"
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+def _load_images_pypdfium2(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int,
			
 
				+    start_page_id: int,
			
 
				+    end_page_id: Optional[int],
			
 
				+    image_type: str,
			
 
				+    timeout: Optional[int],
			
 
				+    threads: int
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """使用 pypdfium2 渲染引擎（MinerU 标准）"""
			
 
				+    try:
			
 
				+        import pypdfium2 as pdfium
			
 
				+        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
			
 
				+        from mineru.utils.enum_class import ImageType
			
 
				+    except ImportError as e:
			
 
				+        raise ImportError(
			
 
				+            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
			
 
				+            f"原始错误: {e}"
			
 
				+        )
			
 
				+    
			
 
				+    # 转换 image_type
			
 
				+    img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
			
 
				+    
			
 
				+    # 使用 MinerU 的实现
			
 
				+    images_list, pdf_doc = mineru_load_images(
			
 
				+        pdf_bytes=pdf_bytes,
			
 
				+        dpi=dpi,
			
 
				+        start_page_id=start_page_id,
			
 
				+        end_page_id=end_page_id,
			
 
				+        image_type=img_type,
			
 
				+        timeout=timeout,
			
 
				+        threads=threads
			
 
				+    )
			
 
				+    
			
 
				+    logger.info(
			
 
				+        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
			
 
				+        f"(DPI={dpi}, 多进程={threads})"
			
 
				+    )
			
 
				+    
			
 
				+    return images_list, pdf_doc
			
 
				+
			
 
				+
			
 
				+def _load_images_fitz(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int,
			
 
				+    start_page_id: int,
			
 
				+    end_page_id: Optional[int],
			
 
				+    image_type: str
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """使用 PyMuPDF (fitz) 渲染引擎"""
			
 
				+    try:
			
 
				+        import fitz
			
 
				+    except ImportError as e:
			
 
				+        raise ImportError(
			
 
				+            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
			
 
				+            f"原始错误: {e}"
			
 
				+        )
			
 
				+    
			
 
				+    from io import BytesIO
			
 
				+    import base64
			
 
				+    
			
 
				+    # 打开 PDF
			
 
				+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
			
 
				+    pdf_page_num = doc.page_count
			
 
				+    
			
 
				+    # 处理 end_page_id
			
 
				+    if end_page_id is None or end_page_id < 0:
			
 
				+        end_page_id = pdf_page_num - 1
			
 
				+    end_page_id = min(end_page_id, pdf_page_num - 1)
			
 
				+    
			
 
				+    # 渲染图像
			
 
				+    images_list = []
			
 
				+    mat = fitz.Matrix(dpi / 72, dpi / 72)
			
 
				+    
			
 
				+    for index in range(start_page_id, end_page_id + 1):
			
 
				+        page = doc[index]
			
 
				+        
			
 
				+        # 渲染为 pixmap
			
 
				+        pm = page.get_pixmap(matrix=mat, alpha=False)
			
 
				+        
			
 
				+        # 如果超过尺寸限制，降低到 72 DPI
			
 
				+        if pm.width > 4500 or pm.height > 4500:
			
 
				+            logger.warning(
			
 
				+                f"⚠️  页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
			
 
				+                f"降低到 72 DPI"
			
 
				+            )
			
 
				+            mat_fallback = fitz.Matrix(1, 1)  # 72 DPI
			
 
				+            pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
			
 
				+        
			
 
				+        # 转换为 PIL Image
			
 
				+        pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
			
 
				+        
			
 
				+        # 计算实际 scale
			
 
				+        page_rect = page.rect
			
 
				+        actual_scale = pm.width / page_rect.width
			
 
				+        
			
 
				+        # 构建返回字典
			
 
				+        image_dict = {
			
 
				+            'img_pil': pil_img,
			
 
				+            'scale': actual_scale
			
 
				+        }
			
 
				+        
			
 
				+        # 如果需要 BASE64
			
 
				+        if image_type.upper() == "BASE64":
			
 
				+            buffer = BytesIO()
			
 
				+            pil_img.save(buffer, format="JPEG")
			
 
				+            img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
			
 
				+            image_dict['img_base64'] = img_base64
			
 
				+            # 移除 img_pil 以节省内存
			
 
				+            del image_dict['img_pil']
			
 
				+        
			
 
				+        images_list.append(image_dict)
			
 
				+    
			
 
				+    logger.info(
			
 
				+        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
			
 
				+        f"(DPI={dpi}, 单进程)"
			
 
				+    )
			
 
				+    
			
 
				+    return images_list, doc
Аутор	SHA1 Порука	Датум
zhch158_admin	3263321e84 feat: 添加统一的PDF图像加载函数，支持多种渲染引擎	пре 4 часа
zhch158_admin	4e6c855b17 feat: 添加PDF渲染引擎对比分析工具，支持分析图像属性和差异	пре 4 часа
zhch158_admin	95e2272ed9 fix: 添加PDF文档处理时保存页面图像的功能	пре 4 часа
zhch158_admin	e21b57e051 fix: 更新输出配置，启用保存布局和OCR图像	пре 4 часа