6 maanden geleden · 3263321e84
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -124,10 +124,11 @@ class PDFUtils:
 
				             logger.info(f"📋 PDF classified as: {pdf_type}")
			
 
				             
			
 
				             # 加载图像
			
 
				-            images_list, pdf_doc = load_images_from_pdf(
			
 
				+            images_list, pdf_doc = load_images_from_pdf_unified(
			
 
				                 pdf_bytes, 
			
 
				                 dpi=dpi,
			
 
				-                image_type=ImageType.PIL
			
 
				+                image_type=ImageType.PIL,
			
 
				+                renderer='fitz'
			
 
				             )
			
 
				             
			
 
				             # 解析页面范围
			
@@ -266,3 +267,226 @@ class PDFUtils:
 
				         # TODO: 实现跨页表格合并逻辑
			
 
				         return results
			
 
				 
			
 
				+
			
 
				+# ============================================================================
			
 
				+# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
			
 
				+# ============================================================================
			
 
				+
			
 
				+def load_images_from_pdf_unified(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int = 200,
			
 
				+    start_page_id: int = 0,
			
 
				+    end_page_id: Optional[int] = None,
			
 
				+    image_type: str = "PIL",
			
 
				+    renderer: str = "pypdfium2",
			
 
				+    timeout: Optional[int] = None,
			
 
				+    threads: int = 4,
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """
			
 
				+    从 PDF 加载图像，支持两种渲染引擎
			
 
				+    
			
 
				+    Args:
			
 
				+        pdf_bytes: PDF 文件的字节数据
			
 
				+        dpi: 渲染 DPI，默认 200
			
 
				+        start_page_id: 起始页码（0-based），默认 0
			
 
				+        end_page_id: 结束页码（0-based，包含），默认 None（处理到最后）
			
 
				+        image_type: 返回图像类型，"PIL" 或 "BASE64"
			
 
				+        renderer: 渲染引擎选择
			
 
				+            - "pypdfium2": 使用 MinerU 标准的 pypdfium2（推荐）
			
 
				+              * 优势: Chrome PDFium 引擎，多进程加速，更好的细节保留
			
 
				+              * 尺寸限制: 3500px，超过则动态调整 scale
			
 
				+            - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
			
 
				+              * 优势: MuPDF 引擎，简单直接，无需额外依赖
			
 
				+              * 尺寸限制: 4500px，超过则降到 72 DPI
			
 
				+        timeout: 超时时间（秒），仅 pypdfium2 支持
			
 
				+        threads: 进程数，仅 pypdfium2 支持多进程加速（Windows 下自动禁用）
			
 
				+        
			
 
				+    Returns:
			
 
				+        (images_list, pdf_doc)
			
 
				+        - images_list: 图像列表，每个元素为 {'img_pil': PIL.Image, 'scale': float}
			
 
				+                      或 {'img_base64': str, 'scale': float}（取决于 image_type）
			
 
				+        - pdf_doc: PDF 文档对象（pypdfium2.PdfDocument 或 fitz.Document）
			
 
				+        
			
 
				+    Raises:
			
 
				+        ImportError: 如果选择的渲染引擎不可用
			
 
				+        ValueError: 如果参数无效
			
 
				+        TimeoutError: 如果转换超时（仅 pypdfium2）
			
 
				+    
			
 
				+    渲染引擎对比:
			
 
				+        ┌─────────────┬──────────────┬──────────────┐
			
 
				+        │   特性      │  pypdfium2   │    fitz      │
			
 
				+        ├─────────────┼──────────────┼──────────────┤
			
 
				+        │ 渲染引擎    │ Chrome PDFium│ MuPDF        │
			
 
				+        │ 多进程加速  │ ✅ (非Windows)│ ❌           │
			
 
				+        │ 超时控制    │ ✅           │ ❌           │
			
 
				+        │ 尺寸限制    │ 3500px       │ 4500px       │
			
 
				+        │ 超限处理    │ 动态调整scale│ 降到72 DPI   │
			
 
				+        │ 细节保留    │ 更好         │ 良好         │
			
 
				+        │ MinerU标准  │ ✅           │ ❌           │
			
 
				+        └─────────────┴──────────────┴──────────────┘
			
 
				+    
			
 
				+    示例:
			
 
				+        # 使用 pypdfium2（推荐，MinerU 标准）
			
 
				+        images, doc = load_images_from_pdf_unified(
			
 
				+            pdf_bytes, 
			
 
				+            dpi=200, 
			
 
				+            renderer="pypdfium2",
			
 
				+            threads=4
			
 
				+        )
			
 
				+        
			
 
				+        # 使用 PyMuPDF (fitz)
			
 
				+        images, doc = load_images_from_pdf_unified(
			
 
				+            pdf_bytes, 
			
 
				+            dpi=200, 
			
 
				+            renderer="fitz"
			
 
				+        )
			
 
				+        
			
 
				+        # 访问图像
			
 
				+        for img_dict in images:
			
 
				+            pil_image = img_dict['img_pil']
			
 
				+            scale = img_dict['scale']
			
 
				+            # 处理图像...
			
 
				+    
			
 
				+    注意事项:
			
 
				+        1. pypdfium2 在生产环境中更推荐，因为它是 MinerU 的标准实现
			
 
				+        2. 两种渲染引擎可能产生略有不同的图像（SSIM ≈ 0.945）
			
 
				+        3. 建议在同一项目中保持使用同一渲染引擎，避免不一致
			
 
				+        4. 如果需要与现有测试图像对比，使用相同的渲染引擎
			
 
				+    """
			
 
				+    renderer = renderer.lower()
			
 
				+    
			
 
				+    if renderer in ["pypdfium2", "pdfium"]:
			
 
				+        return _load_images_pypdfium2(
			
 
				+            pdf_bytes, dpi, start_page_id, end_page_id, 
			
 
				+            image_type, timeout, threads
			
 
				+        )
			
 
				+    elif renderer in ["fitz", "pymupdf", "mupdf"]:
			
 
				+        return _load_images_fitz(
			
 
				+            pdf_bytes, dpi, start_page_id, end_page_id, image_type
			
 
				+        )
			
 
				+    else:
			
 
				+        raise ValueError(
			
 
				+            f"不支持的渲染引擎: {renderer}. "
			
 
				+            f"请使用 'pypdfium2' 或 'fitz'"
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+def _load_images_pypdfium2(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int,
			
 
				+    start_page_id: int,
			
 
				+    end_page_id: Optional[int],
			
 
				+    image_type: str,
			
 
				+    timeout: Optional[int],
			
 
				+    threads: int
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """使用 pypdfium2 渲染引擎（MinerU 标准）"""
			
 
				+    try:
			
 
				+        import pypdfium2 as pdfium
			
 
				+        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
			
 
				+        from mineru.utils.enum_class import ImageType
			
 
				+    except ImportError as e:
			
 
				+        raise ImportError(
			
 
				+            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
			
 
				+            f"原始错误: {e}"
			
 
				+        )
			
 
				+    
			
 
				+    # 转换 image_type
			
 
				+    img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
			
 
				+    
			
 
				+    # 使用 MinerU 的实现
			
 
				+    images_list, pdf_doc = mineru_load_images(
			
 
				+        pdf_bytes=pdf_bytes,
			
 
				+        dpi=dpi,
			
 
				+        start_page_id=start_page_id,
			
 
				+        end_page_id=end_page_id,
			
 
				+        image_type=img_type,
			
 
				+        timeout=timeout,
			
 
				+        threads=threads
			
 
				+    )
			
 
				+    
			
 
				+    logger.info(
			
 
				+        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
			
 
				+        f"(DPI={dpi}, 多进程={threads})"
			
 
				+    )
			
 
				+    
			
 
				+    return images_list, pdf_doc
			
 
				+
			
 
				+
			
 
				+def _load_images_fitz(
			
 
				+    pdf_bytes: bytes,
			
 
				+    dpi: int,
			
 
				+    start_page_id: int,
			
 
				+    end_page_id: Optional[int],
			
 
				+    image_type: str
			
 
				+) -> Tuple[List[Dict[str, Any]], Any]:
			
 
				+    """使用 PyMuPDF (fitz) 渲染引擎"""
			
 
				+    try:
			
 
				+        import fitz
			
 
				+    except ImportError as e:
			
 
				+        raise ImportError(
			
 
				+            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
			
 
				+            f"原始错误: {e}"
			
 
				+        )
			
 
				+    
			
 
				+    from io import BytesIO
			
 
				+    import base64
			
 
				+    
			
 
				+    # 打开 PDF
			
 
				+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
			
 
				+    pdf_page_num = doc.page_count
			
 
				+    
			
 
				+    # 处理 end_page_id
			
 
				+    if end_page_id is None or end_page_id < 0:
			
 
				+        end_page_id = pdf_page_num - 1
			
 
				+    end_page_id = min(end_page_id, pdf_page_num - 1)
			
 
				+    
			
 
				+    # 渲染图像
			
 
				+    images_list = []
			
 
				+    mat = fitz.Matrix(dpi / 72, dpi / 72)
			
 
				+    
			
 
				+    for index in range(start_page_id, end_page_id + 1):
			
 
				+        page = doc[index]
			
 
				+        
			
 
				+        # 渲染为 pixmap
			
 
				+        pm = page.get_pixmap(matrix=mat, alpha=False)
			
 
				+        
			
 
				+        # 如果超过尺寸限制，降低到 72 DPI
			
 
				+        if pm.width > 4500 or pm.height > 4500:
			
 
				+            logger.warning(
			
 
				+                f"⚠️  页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
			
 
				+                f"降低到 72 DPI"
			
 
				+            )
			
 
				+            mat_fallback = fitz.Matrix(1, 1)  # 72 DPI
			
 
				+            pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
			
 
				+        
			
 
				+        # 转换为 PIL Image
			
 
				+        pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
			
 
				+        
			
 
				+        # 计算实际 scale
			
 
				+        page_rect = page.rect
			
 
				+        actual_scale = pm.width / page_rect.width
			
 
				+        
			
 
				+        # 构建返回字典
			
 
				+        image_dict = {
			
 
				+            'img_pil': pil_img,
			
 
				+            'scale': actual_scale
			
 
				+        }
			
 
				+        
			
 
				+        # 如果需要 BASE64
			
 
				+        if image_type.upper() == "BASE64":
			
 
				+            buffer = BytesIO()
			
 
				+            pil_img.save(buffer, format="JPEG")
			
 
				+            img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
			
 
				+            image_dict['img_base64'] = img_base64
			
 
				+            # 移除 img_pil 以节省内存
			
 
				+            del image_dict['img_pil']
			
 
				+        
			
 
				+        images_list.append(image_dict)
			
 
				+    
			
 
				+    logger.info(
			
 
				+        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
			
 
				+        f"(DPI={dpi}, 单进程)"
			
 
				+    )
			
 
				+    
			
 
				+    return images_list, doc