|
|
@@ -124,10 +124,11 @@ class PDFUtils:
|
|
|
logger.info(f"📋 PDF classified as: {pdf_type}")
|
|
|
|
|
|
# 加载图像
|
|
|
- images_list, pdf_doc = load_images_from_pdf(
|
|
|
+ images_list, pdf_doc = load_images_from_pdf_unified(
|
|
|
pdf_bytes,
|
|
|
dpi=dpi,
|
|
|
- image_type=ImageType.PIL
|
|
|
+ image_type=ImageType.PIL,
|
|
|
+ renderer='fitz'
|
|
|
)
|
|
|
|
|
|
# 解析页面范围
|
|
|
@@ -266,3 +267,226 @@ class PDFUtils:
|
|
|
# TODO: 实现跨页表格合并逻辑
|
|
|
return results
|
|
|
|
|
|
+
|
|
|
+# ============================================================================
|
|
|
+# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
|
|
|
+# ============================================================================
|
|
|
+
|
|
|
def load_images_from_pdf_unified(
    pdf_bytes: bytes,
    dpi: int = 200,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    image_type: str = "PIL",
    renderer: str = "pypdfium2",
    timeout: Optional[int] = None,
    threads: int = 4,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Render PDF pages to images using a selectable rendering backend.

    This is a thin dispatcher: it validates the arguments, normalizes the
    renderer name and forwards the call to ``_load_images_pypdfium2`` or
    ``_load_images_fitz``.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering resolution in dots per inch (default 200). Must be
            positive.
        start_page_id: First page to render, 0-based (default 0). Must not
            be negative.
        end_page_id: Last page to render, 0-based and inclusive. ``None``
            means "through the last page".
        image_type: Requested image representation, "PIL" or "BASE64".
            The value is forwarded unchanged; each backend interprets it.
        renderer: Backend name, matched case-insensitively and ignoring
            surrounding whitespace.
            - "pypdfium2" / "pdfium": MinerU's pypdfium2-based loader
              (recommended by the original author: PDFium engine,
              multi-process rendering, and, per the original notes, a
              3500 px size limit handled by adjusting the scale).
            - "fitz" / "pymupdf" / "mupdf": PyMuPDF. Pages whose rendered
              width or height exceeds 4500 px are re-rendered at 72 DPI.
        timeout: Timeout in seconds; only forwarded to the pypdfium2 backend.
        threads: Worker count; only forwarded to the pypdfium2 backend
            (per the original notes, multi-processing is disabled on
            Windows).

    Returns:
        A tuple ``(images_list, pdf_doc)``.
        - images_list: one dict per rendered page, either
          ``{'img_pil': PIL.Image, 'scale': float}`` or
          ``{'img_base64': str, 'scale': float}`` depending on ``image_type``.
        - pdf_doc: the backend's open document object
          (``pypdfium2.PdfDocument`` or ``fitz.Document``). The caller owns
          it and is responsible for closing it.

    Raises:
        ImportError: If the selected backend is not installed.
        ValueError: If ``renderer`` is not a supported name, ``dpi`` is not
            positive, or ``start_page_id`` is negative.
        TimeoutError: If conversion times out (pypdfium2 backend only, per
            the original notes).

    Backend comparison (from the original author's notes):
        +--------------------+------------------+----------------+
        | Feature            | pypdfium2        | fitz           |
        +--------------------+------------------+----------------+
        | Engine             | Chrome PDFium    | MuPDF          |
        | Multi-process      | yes (non-Windows)| no             |
        | Timeout control    | yes              | no             |
        | Size limit         | 3500 px          | 4500 px        |
        | Over-limit action  | adjust scale     | drop to 72 DPI |
        | Detail retention   | better           | good           |
        | MinerU standard    | yes              | no             |
        +--------------------+------------------+----------------+

    Example:
        # pypdfium2 (recommended, MinerU standard)
        images, doc = load_images_from_pdf_unified(
            pdf_bytes,
            dpi=200,
            renderer="pypdfium2",
            threads=4
        )

        # PyMuPDF (fitz)
        images, doc = load_images_from_pdf_unified(
            pdf_bytes,
            dpi=200,
            renderer="fitz"
        )

        # Access the images
        for img_dict in images:
            pil_image = img_dict['img_pil']
            scale = img_dict['scale']

    Notes:
        1. The original author recommends pypdfium2 for production because
           it is MinerU's standard implementation.
        2. The two backends may produce slightly different images (the
           original notes quote SSIM of about 0.945).
        3. Stick to one backend within a project to avoid inconsistencies,
           especially when comparing against existing reference images.
    """
    # Reject non-string renderers explicitly instead of failing with an
    # AttributeError on `.lower()`.
    if not isinstance(renderer, str):
        raise ValueError(
            f"不支持的渲染引擎: {renderer!r}. "
            f"请使用 'pypdfium2' 或 'fitz'"
        )
    renderer = renderer.strip().lower()

    # The docstring has always promised ValueError for invalid parameters;
    # enforce it here so both backends see sane values.
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")
    if start_page_id < 0:
        raise ValueError(
            f"start_page_id must be >= 0, got {start_page_id!r}"
        )

    if renderer in ("pypdfium2", "pdfium"):
        return _load_images_pypdfium2(
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, timeout, threads
        )
    if renderer in ("fitz", "pymupdf", "mupdf"):
        return _load_images_fitz(
            pdf_bytes, dpi, start_page_id, end_page_id, image_type
        )
    raise ValueError(
        f"不支持的渲染引擎: {renderer}. "
        f"请使用 'pypdfium2' 或 'fitz'"
    )
|
|
|
+
|
|
|
+
|
|
|
def _load_images_pypdfium2(
    pdf_bytes: bytes,
    dpi: int,
    start_page_id: int,
    end_page_id: Optional[int],
    image_type: str,
    timeout: Optional[int],
    threads: int
) -> Tuple[List[Dict[str, Any]], Any]:
    """Render PDF pages through MinerU's pypdfium2-based loader.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering resolution, forwarded to MinerU.
        start_page_id: First page to render (0-based), forwarded to MinerU.
        end_page_id: Last page to render (inclusive) or ``None``, forwarded
            to MinerU.
        image_type: Either the string "PIL" (case-insensitive) or a MinerU
            ``ImageType.PIL`` / ``ImageType.BASE64`` value. Any other value
            selects ``ImageType.BASE64``.
        timeout: Timeout forwarded to MinerU.
        threads: Worker count forwarded to MinerU.

    Returns:
        The ``(images_list, pdf_doc)`` tuple returned by MinerU's
        ``load_images_from_pdf``.

    Raises:
        ImportError: If pypdfium2 or MinerU cannot be imported.
    """
    try:
        # Imported only to verify that the rendering engine is installed.
        import pypdfium2 as pdfium  # noqa: F401
        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
        from mineru.utils.enum_class import ImageType
    except ImportError as e:
        # Chain the original error so the real missing module stays visible.
        raise ImportError(
            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
            f"原始错误: {e}"
        ) from e

    # Callers may already hold a MinerU ImageType value; pass it through
    # untouched rather than letting the string comparison below mis-map it.
    if image_type in (ImageType.PIL, ImageType.BASE64):
        img_type = image_type
    elif str(image_type).upper() == "PIL":
        img_type = ImageType.PIL
    else:
        img_type = ImageType.BASE64

    # Delegate the actual rendering to MinerU's implementation.
    images_list, pdf_doc = mineru_load_images(
        pdf_bytes=pdf_bytes,
        dpi=dpi,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        image_type=img_type,
        timeout=timeout,
        threads=threads
    )

    logger.info(
        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
        f"(DPI={dpi}, 多进程={threads})"
    )

    return images_list, pdf_doc
|
|
|
+
|
|
|
+
|
|
|
def _load_images_fitz(
    pdf_bytes: bytes,
    dpi: int,
    start_page_id: int,
    end_page_id: Optional[int],
    image_type: str
) -> Tuple[List[Dict[str, Any]], Any]:
    """Render PDF pages with PyMuPDF (fitz) in the current process.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Target resolution; pages are rendered with a ``dpi / 72`` zoom.
        start_page_id: First page to render (0-based). Negative values are
            clamped to 0 so they cannot wrap around via negative indexing.
        end_page_id: Last page to render (inclusive). ``None``, a negative
            value, or a value past the end selects the last page.
        image_type: Any value whose upper-cased text contains "BASE64"
            yields base64-encoded JPEG strings; everything else yields PIL
            images.

    Returns:
        ``(images_list, doc)`` where ``images_list`` holds one dict per page,
        ``{'img_pil': Image, 'scale': float}`` or
        ``{'img_base64': str, 'scale': float}``, and ``doc`` is the still-open
        ``fitz`` document (the caller is responsible for closing it).
        ``scale`` is the rendered pixel width divided by the page width.

    Raises:
        ImportError: If PyMuPDF is not installed.
    """
    try:
        import fitz
    except ImportError as e:
        # Chain the original error so the real cause stays visible.
        raise ImportError(
            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
            f"原始错误: {e}"
        ) from e

    from io import BytesIO
    import base64

    # Pages rendering larger than this (either side) fall back to 72 DPI.
    max_side_px = 4500

    # Decide the output format once. Matching on the text also accepts
    # enum-like or suffixed spellings instead of silently returning PIL
    # images for them.
    want_base64 = "BASE64" in str(image_type).upper()

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        last_page = doc.page_count - 1

        # Normalize the requested range to the pages that actually exist.
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page
        end_page_id = min(end_page_id, last_page)
        first_page = max(start_page_id, 0)

        images_list = []
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        mat_fallback = fitz.Matrix(1, 1)  # identity zoom == 72 DPI

        for index in range(first_page, end_page_id + 1):
            page = doc[index]

            pm = page.get_pixmap(matrix=mat, alpha=False)

            if pm.width > max_side_px or pm.height > max_side_px:
                logger.warning(
                    f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
                    f"降低到 72 DPI"
                )
                pm = page.get_pixmap(matrix=mat_fallback, alpha=False)

            pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)

            # Derived from the final pixmap so it stays correct after the
            # 72 DPI fallback.
            actual_scale = pm.width / page.rect.width

            if want_base64:
                buffer = BytesIO()
                pil_img.save(buffer, format="JPEG")
                img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
                # Only the encoded form is kept, so the PIL image can be freed.
                image_dict = {'img_base64': img_base64, 'scale': actual_scale}
            else:
                image_dict = {'img_pil': pil_img, 'scale': actual_scale}

            images_list.append(image_dict)
    except Exception:
        # Do not leak the open document when rendering fails part-way.
        doc.close()
        raise

    logger.info(
        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
        f"(DPI={dpi}, 单进程)"
    )

    return images_list, doc
|