|
|
@@ -1,33 +1,76 @@
|
|
|
"""
|
|
|
-PDF处理工具模块
|
|
|
+PDF处理工具模块(重构版)
|
|
|
|
|
|
-提供PDF相关处理功能:
|
|
|
+提供PDF相关处理功能的统一入口:
|
|
|
- PDF加载与分类
|
|
|
-- PDF文本提取
|
|
|
+- PDF文本提取(支持 pypdfium2 和 fitz)
|
|
|
+- PDF图像渲染(支持多种引擎)
|
|
|
+- 坐标转换(PDF坐标 ↔ 图像坐标)
|
|
|
- 跨页表格合并
|
|
|
- 页面范围解析与过滤
|
|
|
+
|
|
|
+本模块已重构为多个子模块:
|
|
|
+- pdf_coordinate_transform: 坐标转换功能
|
|
|
+- pdf_text_extraction: 文本提取功能
|
|
|
+- pdf_image_rendering: 图像渲染功能
|
|
|
+- pdf_utils: 高级API和统一入口(本文件)
|
|
|
+
|
|
|
+为保持向后兼容性,所有原有函数都从新模块重新导出。
|
|
|
"""
|
|
|
from typing import Dict, List, Any, Optional, Tuple, Set
|
|
|
from pathlib import Path
|
|
|
from PIL import Image
|
|
|
from loguru import logger
|
|
|
-import re
|
|
|
|
|
|
# 导入页面范围解析函数(不依赖 MinerU)
|
|
|
from .file_utils import parse_page_range
|
|
|
|
|
|
+# 从子模块导入功能
|
|
|
+from .pdf_coordinate_transform import (
|
|
|
+ transform_bbox_for_rotation_fitz,
|
|
|
+ transform_bbox_for_rotation_pypdfium2,
|
|
|
+ pdf_rotation_to_image_rotation,
|
|
|
+)
|
|
|
+
|
|
|
+from .pdf_text_extraction import (
|
|
|
+ detect_pdf_doc_type,
|
|
|
+ bbox_overlap,
|
|
|
+ extract_text_from_pdf,
|
|
|
+ extract_text_from_pdf_pypdfium2,
|
|
|
+ extract_text_from_pdf_fitz,
|
|
|
+ extract_all_text_blocks,
|
|
|
+ extract_all_text_blocks_pypdfium2,
|
|
|
+ extract_all_text_blocks_fitz,
|
|
|
+)
|
|
|
+
|
|
|
+from .pdf_image_rendering import (
|
|
|
+ load_images_from_pdf_unified,
|
|
|
+ load_images_pypdfium2,
|
|
|
+ load_images_fitz,
|
|
|
+)
|
|
|
+
|
|
|
# 导入 MinerU 组件
|
|
|
try:
|
|
|
from mineru.utils.pdf_classify import classify as pdf_classify
|
|
|
- from mineru.utils.pdf_image_tools import load_images_from_pdf
|
|
|
from mineru.utils.enum_class import ImageType
|
|
|
- from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
|
|
|
MINERU_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
raise ImportError("MinerU components not available for PDF processing")
|
|
|
|
|
|
+
|
|
|
+
|
|
|
class PDFUtils:
|
|
|
- """PDF处理工具类"""
|
|
|
+ """
|
|
|
+ PDF处理工具类(重构版)
|
|
|
+
|
|
|
+ 本类提供PDF处理的高级API,内部调用已重构的子模块功能。
|
|
|
+ 保持原有接口不变,确保向后兼容性。
|
|
|
+
|
|
|
+ 子模块:
|
|
|
+ - pdf_coordinate_transform: 坐标转换
|
|
|
+ - pdf_text_extraction: 文本提取
|
|
|
+ - pdf_image_rendering: 图像渲染
|
|
|
+ """
|
|
|
|
|
|
@staticmethod
|
|
|
def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
|
|
|
@@ -56,7 +99,7 @@ class PDFUtils:
|
|
|
@staticmethod
|
|
|
def _detect_pdf_doc_type(pdf_doc: Any) -> str:
|
|
|
"""
|
|
|
- 检测 PDF 文档对象类型
|
|
|
+ 检测 PDF 文档对象类型(向后兼容包装)
|
|
|
|
|
|
Args:
|
|
|
pdf_doc: PDF 文档对象
|
|
|
@@ -64,28 +107,14 @@ class PDFUtils:
|
|
|
Returns:
|
|
|
'pypdfium2' 或 'fitz'
|
|
|
"""
|
|
|
- doc_type_name = type(pdf_doc).__name__
|
|
|
- doc_module = type(pdf_doc).__module__
|
|
|
-
|
|
|
- if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
|
|
|
- return 'pypdfium2'
|
|
|
- elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
|
|
|
- return 'fitz'
|
|
|
- else:
|
|
|
- # 尝试通过属性判断
|
|
|
- if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
|
|
|
- # fitz.Document 有 page_count 属性
|
|
|
- return 'fitz'
|
|
|
- else:
|
|
|
- # pypdfium2 通过索引访问
|
|
|
- return 'pypdfium2'
|
|
|
+ return detect_pdf_doc_type(pdf_doc)
|
|
|
|
|
|
@staticmethod
|
|
|
def load_and_classify_document(
|
|
|
document_path: Path,
|
|
|
dpi: int = 200,
|
|
|
page_range: Optional[str] = None,
|
|
|
- renderer: str = "fitz" # 新增参数,默认 fitz
|
|
|
+ renderer: str = "fitz"
|
|
|
) -> Tuple[List[Dict], str, Optional[Any], str]:
|
|
|
"""
|
|
|
加载文档并分类,支持页面范围过滤
|
|
|
@@ -99,10 +128,10 @@ class PDFUtils:
|
|
|
renderer: PDF渲染引擎,"fitz" 或 "pypdfium2"
|
|
|
|
|
|
Returns:
|
|
|
- (images_list, pdf_type, pdf_doc)
|
|
|
+ (images_list, pdf_type, pdf_doc, renderer_used)
|
|
|
- images_list: 图像列表,每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
|
|
|
- pdf_type: 'ocr' 或 'txt'
|
|
|
- - pdf_doc: PDF文档对象(如果PDF)
|
|
|
+ - pdf_doc: PDF文档对象(如果是PDF)
|
|
|
- renderer_used: 实际使用的渲染器类型
|
|
|
"""
|
|
|
pdf_doc = None
|
|
|
@@ -135,11 +164,11 @@ class PDFUtils:
|
|
|
'img_pil': img,
|
|
|
'scale': 1.0,
|
|
|
'source_path': str(img_file),
|
|
|
- 'page_idx': idx, # 原始索引
|
|
|
- 'page_name': img_file.stem # 文件名(不含扩展名)
|
|
|
+ 'page_idx': idx,
|
|
|
+ 'page_name': img_file.stem
|
|
|
})
|
|
|
|
|
|
- pdf_type = 'ocr' # 图片目录始终使用OCR模式
|
|
|
+ pdf_type = 'ocr'
|
|
|
|
|
|
elif document_path.suffix.lower() == '.pdf':
|
|
|
# 处理PDF文件
|
|
|
@@ -153,12 +182,12 @@ class PDFUtils:
|
|
|
pdf_type = pdf_classify(pdf_bytes)
|
|
|
logger.info(f"📋 PDF classified as: {pdf_type}")
|
|
|
|
|
|
- # 加载图像
|
|
|
+ # 加载图像(使用重构后的函数)
|
|
|
images_list, pdf_doc = load_images_from_pdf_unified(
|
|
|
pdf_bytes,
|
|
|
dpi=dpi,
|
|
|
image_type=ImageType.PIL,
|
|
|
- renderer=renderer # 使用指定的渲染引擎
|
|
|
+ renderer=renderer
|
|
|
)
|
|
|
|
|
|
# 解析页面范围
|
|
|
@@ -176,7 +205,7 @@ class PDFUtils:
|
|
|
'img_pil': img_dict['img_pil'],
|
|
|
'scale': img_dict.get('scale', dpi / 72),
|
|
|
'source_path': str(document_path),
|
|
|
- 'page_idx': idx, # 原始页码索引
|
|
|
+ 'page_idx': idx,
|
|
|
'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
|
|
|
})
|
|
|
|
|
|
@@ -199,6 +228,37 @@ class PDFUtils:
|
|
|
|
|
|
return all_images, pdf_type, pdf_doc, renderer
|
|
|
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _transform_bbox_for_rotation_fitz(
|
|
|
+ bbox: List[float],
|
|
|
+ rotation: int,
|
|
|
+ pdf_width: float,
|
|
|
+ pdf_height: float,
|
|
|
+ scale: float
|
|
|
+ ) -> List[float]:
|
|
|
+ """向后兼容包装:fitz引擎坐标转换"""
|
|
|
+ return transform_bbox_for_rotation_fitz(bbox, rotation, pdf_width, pdf_height, scale)
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _transform_bbox_for_rotation_pypdfium2(
|
|
|
+ bbox: List[float],
|
|
|
+ rotation: int,
|
|
|
+ pdf_width: float,
|
|
|
+ pdf_height: float,
|
|
|
+ scale: float
|
|
|
+ ) -> List[float]:
|
|
|
+ """向后兼容包装:pypdfium2引擎坐标转换"""
|
|
|
+ return transform_bbox_for_rotation_pypdfium2(bbox, rotation, pdf_width, pdf_height, scale)
|
|
|
+
|
|
|
+ # ========================================================================
|
|
|
+ # 文本提取函数(向后兼容包装)
|
|
|
+ # ========================================================================
|
|
|
+
|
|
|
@staticmethod
|
|
|
def extract_text_from_pdf(
|
|
|
pdf_doc: Any,
|
|
|
@@ -206,25 +266,8 @@ class PDFUtils:
|
|
|
bbox: List[float],
|
|
|
scale: float
|
|
|
) -> Tuple[str, bool]:
|
|
|
- """
|
|
|
- 从PDF直接提取文本(支持 pypdfium2 和 fitz)
|
|
|
-
|
|
|
- Args:
|
|
|
- pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
|
|
|
- page_idx: 页码索引
|
|
|
- bbox: 目标区域的bbox(图像坐标)
|
|
|
- scale: 图像与PDF的缩放比例
|
|
|
-
|
|
|
- Returns:
|
|
|
- (text, success)
|
|
|
- """
|
|
|
- # 检测 PDF 文档类型
|
|
|
- doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
|
|
|
-
|
|
|
- if doc_type == 'fitz':
|
|
|
- return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
|
|
|
- else: # pypdfium2
|
|
|
- return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
|
|
|
+ """向后兼容包装:从PDF指定区域提取文本"""
|
|
|
+ return extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)
|
|
|
|
|
|
@staticmethod
|
|
|
def _extract_text_from_pdf_pypdfium2(
|
|
|
@@ -233,51 +276,8 @@ class PDFUtils:
|
|
|
bbox: List[float],
|
|
|
scale: float
|
|
|
) -> Tuple[str, bool]:
|
|
|
- """使用 pypdfium2 提取文本(原有实现)"""
|
|
|
- if not MINERU_AVAILABLE or pdf_get_page_text is None:
|
|
|
- logger.error("MinerU pdf_text_tool not available")
|
|
|
- return "", False
|
|
|
-
|
|
|
- try:
|
|
|
- page = pdf_doc[page_idx]
|
|
|
-
|
|
|
- # 将图像坐标转换为PDF坐标
|
|
|
- pdf_bbox = [
|
|
|
- bbox[0] / scale,
|
|
|
- bbox[1] / scale,
|
|
|
- bbox[2] / scale,
|
|
|
- bbox[3] / scale
|
|
|
- ]
|
|
|
-
|
|
|
- # 使用 MinerU 的方式获取页面文本信息
|
|
|
- page_dict = pdf_get_page_text(page)
|
|
|
-
|
|
|
- # 从 blocks 中提取与 bbox 重叠的文本
|
|
|
- text_parts = []
|
|
|
- for block in page_dict.get('blocks', []):
|
|
|
- for line in block.get('lines', []):
|
|
|
- line_bbox = line.get('bbox')
|
|
|
- if line_bbox and hasattr(line_bbox, 'bbox'):
|
|
|
- line_bbox = line_bbox.bbox
|
|
|
- elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
|
|
|
- line_bbox = list(line_bbox)
|
|
|
- else:
|
|
|
- continue
|
|
|
-
|
|
|
- if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
|
|
|
- for span in line.get('spans', []):
|
|
|
- span_text = span.get('text', '')
|
|
|
- if span_text:
|
|
|
- text_parts.append(span_text)
|
|
|
-
|
|
|
- text = ' '.join(text_parts)
|
|
|
- return text.strip(), bool(text.strip())
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- import traceback
|
|
|
- logger.debug(f"pypdfium2 text extraction error: {e}")
|
|
|
- logger.debug(traceback.format_exc())
|
|
|
- return "", False
|
|
|
+ """向后兼容包装:使用pypdfium2提取文本"""
|
|
|
+ return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
|
|
|
|
|
|
@staticmethod
|
|
|
def _extract_text_from_pdf_fitz(
|
|
|
@@ -286,190 +286,96 @@ class PDFUtils:
|
|
|
bbox: List[float],
|
|
|
scale: float
|
|
|
) -> Tuple[str, bool]:
|
|
|
- """使用 fitz 提取文本"""
|
|
|
- try:
|
|
|
- import fitz
|
|
|
- except ImportError:
|
|
|
- logger.error("PyMuPDF (fitz) not available")
|
|
|
- return "", False
|
|
|
-
|
|
|
- try:
|
|
|
- page = pdf_doc[page_idx]
|
|
|
-
|
|
|
- # 将图像坐标转换为PDF坐标
|
|
|
- pdf_bbox = fitz.Rect(
|
|
|
- bbox[0] / scale,
|
|
|
- bbox[1] / scale,
|
|
|
- bbox[2] / scale,
|
|
|
- bbox[3] / scale
|
|
|
- )
|
|
|
-
|
|
|
- # 提取区域内的文本
|
|
|
- text = page.get_text("text", clip=pdf_bbox)
|
|
|
-
|
|
|
- return text.strip(), bool(text.strip())
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- import traceback
|
|
|
- logger.debug(f"fitz text extraction error: {e}")
|
|
|
- logger.debug(traceback.format_exc())
|
|
|
- return "", False
|
|
|
-
|
|
|
+ """向后兼容包装:使用fitz提取文本"""
|
|
|
+ return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
|
|
|
+
|
|
|
@staticmethod
|
|
|
def extract_all_text_blocks(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
scale: float
|
|
|
- ) -> List[Dict[str, Any]]:
|
|
|
- """
|
|
|
- 提取页面所有文本块(支持 pypdfium2 和 fitz)
|
|
|
-
|
|
|
- Args:
|
|
|
- pdf_doc: PDF文档对象
|
|
|
- page_idx: 页码
|
|
|
- scale: 缩放比例
|
|
|
-
|
|
|
- Returns:
|
|
|
- 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2]}, ...]
|
|
|
- """
|
|
|
- # 检测 PDF 文档类型
|
|
|
- doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
|
|
|
-
|
|
|
- if doc_type == 'fitz':
|
|
|
- return PDFUtils._extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
|
|
|
- else: # pypdfium2
|
|
|
- return PDFUtils._extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
|
|
|
-
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
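+ """Thin wrapper around extract_all_text_blocks. NOTE: the annotated return is now a (text_blocks, int) tuple, whereas the pre-refactor method returned only the list of text blocks -- existing callers may need to unpack the result; TODO confirm what the int carries."""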
|
|
|
+ return extract_all_text_blocks(pdf_doc, page_idx, scale)
|
|
|
+
|
|
|
@staticmethod
|
|
|
def _extract_all_text_blocks_pypdfium2(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
scale: float
|
|
|
- ) -> List[Dict[str, Any]]:
|
|
|
- """使用 pypdfium2 提取所有文本块(原有实现)"""
|
|
|
- if not MINERU_AVAILABLE or pdf_get_page_text is None:
|
|
|
- return []
|
|
|
-
|
|
|
- try:
|
|
|
- page = pdf_doc[page_idx]
|
|
|
- page_dict = pdf_get_page_text(page)
|
|
|
-
|
|
|
- extracted_blocks = []
|
|
|
-
|
|
|
- for block in page_dict.get('blocks', []):
|
|
|
- for line in block.get('lines', []):
|
|
|
- line_text = ""
|
|
|
- for span in line.get('spans', []):
|
|
|
- line_text += span.get('text', "")
|
|
|
-
|
|
|
- if not line_text.strip():
|
|
|
- continue
|
|
|
-
|
|
|
- line_bbox = line.get('bbox')
|
|
|
- if line_bbox and hasattr(line_bbox, 'bbox'):
|
|
|
- line_bbox = line_bbox.bbox
|
|
|
- elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
|
|
|
- line_bbox = list(line_bbox)
|
|
|
- else:
|
|
|
- continue
|
|
|
-
|
|
|
- img_bbox = [
|
|
|
- line_bbox[0] * scale,
|
|
|
- line_bbox[1] * scale,
|
|
|
- line_bbox[2] * scale,
|
|
|
- line_bbox[3] * scale
|
|
|
- ]
|
|
|
-
|
|
|
- extracted_blocks.append({
|
|
|
- 'text': line_text,
|
|
|
- 'bbox': img_bbox,
|
|
|
- 'origin_bbox': line_bbox
|
|
|
- })
|
|
|
-
|
|
|
- return extracted_blocks
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
|
|
|
- import traceback
|
|
|
- logger.debug(traceback.format_exc())
|
|
|
- return []
|
|
|
-
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
+ """向后兼容包装:使用pypdfium2提取所有文本块"""
|
|
|
+ return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
|
|
|
+
|
|
|
@staticmethod
|
|
|
def _extract_all_text_blocks_fitz(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
scale: float
|
|
|
- ) -> List[Dict[str, Any]]:
|
|
|
- """使用 fitz 提取所有文本块"""
|
|
|
- try:
|
|
|
- import fitz
|
|
|
- except ImportError:
|
|
|
- logger.warning("PyMuPDF (fitz) not available")
|
|
|
- return []
|
|
|
-
|
|
|
- try:
|
|
|
- page = pdf_doc[page_idx]
|
|
|
-
|
|
|
- # 使用 get_text("dict") 获取详细的文本信息
|
|
|
- text_dict = page.get_text("dict")
|
|
|
-
|
|
|
- extracted_blocks = []
|
|
|
-
|
|
|
- # 遍历所有 blocks
|
|
|
- for block in text_dict.get("blocks", []):
|
|
|
- # 只处理文本块(type=0)
|
|
|
- if block.get("type") != 0:
|
|
|
- continue
|
|
|
-
|
|
|
- # 遍历所有 lines
|
|
|
- for line in block.get("lines", []):
|
|
|
- line_text = ""
|
|
|
- line_bbox = line.get("bbox")
|
|
|
-
|
|
|
- # 提取 line 中的所有 span 文本
|
|
|
- for span in line.get("spans", []):
|
|
|
- line_text += span.get("text", "")
|
|
|
-
|
|
|
- if not line_text.strip() or not line_bbox:
|
|
|
- continue
|
|
|
-
|
|
|
- # PDF 坐标转换为图像坐标
|
|
|
- img_bbox = [
|
|
|
- line_bbox[0] * scale,
|
|
|
- line_bbox[1] * scale,
|
|
|
- line_bbox[2] * scale,
|
|
|
- line_bbox[3] * scale
|
|
|
- ]
|
|
|
-
|
|
|
- extracted_blocks.append({
|
|
|
- 'text': line_text,
|
|
|
- 'bbox': img_bbox,
|
|
|
- 'origin_bbox': list(line_bbox)
|
|
|
- })
|
|
|
-
|
|
|
- return extracted_blocks
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"fitz extract_all_text_blocks failed: {e}")
|
|
|
- import traceback
|
|
|
- logger.debug(traceback.format_exc())
|
|
|
- return []
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
+ """向后兼容包装:使用fitz提取所有文本块"""
|
|
|
+ return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
|
|
|
|
|
|
@staticmethod
|
|
|
def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
|
|
|
- """检查两个 bbox 是否重叠"""
|
|
|
- if len(bbox1) < 4 or len(bbox2) < 4:
|
|
|
- return False
|
|
|
-
|
|
|
- x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
|
|
|
- x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
|
|
|
-
|
|
|
- if x2_1 < x1_2 or x2_2 < x1_1:
|
|
|
- return False
|
|
|
- if y2_1 < y1_2 or y2_2 < y1_1:
|
|
|
- return False
|
|
|
-
|
|
|
- return True
|
|
|
+ """向后兼容包装:检查两个bbox是否重叠"""
|
|
|
+ return bbox_overlap(bbox1, bbox2)
|
|
|
+
|
|
|
+ # ========================================================================
|
|
|
+ # 图像渲染函数(向后兼容包装)
|
|
|
+ # ========================================================================
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def load_images_from_pdf_unified(
|
|
|
+ pdf_bytes: bytes,
|
|
|
+ dpi: int = 200,
|
|
|
+ start_page_id: int = 0,
|
|
|
+ end_page_id: Optional[int] = None,
|
|
|
+ image_type: str = "PIL",
|
|
|
+ renderer: str = "pypdfium2",
|
|
|
+ timeout: Optional[int] = None,
|
|
|
+ threads: int = 4,
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
+ """向后兼容包装:统一的PDF图像加载接口"""
|
|
|
+ return load_images_from_pdf_unified(
|
|
|
+ pdf_bytes, dpi, start_page_id, end_page_id,
|
|
|
+ image_type, renderer, timeout, threads
|
|
|
+ )
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _load_images_pypdfium2(
|
|
|
+ pdf_bytes: bytes,
|
|
|
+ dpi: int,
|
|
|
+ start_page_id: int,
|
|
|
+ end_page_id: Optional[int],
|
|
|
+ image_type: str,
|
|
|
+ timeout: Optional[int],
|
|
|
+ threads: int
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
+ """向后兼容包装:使用pypdfium2渲染"""
|
|
|
+ return load_images_pypdfium2(
|
|
|
+ pdf_bytes, dpi, start_page_id, end_page_id,
|
|
|
+ image_type, timeout, threads
|
|
|
+ )
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _load_images_fitz(
|
|
|
+ pdf_bytes: bytes,
|
|
|
+ dpi: int,
|
|
|
+ start_page_id: int,
|
|
|
+ end_page_id: Optional[int],
|
|
|
+ image_type: str
|
|
|
+ ) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
+ """向后兼容包装:使用fitz渲染"""
|
|
|
+ return load_images_fitz(
|
|
|
+ pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
|
+ )
|
|
|
+
|
|
|
+ # ========================================================================
|
|
|
+ # 其他功能
|
|
|
+ # ========================================================================
|
|
|
|
|
|
@staticmethod
|
|
|
def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
@@ -488,226 +394,3 @@ class PDFUtils:
|
|
|
# TODO: 实现跨页表格合并逻辑
|
|
|
return results
|
|
|
|
|
|
-
|
|
|
-# ============================================================================
|
|
|
-# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
|
|
|
-# ============================================================================
|
|
|
-
|
|
|
-def load_images_from_pdf_unified(
|
|
|
- pdf_bytes: bytes,
|
|
|
- dpi: int = 200,
|
|
|
- start_page_id: int = 0,
|
|
|
- end_page_id: Optional[int] = None,
|
|
|
- image_type: str = "PIL",
|
|
|
- renderer: str = "pypdfium2",
|
|
|
- timeout: Optional[int] = None,
|
|
|
- threads: int = 4,
|
|
|
-) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
- """
|
|
|
- 从 PDF 加载图像,支持两种渲染引擎
|
|
|
-
|
|
|
- Args:
|
|
|
- pdf_bytes: PDF 文件的字节数据
|
|
|
- dpi: 渲染 DPI,默认 200
|
|
|
- start_page_id: 起始页码(0-based),默认 0
|
|
|
- end_page_id: 结束页码(0-based,包含),默认 None(处理到最后)
|
|
|
- image_type: 返回图像类型,"PIL" 或 "BASE64"
|
|
|
- renderer: 渲染引擎选择
|
|
|
- - "pypdfium2": 使用 MinerU 标准的 pypdfium2(推荐)
|
|
|
- * 优势: Chrome PDFium 引擎,多进程加速,更好的细节保留
|
|
|
- * 尺寸限制: 3500px,超过则动态调整 scale
|
|
|
- - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
|
|
|
- * 优势: MuPDF 引擎,简单直接,无需额外依赖
|
|
|
- * 尺寸限制: 4500px,超过则降到 72 DPI
|
|
|
- timeout: 超时时间(秒),仅 pypdfium2 支持
|
|
|
- threads: 进程数,仅 pypdfium2 支持多进程加速(Windows 下自动禁用)
|
|
|
-
|
|
|
- Returns:
|
|
|
- (images_list, pdf_doc)
|
|
|
- - images_list: 图像列表,每个元素为 {'img_pil': PIL.Image, 'scale': float}
|
|
|
- 或 {'img_base64': str, 'scale': float}(取决于 image_type)
|
|
|
- - pdf_doc: PDF 文档对象(pypdfium2.PdfDocument 或 fitz.Document)
|
|
|
-
|
|
|
- Raises:
|
|
|
- ImportError: 如果选择的渲染引擎不可用
|
|
|
- ValueError: 如果参数无效
|
|
|
- TimeoutError: 如果转换超时(仅 pypdfium2)
|
|
|
-
|
|
|
- 渲染引擎对比:
|
|
|
- ┌─────────────┬──────────────┬──────────────┐
|
|
|
- │ 特性 │ pypdfium2 │ fitz │
|
|
|
- ├─────────────┼──────────────┼──────────────┤
|
|
|
- │ 渲染引擎 │ Chrome PDFium│ MuPDF │
|
|
|
- │ 多进程加速 │ ✅ (非Windows)│ ❌ │
|
|
|
- │ 超时控制 │ ✅ │ ❌ │
|
|
|
- │ 尺寸限制 │ 3500px │ 4500px │
|
|
|
- │ 超限处理 │ 动态调整scale│ 降到72 DPI │
|
|
|
- │ 细节保留 │ 更好 │ 良好 │
|
|
|
- │ MinerU标准 │ ✅ │ ❌ │
|
|
|
- └─────────────┴──────────────┴──────────────┘
|
|
|
-
|
|
|
- 示例:
|
|
|
- # 使用 pypdfium2(推荐,MinerU 标准)
|
|
|
- images, doc = load_images_from_pdf_unified(
|
|
|
- pdf_bytes,
|
|
|
- dpi=200,
|
|
|
- renderer="pypdfium2",
|
|
|
- threads=4
|
|
|
- )
|
|
|
-
|
|
|
- # 使用 PyMuPDF (fitz)
|
|
|
- images, doc = load_images_from_pdf_unified(
|
|
|
- pdf_bytes,
|
|
|
- dpi=200,
|
|
|
- renderer="fitz"
|
|
|
- )
|
|
|
-
|
|
|
- # 访问图像
|
|
|
- for img_dict in images:
|
|
|
- pil_image = img_dict['img_pil']
|
|
|
- scale = img_dict['scale']
|
|
|
- # 处理图像...
|
|
|
-
|
|
|
- 注意事项:
|
|
|
- 1. pypdfium2 在生产环境中更推荐,因为它是 MinerU 的标准实现
|
|
|
- 2. 两种渲染引擎可能产生略有不同的图像(SSIM ≈ 0.945)
|
|
|
- 3. 建议在同一项目中保持使用同一渲染引擎,避免不一致
|
|
|
- 4. 如果需要与现有测试图像对比,使用相同的渲染引擎
|
|
|
- """
|
|
|
- renderer = renderer.lower()
|
|
|
-
|
|
|
- if renderer in ["pypdfium2", "pdfium"]:
|
|
|
- return _load_images_pypdfium2(
|
|
|
- pdf_bytes, dpi, start_page_id, end_page_id,
|
|
|
- image_type, timeout, threads
|
|
|
- )
|
|
|
- elif renderer in ["fitz", "pymupdf", "mupdf"]:
|
|
|
- return _load_images_fitz(
|
|
|
- pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
|
- )
|
|
|
- else:
|
|
|
- raise ValueError(
|
|
|
- f"不支持的渲染引擎: {renderer}. "
|
|
|
- f"请使用 'pypdfium2' 或 'fitz'"
|
|
|
- )
|
|
|
-
|
|
|
-
|
|
|
-def _load_images_pypdfium2(
|
|
|
- pdf_bytes: bytes,
|
|
|
- dpi: int,
|
|
|
- start_page_id: int,
|
|
|
- end_page_id: Optional[int],
|
|
|
- image_type: str,
|
|
|
- timeout: Optional[int],
|
|
|
- threads: int
|
|
|
-) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
- """使用 pypdfium2 渲染引擎(MinerU 标准)"""
|
|
|
- try:
|
|
|
- import pypdfium2 as pdfium
|
|
|
- from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
|
|
|
- from mineru.utils.enum_class import ImageType
|
|
|
- except ImportError as e:
|
|
|
- raise ImportError(
|
|
|
- f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
|
|
|
- f"原始错误: {e}"
|
|
|
- )
|
|
|
-
|
|
|
- # 转换 image_type
|
|
|
- img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
|
|
|
-
|
|
|
- # 使用 MinerU 的实现
|
|
|
- images_list, pdf_doc = mineru_load_images(
|
|
|
- pdf_bytes=pdf_bytes,
|
|
|
- dpi=dpi,
|
|
|
- start_page_id=start_page_id,
|
|
|
- end_page_id=end_page_id,
|
|
|
- image_type=img_type,
|
|
|
- timeout=timeout,
|
|
|
- threads=threads
|
|
|
- )
|
|
|
-
|
|
|
- logger.info(
|
|
|
- f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
|
|
|
- f"(DPI={dpi}, 多进程={threads})"
|
|
|
- )
|
|
|
-
|
|
|
- return images_list, pdf_doc
|
|
|
-
|
|
|
-
|
|
|
-def _load_images_fitz(
|
|
|
- pdf_bytes: bytes,
|
|
|
- dpi: int,
|
|
|
- start_page_id: int,
|
|
|
- end_page_id: Optional[int],
|
|
|
- image_type: str
|
|
|
-) -> Tuple[List[Dict[str, Any]], Any]:
|
|
|
- """使用 PyMuPDF (fitz) 渲染引擎"""
|
|
|
- try:
|
|
|
- import fitz
|
|
|
- except ImportError as e:
|
|
|
- raise ImportError(
|
|
|
- f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
|
|
|
- f"原始错误: {e}"
|
|
|
- )
|
|
|
-
|
|
|
- from io import BytesIO
|
|
|
- import base64
|
|
|
-
|
|
|
- # 打开 PDF
|
|
|
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
|
|
- pdf_page_num = doc.page_count
|
|
|
-
|
|
|
- # 处理 end_page_id
|
|
|
- if end_page_id is None or end_page_id < 0:
|
|
|
- end_page_id = pdf_page_num - 1
|
|
|
- end_page_id = min(end_page_id, pdf_page_num - 1)
|
|
|
-
|
|
|
- # 渲染图像
|
|
|
- images_list = []
|
|
|
- mat = fitz.Matrix(dpi / 72, dpi / 72)
|
|
|
-
|
|
|
- for index in range(start_page_id, end_page_id + 1):
|
|
|
- page = doc[index]
|
|
|
-
|
|
|
- # 渲染为 pixmap
|
|
|
- pm = page.get_pixmap(matrix=mat, alpha=False)
|
|
|
-
|
|
|
- # 如果超过尺寸限制,降低到 72 DPI
|
|
|
- if pm.width > 4500 or pm.height > 4500:
|
|
|
- logger.warning(
|
|
|
- f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
|
|
|
- f"降低到 72 DPI"
|
|
|
- )
|
|
|
- mat_fallback = fitz.Matrix(1, 1) # 72 DPI
|
|
|
- pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
|
|
|
-
|
|
|
- # 转换为 PIL Image
|
|
|
- pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
|
|
-
|
|
|
- # 计算实际 scale
|
|
|
- page_rect = page.rect
|
|
|
- actual_scale = pm.width / page_rect.width
|
|
|
-
|
|
|
- # 构建返回字典
|
|
|
- image_dict = {
|
|
|
- 'img_pil': pil_img,
|
|
|
- 'scale': actual_scale
|
|
|
- }
|
|
|
-
|
|
|
- # 如果需要 BASE64
|
|
|
- if image_type.upper() == "BASE64":
|
|
|
- buffer = BytesIO()
|
|
|
- pil_img.save(buffer, format="JPEG")
|
|
|
- img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
|
|
- image_dict['img_base64'] = img_base64
|
|
|
- # 移除 img_pil 以节省内存
|
|
|
- del image_dict['img_pil']
|
|
|
-
|
|
|
- images_list.append(image_dict)
|
|
|
-
|
|
|
- logger.info(
|
|
|
- f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
|
|
|
- f"(DPI={dpi}, 单进程)"
|
|
|
- )
|
|
|
-
|
|
|
- return images_list, doc
|