| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492 |
- """
- PDF处理工具模块
- 提供PDF相关处理功能:
- - PDF加载与分类
- - PDF文本提取
- - 跨页表格合并
- - 页面范围解析与过滤
- """
- from typing import Dict, List, Any, Optional, Tuple, Set
- from pathlib import Path
- from PIL import Image
- from loguru import logger
- import re
- # 导入页面范围解析函数(不依赖 MinerU)
- from .file_utils import parse_page_range
- # 导入 MinerU 组件
- try:
- from mineru.utils.pdf_classify import classify as pdf_classify
- from mineru.utils.pdf_image_tools import load_images_from_pdf
- from mineru.utils.enum_class import ImageType
- from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
- MINERU_AVAILABLE = True
- except ImportError:
- raise ImportError("MinerU components not available for PDF processing")
class PDFUtils:
    """Static helpers for PDF / image document processing.

    Provides page-range parsing, loading a document (PDF, image directory or
    single image) into a list of page images, direct text extraction from a
    PDF page region, and a placeholder for cross-page table merging.
    """

    # Lower-case file suffixes that are treated as images.
    _IMAGE_EXTENSIONS = frozenset({'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'})

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """Parse a page-range string (backward-compatible wrapper).

        This simply forwards to ``file_utils.parse_page_range``; new code
        should call that function directly.

        Formats documented for the underlying parser (page numbers are
        1-based in the string and 0-based in the result):
            - "1-5"        -> {0, 1, 2, 3, 4}
            - "3"          -> {2}
            - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
            - "1-"         -> from page 1 to the last page
            - "-5"         -> from page 1 to page 5

        Args:
            page_range: Page-range string (1-based page numbers), or None.
            total_pages: Total number of pages in the document.

        Returns:
            Set of 0-based page indices.
        """
        return parse_page_range(page_range, total_pages)

    @staticmethod
    def _load_rgb_image(image_path: Path) -> Any:
        """Open an image file and return a fully loaded RGB ``PIL.Image``.

        ``Image.open`` is lazy and keeps the file open until the pixel data
        is read. Opening inside a ``with`` block and returning the result of
        ``convert('RGB')`` (a new, detached image) releases the file handle
        immediately instead of holding one per loaded page.
        """
        with Image.open(image_path) as opened:
            return opened.convert('RGB')

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None
    ) -> Tuple[List[Dict], str, Optional[Any]]:
        """Load a document as page images and classify it, with page filtering.

        Args:
            document_path: Path to a PDF file, an image file, or a directory
                of images.
            dpi: Rendering DPI used for PDF pages.
            page_range: Page-range string such as "1-5,7,9-12".
                - PDF: 1-based page numbers.
                - Image directory: 1-based position after sorting the files.
                Not applied to a single image file.
                NOTE(review): when this is None the parser presumably selects
                every page - confirm against file_utils.parse_page_range.

        Returns:
            (images_list, pdf_type, pdf_doc)
            - images_list: one dict per selected page with the keys
              'img_pil', 'scale', 'source_path', 'page_idx' (original
              0-based index) and 'page_name'.
            - pdf_type: the classifier result for PDFs; always 'ocr' for
              images and image directories.
            - pdf_doc: the document object returned by the renderer for
              PDFs, otherwise None.

        Raises:
            RuntimeError: If MinerU is flagged as unavailable for a PDF input.
            ValueError: If the file suffix is not supported.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # OCR mode is the default
        all_images = []
        suffix = document_path.suffix.lower()

        if document_path.is_dir():
            # Directory input: every image file, in sorted order, is one page.
            # Non-file entries are skipped so a sub-directory that happens to
            # carry an image suffix is never handed to the image loader.
            image_files = sorted(
                f for f in document_path.iterdir()
                if f.is_file() and f.suffix.lower() in PDFUtils._IMAGE_EXTENSIONS
            )

            total_pages = len(image_files)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': PDFUtils._load_rgb_image(img_file),
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,  # index in the sorted file list
                    'page_name': img_file.stem  # file name without extension
                })

            pdf_type = 'ocr'  # image directories always use OCR mode

        elif suffix == '.pdf':
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            pdf_bytes = document_path.read_bytes()

            # Classify the PDF (the docstring of this module names the
            # possible results as 'ocr' or 'txt').
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # Render every page; filtering happens afterwards because the
            # page count is only known once the document has been opened.
            images_list, pdf_doc = load_images_from_pdf_unified(
                pdf_bytes,
                dpi=dpi,
                image_type=ImageType.PIL,
                renderer='fitz'
            )

            total_pages = len(images_list)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to the nominal scale if the renderer gave none.
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx,  # original 0-based page index
                    'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
                })

        elif suffix in PDFUtils._IMAGE_EXTENSIONS:
            # Single image file: exactly one page.
            all_images.append({
                'img_pil': PDFUtils._load_rgb_image(document_path),
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })
            pdf_type = 'ocr'

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Extract the text of a page region directly from the PDF.

        Uses MinerU's ``pdf_text_tool.get_page`` on ``pdf_doc[page_idx]`` and
        collects the span texts of every line whose bbox overlaps the target
        region. Any exception is logged at debug level and reported as a
        failed extraction rather than propagated (best-effort behaviour).

        Args:
            pdf_doc: Indexable PDF document object.
                NOTE(review): originally documented as a pypdfium2
                PdfDocument; confirm that documents opened by another
                renderer are accepted by the MinerU text tool.
            page_idx: 0-based page index.
            bbox: Target region in image coordinates (x1, y1, x2, y2).
            scale: Image-to-PDF scale factor; image coordinates are divided
                by it to obtain PDF coordinates.

        Returns:
            (text, success): the stripped text and whether it is non-empty.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            logger.debug("MinerU pdf_text_tool not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Convert image coordinates to PDF coordinates.
            pdf_bbox = [
                bbox[0] / scale,
                bbox[1] / scale,
                bbox[2] / scale,
                bbox[3] / scale
            ]

            page_dict = pdf_get_page_text(page)

            text_parts = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    line_bbox = line.get('bbox')
                    if line_bbox and hasattr(line_bbox, 'bbox'):
                        # BBox-like object exposing its coordinates as .bbox
                        line_bbox = line_bbox.bbox
                    elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
                        line_bbox = list(line_bbox)
                    else:
                        # Missing or malformed bbox: skip this line.
                        continue

                    if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
                        for span in line.get('spans', []):
                            span_text = span.get('text', '')
                            if span_text:
                                text_parts.append(span_text)

            text = ' '.join(text_parts).strip()
            return text, bool(text)

        except Exception as e:
            # Deliberate best-effort: callers get ("", False) on any failure.
            import traceback
            logger.debug(f"PDF text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Return True if two (x1, y1, x2, y2) boxes overlap or touch.

        Boxes with fewer than four coordinates never overlap. Boxes that only
        share an edge or a corner count as overlapping because the
        separation tests use strict inequalities.
        """
        if len(bbox1) < 4 or len(bbox2) < 4:
            return False

        x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
        x1_2, y1_2, x2_2, y2_2 = bbox2[:4]

        # Separated horizontally.
        if x2_1 < x1_2 or x2_2 < x1_1:
            return False
        # Separated vertically.
        if y2_1 < y1_2 or y2_2 < y1_1:
            return False

        return True

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """Merge tables that span several pages (not implemented yet).

        Currently a no-op that returns ``results`` unchanged.

        TODO: implement the merge logic; the original author suggests
        MinerU's cross_page_table_merge as a reference.

        Args:
            results: Processing result dictionary.

        Returns:
            The same ``results`` object, unmodified.
        """
        # TODO: implement cross-page table merging
        return results
- # ============================================================================
- # 统一的 PDF 图像加载函数 - 支持多种渲染引擎
- # ============================================================================
def load_images_from_pdf_unified(
    pdf_bytes: bytes,
    dpi: int = 200,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    image_type: str = "PIL",
    renderer: str = "pypdfium2",
    timeout: Optional[int] = None,
    threads: int = 4,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Render PDF pages to images with a selectable rendering engine.

    This is a thin dispatcher: it lower-cases ``renderer`` and forwards the
    call to the matching private loader in this module.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering DPI (default 200).
        start_page_id: First page to render, 0-based (default 0).
        end_page_id: Last page to render, 0-based and inclusive; None means
            "up to the last page".
        image_type: Kind of image to return, "PIL" or "BASE64".
        renderer: Engine name, case-insensitive.
            - "pypdfium2" / "pdfium": MinerU's pypdfium2-based loader
              (described by the original author as the recommended,
              MinerU-standard engine).
            - "fitz" / "pymupdf" / "mupdf": the PyMuPDF loader of this
              module, which re-renders a page at 72 DPI when either side
              exceeds 4500 px.
        timeout: Timeout in seconds; forwarded to the pypdfium2 loader only.
        threads: Worker count; forwarded to the pypdfium2 loader only.

    Returns:
        (images_list, pdf_doc)
        - images_list: one dict per rendered page, holding 'scale' plus
          either 'img_pil' or 'img_base64' depending on ``image_type``.
        - pdf_doc: the document object produced by the chosen engine.

    Raises:
        ValueError: If ``renderer`` is not one of the supported names.
        ImportError: Raised by the selected loader when its engine is not
            installed.

    Example:
        images, doc = load_images_from_pdf_unified(
            pdf_bytes, dpi=200, renderer="fitz"
        )
        for img_dict in images:
            pil_image = img_dict['img_pil']
            scale = img_dict['scale']

    Notes (from the original author, not verifiable from this module):
        - The two engines produce slightly different images
          (SSIM around 0.945), so stick to one engine per project and when
          comparing against existing reference images.
        - pypdfium2 is said to cap pages at 3500 px by adjusting the scale
          and to support multi-process rendering outside Windows.
    """
    engine = renderer.lower()

    if engine in ("pypdfium2", "pdfium"):
        return _load_images_pypdfium2(
            pdf_bytes=pdf_bytes,
            dpi=dpi,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            image_type=image_type,
            timeout=timeout,
            threads=threads,
        )

    if engine in ("fitz", "pymupdf", "mupdf"):
        # timeout / threads have no counterpart in the PyMuPDF loader.
        return _load_images_fitz(
            pdf_bytes=pdf_bytes,
            dpi=dpi,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            image_type=image_type,
        )

    message = (
        f"不支持的渲染引擎: {engine}. "
        f"请使用 'pypdfium2' 或 'fitz'"
    )
    raise ValueError(message)
- def _load_images_pypdfium2(
- pdf_bytes: bytes,
- dpi: int,
- start_page_id: int,
- end_page_id: Optional[int],
- image_type: str,
- timeout: Optional[int],
- threads: int
- ) -> Tuple[List[Dict[str, Any]], Any]:
- """使用 pypdfium2 渲染引擎(MinerU 标准)"""
- try:
- import pypdfium2 as pdfium
- from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
- from mineru.utils.enum_class import ImageType
- except ImportError as e:
- raise ImportError(
- f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
- f"原始错误: {e}"
- )
-
- # 转换 image_type
- img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
-
- # 使用 MinerU 的实现
- images_list, pdf_doc = mineru_load_images(
- pdf_bytes=pdf_bytes,
- dpi=dpi,
- start_page_id=start_page_id,
- end_page_id=end_page_id,
- image_type=img_type,
- timeout=timeout,
- threads=threads
- )
-
- logger.info(
- f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
- f"(DPI={dpi}, 多进程={threads})"
- )
-
- return images_list, pdf_doc
- def _load_images_fitz(
- pdf_bytes: bytes,
- dpi: int,
- start_page_id: int,
- end_page_id: Optional[int],
- image_type: str
- ) -> Tuple[List[Dict[str, Any]], Any]:
- """使用 PyMuPDF (fitz) 渲染引擎"""
- try:
- import fitz
- except ImportError as e:
- raise ImportError(
- f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
- f"原始错误: {e}"
- )
-
- from io import BytesIO
- import base64
-
- # 打开 PDF
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
- pdf_page_num = doc.page_count
-
- # 处理 end_page_id
- if end_page_id is None or end_page_id < 0:
- end_page_id = pdf_page_num - 1
- end_page_id = min(end_page_id, pdf_page_num - 1)
-
- # 渲染图像
- images_list = []
- mat = fitz.Matrix(dpi / 72, dpi / 72)
-
- for index in range(start_page_id, end_page_id + 1):
- page = doc[index]
-
- # 渲染为 pixmap
- pm = page.get_pixmap(matrix=mat, alpha=False)
-
- # 如果超过尺寸限制,降低到 72 DPI
- if pm.width > 4500 or pm.height > 4500:
- logger.warning(
- f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
- f"降低到 72 DPI"
- )
- mat_fallback = fitz.Matrix(1, 1) # 72 DPI
- pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
-
- # 转换为 PIL Image
- pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
-
- # 计算实际 scale
- page_rect = page.rect
- actual_scale = pm.width / page_rect.width
-
- # 构建返回字典
- image_dict = {
- 'img_pil': pil_img,
- 'scale': actual_scale
- }
-
- # 如果需要 BASE64
- if image_type.upper() == "BASE64":
- buffer = BytesIO()
- pil_img.save(buffer, format="JPEG")
- img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
- image_dict['img_base64'] = img_base64
- # 移除 img_pil 以节省内存
- del image_dict['img_pil']
-
- images_list.append(image_dict)
-
- logger.info(
- f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
- f"(DPI={dpi}, 单进程)"
- )
-
- return images_list, doc
|