""" PDF处理工具模块 提供PDF相关处理功能: - PDF加载与分类 - PDF文本提取 - 跨页表格合并 - 页面范围解析与过滤 """ from typing import Dict, List, Any, Optional, Tuple, Set from pathlib import Path from PIL import Image from loguru import logger import re # 导入页面范围解析函数(不依赖 MinerU) from .file_utils import parse_page_range # 导入 MinerU 组件 try: from mineru.utils.pdf_classify import classify as pdf_classify from mineru.utils.pdf_image_tools import load_images_from_pdf from mineru.utils.enum_class import ImageType from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text MINERU_AVAILABLE = True except ImportError: raise ImportError("MinerU components not available for PDF processing") class PDFUtils: """PDF处理工具类""" @staticmethod def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]: """ 解析页面范围字符串(向后兼容包装函数) 此方法是对 file_utils.parse_page_range 的包装,保持向后兼容性。 新代码应直接使用 file_utils.parse_page_range。 支持格式: - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引) - "3" → {2} - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11} - "1-" → 从第1页到最后 - "-5" → 从第1页到第5页 Args: page_range: 页面范围字符串(页码从1开始) total_pages: 总页数 Returns: 页面索引集合(0-based) """ return parse_page_range(page_range, total_pages) @staticmethod def _detect_pdf_doc_type(pdf_doc: Any) -> str: """ 检测 PDF 文档对象类型 Args: pdf_doc: PDF 文档对象 Returns: 'pypdfium2' 或 'fitz' """ doc_type_name = type(pdf_doc).__name__ doc_module = type(pdf_doc).__module__ if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name: return 'pypdfium2' elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name: return 'fitz' else: # 尝试通过属性判断 if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'): # fitz.Document 有 page_count 属性 return 'fitz' else: # pypdfium2 通过索引访问 return 'pypdfium2' @staticmethod def load_and_classify_document( document_path: Path, dpi: int = 200, page_range: Optional[str] = None, renderer: str = "fitz" # 新增参数,默认 fitz ) -> Tuple[List[Dict], str, Optional[Any], str]: """ 加载文档并分类,支持页面范围过滤 Args: document_path: 文档路径 dpi: PDF渲染DPI page_range: 页面范围字符串,如 "1-5,7,9-12" - PDF:按页码(从1开始) - 图片目录:按文件名排序后的位置(从1开始) renderer: PDF渲染引擎,"fitz" 或 "pypdfium2" Returns: (images_list, pdf_type, pdf_doc) - images_list: 图像列表,每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int} - pdf_type: 'ocr' 或 'txt' - pdf_doc: PDF文档对象(如果PDF) - renderer_used: 实际使用的渲染器类型 """ pdf_doc = None pdf_type = 'ocr' # 默认使用OCR模式 all_images = [] if document_path.is_dir(): # 处理目录:遍历所有图片 image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'} image_files = sorted([ f for f in document_path.iterdir() if f.suffix.lower() in image_extensions ]) # 解析页面范围 total_pages = len(image_files) selected_pages = parse_page_range(page_range, total_pages) if page_range: logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张") for idx, img_file in enumerate(image_files): if idx not in selected_pages: continue img = Image.open(img_file) if img.mode != 'RGB': img = img.convert('RGB') all_images.append({ 'img_pil': img, 'scale': 1.0, 'source_path': str(img_file), 'page_idx': idx, # 原始索引 'page_name': img_file.stem # 文件名(不含扩展名) }) pdf_type = 'ocr' # 图片目录始终使用OCR模式 elif document_path.suffix.lower() == '.pdf': # 处理PDF文件 if not MINERU_AVAILABLE: raise RuntimeError("MinerU components not available for PDF processing") with open(document_path, 'rb') as f: pdf_bytes = f.read() # PDF分类 pdf_type = pdf_classify(pdf_bytes) logger.info(f"📋 PDF classified as: {pdf_type}") # 加载图像 images_list, pdf_doc = load_images_from_pdf_unified( pdf_bytes, dpi=dpi, image_type=ImageType.PIL, renderer=renderer # 使用指定的渲染引擎 ) # 解析页面范围 total_pages = len(images_list) selected_pages = parse_page_range(page_range, total_pages) if page_range: logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页") for idx, img_dict in enumerate(images_list): if idx not in selected_pages: continue all_images.append({ 'img_pil': img_dict['img_pil'], 'scale': img_dict.get('scale', dpi / 72), 'source_path': str(document_path), 'page_idx': idx, # 原始页码索引 'page_name': f"{document_path.stem}_page_{idx + 1:03d}" }) elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']: # 处理单个图片 img = Image.open(document_path) if img.mode != 'RGB': img = img.convert('RGB') all_images.append({ 'img_pil': img, 'scale': 1.0, 'source_path': str(document_path), 'page_idx': 0, 'page_name': document_path.stem }) pdf_type = 'ocr' else: raise ValueError(f"Unsupported file format: {document_path.suffix}") return all_images, pdf_type, pdf_doc, renderer @staticmethod def extract_text_from_pdf( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """ 从PDF直接提取文本(支持 pypdfium2 和 fitz) Args: pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document) page_idx: 页码索引 bbox: 目标区域的bbox(图像坐标) scale: 图像与PDF的缩放比例 Returns: (text, success) """ # 检测 PDF 文档类型 doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc) if doc_type == 'fitz': return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale) else: # pypdfium2 return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale) @staticmethod def _extract_text_from_pdf_pypdfium2( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """使用 pypdfium2 提取文本(原有实现)""" if not MINERU_AVAILABLE or pdf_get_page_text is None: logger.error("MinerU pdf_text_tool not available") return "", False try: page = pdf_doc[page_idx] # 将图像坐标转换为PDF坐标 pdf_bbox = [ bbox[0] / scale, bbox[1] / scale, bbox[2] / scale, bbox[3] / scale ] # 使用 MinerU 的方式获取页面文本信息 page_dict = pdf_get_page_text(page) # 从 blocks 中提取与 bbox 重叠的文本 text_parts = [] for block in page_dict.get('blocks', []): for line in block.get('lines', []): line_bbox = line.get('bbox') if line_bbox and hasattr(line_bbox, 'bbox'): line_bbox = line_bbox.bbox elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4: line_bbox = list(line_bbox) else: continue if PDFUtils._bbox_overlap(pdf_bbox, line_bbox): for span in line.get('spans', []): span_text = span.get('text', '') if span_text: text_parts.append(span_text) text = ' '.join(text_parts) return text.strip(), bool(text.strip()) except Exception as e: import traceback logger.debug(f"pypdfium2 text extraction error: {e}") logger.debug(traceback.format_exc()) return "", False @staticmethod def _extract_text_from_pdf_fitz( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """使用 fitz 提取文本""" try: import fitz except ImportError: logger.error("PyMuPDF (fitz) not available") return "", False try: page = pdf_doc[page_idx] # 将图像坐标转换为PDF坐标 pdf_bbox = fitz.Rect( bbox[0] / scale, bbox[1] / scale, bbox[2] / scale, bbox[3] / scale ) # 提取区域内的文本 text = page.get_text("text", clip=pdf_bbox) return text.strip(), bool(text.strip()) except Exception as e: import traceback logger.debug(f"fitz text extraction error: {e}") logger.debug(traceback.format_exc()) return "", False @staticmethod def extract_all_text_blocks( pdf_doc: Any, page_idx: int, scale: float ) -> List[Dict[str, Any]]: """ 提取页面所有文本块(支持 pypdfium2 和 fitz) Args: pdf_doc: PDF文档对象 page_idx: 页码 scale: 缩放比例 Returns: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2]}, ...] """ # 检测 PDF 文档类型 doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc) if doc_type == 'fitz': return PDFUtils._extract_all_text_blocks_fitz(pdf_doc, page_idx, scale) else: # pypdfium2 return PDFUtils._extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale) @staticmethod def _extract_all_text_blocks_pypdfium2( pdf_doc: Any, page_idx: int, scale: float ) -> List[Dict[str, Any]]: """使用 pypdfium2 提取所有文本块(原有实现)""" if not MINERU_AVAILABLE or pdf_get_page_text is None: return [] try: page = pdf_doc[page_idx] page_dict = pdf_get_page_text(page) extracted_blocks = [] for block in page_dict.get('blocks', []): for line in block.get('lines', []): line_text = "" for span in line.get('spans', []): line_text += span.get('text', "") if not line_text.strip(): continue line_bbox = line.get('bbox') if line_bbox and hasattr(line_bbox, 'bbox'): line_bbox = line_bbox.bbox elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4: line_bbox = list(line_bbox) else: continue img_bbox = [ line_bbox[0] * scale, line_bbox[1] * scale, line_bbox[2] * scale, line_bbox[3] * scale ] extracted_blocks.append({ 'text': line_text, 'bbox': img_bbox, 'origin_bbox': line_bbox }) return extracted_blocks except Exception as e: logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}") import traceback logger.debug(traceback.format_exc()) return [] @staticmethod def _extract_all_text_blocks_fitz( pdf_doc: Any, page_idx: int, scale: float ) -> List[Dict[str, Any]]: """使用 fitz 提取所有文本块""" try: import fitz except ImportError: logger.warning("PyMuPDF (fitz) not available") return [] try: page = pdf_doc[page_idx] # 使用 get_text("dict") 获取详细的文本信息 text_dict = page.get_text("dict") extracted_blocks = [] # 遍历所有 blocks for block in text_dict.get("blocks", []): # 只处理文本块(type=0) if block.get("type") != 0: continue # 遍历所有 lines for line in block.get("lines", []): line_text = "" line_bbox = line.get("bbox") # 提取 line 中的所有 span 文本 for span in line.get("spans", []): line_text += span.get("text", "") if not line_text.strip() or not line_bbox: continue # PDF 坐标转换为图像坐标 img_bbox = [ line_bbox[0] * scale, line_bbox[1] * scale, line_bbox[2] * scale, line_bbox[3] * scale ] extracted_blocks.append({ 'text': line_text, 'bbox': img_bbox, 'origin_bbox': list(line_bbox) }) return extracted_blocks except Exception as e: logger.warning(f"fitz extract_all_text_blocks failed: {e}") import traceback logger.debug(traceback.format_exc()) return [] @staticmethod def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool: """检查两个 bbox 是否重叠""" if len(bbox1) < 4 or len(bbox2) < 4: return False x1_1, y1_1, x2_1, y2_1 = bbox1[:4] x1_2, y1_2, x2_2, y2_2 = bbox2[:4] if x2_1 < x1_2 or x2_2 < x1_1: return False if y2_1 < y1_2 or y2_2 < y1_1: return False return True @staticmethod def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]: """ 合并跨页表格 TODO: 实现跨页表格合并逻辑 可以参考 MinerU 的 cross_page_table_merge 实现 Args: results: 处理结果字典 Returns: 合并后的结果 """ # TODO: 实现跨页表格合并逻辑 return results # ============================================================================ # 统一的 PDF 图像加载函数 - 支持多种渲染引擎 # ============================================================================ def load_images_from_pdf_unified( pdf_bytes: bytes, dpi: int = 200, start_page_id: int = 0, end_page_id: Optional[int] = None, image_type: str = "PIL", renderer: str = "pypdfium2", timeout: Optional[int] = None, threads: int = 4, ) -> Tuple[List[Dict[str, Any]], Any]: """ 从 PDF 加载图像,支持两种渲染引擎 Args: pdf_bytes: PDF 文件的字节数据 dpi: 渲染 DPI,默认 200 start_page_id: 起始页码(0-based),默认 0 end_page_id: 结束页码(0-based,包含),默认 None(处理到最后) image_type: 返回图像类型,"PIL" 或 "BASE64" renderer: 渲染引擎选择 - "pypdfium2": 使用 MinerU 标准的 pypdfium2(推荐) * 优势: Chrome PDFium 引擎,多进程加速,更好的细节保留 * 尺寸限制: 3500px,超过则动态调整 scale - "fitz" / "pymupdf": 使用 PyMuPDF (fitz) * 优势: MuPDF 引擎,简单直接,无需额外依赖 * 尺寸限制: 4500px,超过则降到 72 DPI timeout: 超时时间(秒),仅 pypdfium2 支持 threads: 进程数,仅 pypdfium2 支持多进程加速(Windows 下自动禁用) Returns: (images_list, pdf_doc) - images_list: 图像列表,每个元素为 {'img_pil': PIL.Image, 'scale': float} 或 {'img_base64': str, 'scale': float}(取决于 image_type) - pdf_doc: PDF 文档对象(pypdfium2.PdfDocument 或 fitz.Document) Raises: ImportError: 如果选择的渲染引擎不可用 ValueError: 如果参数无效 TimeoutError: 如果转换超时(仅 pypdfium2) 渲染引擎对比: ┌─────────────┬──────────────┬──────────────┐ │ 特性 │ pypdfium2 │ fitz │ ├─────────────┼──────────────┼──────────────┤ │ 渲染引擎 │ Chrome PDFium│ MuPDF │ │ 多进程加速 │ ✅ (非Windows)│ ❌ │ │ 超时控制 │ ✅ │ ❌ │ │ 尺寸限制 │ 3500px │ 4500px │ │ 超限处理 │ 动态调整scale│ 降到72 DPI │ │ 细节保留 │ 更好 │ 良好 │ │ MinerU标准 │ ✅ │ ❌ │ └─────────────┴──────────────┴──────────────┘ 示例: # 使用 pypdfium2(推荐,MinerU 标准) images, doc = load_images_from_pdf_unified( pdf_bytes, dpi=200, renderer="pypdfium2", threads=4 ) # 使用 PyMuPDF (fitz) images, doc = load_images_from_pdf_unified( pdf_bytes, dpi=200, renderer="fitz" ) # 访问图像 for img_dict in images: pil_image = img_dict['img_pil'] scale = img_dict['scale'] # 处理图像... 注意事项: 1. pypdfium2 在生产环境中更推荐,因为它是 MinerU 的标准实现 2. 两种渲染引擎可能产生略有不同的图像(SSIM ≈ 0.945) 3. 建议在同一项目中保持使用同一渲染引擎,避免不一致 4. 如果需要与现有测试图像对比,使用相同的渲染引擎 """ renderer = renderer.lower() if renderer in ["pypdfium2", "pdfium"]: return _load_images_pypdfium2( pdf_bytes, dpi, start_page_id, end_page_id, image_type, timeout, threads ) elif renderer in ["fitz", "pymupdf", "mupdf"]: return _load_images_fitz( pdf_bytes, dpi, start_page_id, end_page_id, image_type ) else: raise ValueError( f"不支持的渲染引擎: {renderer}. " f"请使用 'pypdfium2' 或 'fitz'" ) def _load_images_pypdfium2( pdf_bytes: bytes, dpi: int, start_page_id: int, end_page_id: Optional[int], image_type: str, timeout: Optional[int], threads: int ) -> Tuple[List[Dict[str, Any]], Any]: """使用 pypdfium2 渲染引擎(MinerU 标准)""" try: import pypdfium2 as pdfium from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images from mineru.utils.enum_class import ImageType except ImportError as e: raise ImportError( f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n" f"原始错误: {e}" ) # 转换 image_type img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64 # 使用 MinerU 的实现 images_list, pdf_doc = mineru_load_images( pdf_bytes=pdf_bytes, dpi=dpi, start_page_id=start_page_id, end_page_id=end_page_id, image_type=img_type, timeout=timeout, threads=threads ) logger.info( f"✅ pypdfium2 渲染完成: {len(images_list)} 页 " f"(DPI={dpi}, 多进程={threads})" ) return images_list, pdf_doc def _load_images_fitz( pdf_bytes: bytes, dpi: int, start_page_id: int, end_page_id: Optional[int], image_type: str ) -> Tuple[List[Dict[str, Any]], Any]: """使用 PyMuPDF (fitz) 渲染引擎""" try: import fitz except ImportError as e: raise ImportError( f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n" f"原始错误: {e}" ) from io import BytesIO import base64 # 打开 PDF doc = fitz.open(stream=pdf_bytes, filetype="pdf") pdf_page_num = doc.page_count # 处理 end_page_id if end_page_id is None or end_page_id < 0: end_page_id = pdf_page_num - 1 end_page_id = min(end_page_id, pdf_page_num - 1) # 渲染图像 images_list = [] mat = fitz.Matrix(dpi / 72, dpi / 72) for index in range(start_page_id, end_page_id + 1): page = doc[index] # 渲染为 pixmap pm = page.get_pixmap(matrix=mat, alpha=False) # 如果超过尺寸限制,降低到 72 DPI if pm.width > 4500 or pm.height > 4500: logger.warning( f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), " f"降低到 72 DPI" ) mat_fallback = fitz.Matrix(1, 1) # 72 DPI pm = page.get_pixmap(matrix=mat_fallback, alpha=False) # 转换为 PIL Image pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples) # 计算实际 scale page_rect = page.rect actual_scale = pm.width / page_rect.width # 构建返回字典 image_dict = { 'img_pil': pil_img, 'scale': actual_scale } # 如果需要 BASE64 if image_type.upper() == "BASE64": buffer = BytesIO() pil_img.save(buffer, format="JPEG") img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') image_dict['img_base64'] = img_base64 # 移除 img_pil 以节省内存 del image_dict['img_pil'] images_list.append(image_dict) logger.info( f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 " f"(DPI={dpi}, 单进程)" ) return images_list, doc