"""
PDF processing utilities (refactored).

Unified entry point for PDF-related processing:
- PDF loading and classification
- PDF text extraction (pypdfium2 and fitz)
- PDF image rendering (multiple engines)
- Coordinate transforms (PDF coordinates <-> image coordinates)
- Cross-page table merging
- Page-range parsing and filtering

This module has been split into several sub-modules:
- pdf_coordinate_transform: coordinate transforms
- pdf_text_extraction: text extraction
- pdf_image_rendering: image rendering
- pdf_utils: high-level API and unified entry point (this file)

For backward compatibility, all original functions are re-exported from the
new modules.
"""
from typing import Dict, List, Any, Optional, Tuple, Set
from pathlib import Path
from PIL import Image
from loguru import logger

# Page-range parsing (does not depend on MinerU).
from .file_utils import parse_page_range

# Functionality provided by the refactored sub-modules. Some of these names
# are not used below; they are imported so they stay importable from here.
from .pdf_coordinate_transform import (
    transform_bbox_for_rotation_fitz,
    transform_bbox_for_rotation_pypdfium2,
    pdf_rotation_to_image_rotation,
)
from .pdf_text_extraction import (
    detect_pdf_doc_type,
    bbox_overlap,
    extract_text_from_pdf,
    extract_text_from_pdf_pypdfium2,
    extract_text_from_pdf_fitz,
    extract_all_text_blocks,
    extract_all_text_blocks_pypdfium2,
    extract_all_text_blocks_fitz,
)
from .pdf_image_rendering import (
    load_images_from_pdf_unified,
    load_images_pypdfium2,
    load_images_fitz,
)

# MinerU components. Importing this module fails if MinerU is missing.
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.enum_class import ImageType
    MINERU_AVAILABLE = True
except ImportError as exc:
    # Chain the original error so the real cause (which sub-module failed to
    # import) is visible in the traceback.
    raise ImportError("MinerU components not available for PDF processing") from exc


# File suffixes (lower-case, with leading dot) treated as raster images.
_IMAGE_EXTENSIONS = frozenset({'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'})


def _load_rgb_image(image_path: Path) -> Image.Image:
    """
    Open an image file and return it as a fully loaded RGB image.

    ``Image.open`` is lazy and keeps the underlying file open until the pixel
    data is read. Converting inside a ``with`` block forces the data to be
    read and closes the file deterministically, so loading a large directory
    does not accumulate open file handles.

    Args:
        image_path: Path of the image file to read.

    Returns:
        An in-memory image in mode 'RGB'. This is a converted copy, so
        file-specific attributes of the opened image (such as ``format``)
        are not carried over.
    """
    with Image.open(image_path) as src:
        return src.convert('RGB')


class PDFUtils:
    """
    PDF processing helper class (refactored).

    Provides the high-level PDF processing API and delegates to the
    refactored sub-modules. The original interface is kept unchanged for
    backward compatibility.

    Sub-modules:
    - pdf_coordinate_transform: coordinate transforms
    - pdf_text_extraction: text extraction
    - pdf_image_rendering: image rendering
    """

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Parse a page-range string (backward-compatible wrapper).

        Thin wrapper around ``file_utils.parse_page_range``; new code should
        call that function directly.

        Supported formats:
        - "1-5" -> {0, 1, 2, 3, 4} (page numbers are 1-based, result is 0-based)
        - "3" -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-" -> from page 1 to the last page
        - "-5" -> from page 1 to page 5

        Args:
            page_range: Page-range string (1-based page numbers).
            total_pages: Total number of pages.

        Returns:
            Set of 0-based page indices.
        """
        return parse_page_range(page_range, total_pages)

    @staticmethod
    def _detect_pdf_doc_type(pdf_doc: Any) -> str:
        """
        Detect the type of a PDF document object (backward-compatible wrapper).

        Args:
            pdf_doc: PDF document object.

        Returns:
            'pypdfium2' or 'fitz'.
        """
        return detect_pdf_doc_type(pdf_doc)

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None,
        renderer: str = "fitz"
    ) -> Tuple[List[Dict], str, Optional[Any], str]:
        """
        Load a document, classify it, and optionally filter by page range.

        Accepts a directory of images, a single PDF file, or a single image
        file.

        Args:
            document_path: Path to the document or image directory.
            dpi: DPI used when rendering PDF pages.
            page_range: Page-range string such as "1-5,7,9-12".
                - PDF: by page number (1-based).
                - Image directory: by position after sorting the files
                  (1-based).
                Not applied to a single image file.
            renderer: PDF rendering engine, "fitz" or "pypdfium2".

        Returns:
            (images_list, pdf_type, pdf_doc, renderer_used)
            - images_list: list of dicts with keys 'img_pil', 'scale',
              'source_path', 'page_idx' and 'page_name'.
            - pdf_type: result of the MinerU classifier for PDFs (the
              original documentation lists 'ocr' or 'txt'); always 'ocr'
              for images.
            - pdf_doc: PDF document object for PDFs, otherwise None.
            - renderer_used: the ``renderer`` argument, returned as given.

        Raises:
            RuntimeError: If the input is a PDF and MinerU is unavailable.
            ValueError: If the file suffix is not supported.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # OCR mode is the default
        all_images = []
        suffix = document_path.suffix.lower()

        if document_path.is_dir():
            # Directory input: collect the image files in sorted order. The
            # is_file() check skips sub-directories whose name happens to end
            # with an image suffix.
            image_files = sorted(
                f for f in document_path.iterdir()
                if f.is_file() and f.suffix.lower() in _IMAGE_EXTENSIONS
            )

            # Resolve the requested page range against the file count.
            total_pages = len(image_files)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': _load_rgb_image(img_file),
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,
                    'page_name': img_file.stem
                })

        elif suffix == '.pdf':
            # Defensive guard: the module-level import currently raises when
            # MinerU is missing, so this flag is always True at this point.
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            pdf_bytes = document_path.read_bytes()

            # Classify the PDF.
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # Render the pages through the refactored rendering module.
            # NOTE(review): all pages are rendered before the page range is
            # applied; passing start/end page ids to the renderer could avoid
            # that, but its index semantics are not visible here.
            images_list, pdf_doc = load_images_from_pdf_unified(
                pdf_bytes,
                dpi=dpi,
                image_type=ImageType.PIL,
                renderer=renderer
            )

            # Resolve the requested page range against the page count.
            total_pages = len(images_list)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to dpi / 72 when the renderer gives no scale.
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx,
                    'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
                })

        elif suffix in _IMAGE_EXTENSIONS:
            # Single image file.
            all_images.append({
                'img_pil': _load_rgb_image(document_path),
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc, renderer

    # ========================================================================
    # Coordinate transforms (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def _transform_bbox_for_rotation_fitz(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox transform for the fitz engine."""
        return transform_bbox_for_rotation_fitz(bbox, rotation, pdf_width, pdf_height, scale)

    @staticmethod
    def _transform_bbox_for_rotation_pypdfium2(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox transform for the pypdfium2 engine."""
        return transform_bbox_for_rotation_pypdfium2(bbox, rotation, pdf_width, pdf_height, scale)

    # ========================================================================
    # Text extraction (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract text from a region of a PDF page."""
        return extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def _extract_text_from_pdf_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract text using pypdfium2."""
        return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def _extract_text_from_pdf_fitz(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract text using fitz."""
        return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def extract_all_text_blocks(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks of a page."""
        return extract_all_text_blocks(pdf_doc, page_idx, scale)

    @staticmethod
    def _extract_all_text_blocks_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using pypdfium2."""
        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)

    @staticmethod
    def _extract_all_text_blocks_fitz(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using fitz."""
        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Backward-compatible wrapper: check whether two bboxes overlap."""
        return bbox_overlap(bbox1, bbox2)

    # ========================================================================
    # Image rendering (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def load_images_from_pdf_unified(
        pdf_bytes: bytes,
        dpi: int = 200,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        image_type: str = "PIL",
        renderer: str = "pypdfium2",
        timeout: Optional[int] = None,
        threads: int = 4,
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: unified PDF image loading interface."""
        return load_images_from_pdf_unified(
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, renderer, timeout, threads
        )

    @staticmethod
    def _load_images_pypdfium2(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str,
        timeout: Optional[int],
        threads: int
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render using pypdfium2."""
        return load_images_pypdfium2(
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, timeout, threads
        )

    @staticmethod
    def _load_images_fitz(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render using fitz."""
        return load_images_fitz(
            pdf_bytes, dpi, start_page_id, end_page_id, image_type
        )

    # ========================================================================
    # Other functionality
    # ========================================================================

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge tables that span multiple pages.

        Not implemented yet: the input is currently returned unchanged.
        MinerU's cross_page_table_merge implementation may serve as a
        reference.

        Args:
            results: Processing result dictionary.

        Returns:
            The merged results (currently the same object that was passed in).
        """
        # TODO: implement cross-page table merging.
        return results