| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409 |
- """
- PDF处理工具模块(重构版)
- 提供PDF相关处理功能的统一入口:
- - PDF加载与分类
- - PDF文本提取(支持 pypdfium2 和 fitz)
- - PDF图像渲染(支持多种引擎)
- - 坐标转换(PDF坐标 ↔ 图像坐标)
- - 跨页表格合并(接口占位:merge_cross_page_tables 尚未实现,当前原样返回输入)
- - 页面范围解析与过滤
- 本模块已重构为多个子模块:
- - pdf_coordinate_transform: 坐标转换功能
- - pdf_text_extraction: 文本提取功能
- - pdf_image_rendering: 图像渲染功能
- - pdf_utils: 高级API和统一入口(本文件)
- 为保持向后兼容性,所有原有函数都从新模块重新导出。
- """
- from typing import Dict, List, Any, Optional, Tuple, Set
- from pathlib import Path
- from PIL import Image
- from loguru import logger
- # 导入页面范围解析函数(不依赖 MinerU)
- from .file_utils import parse_page_range
- # 从子模块导入功能
- from .pdf_coordinate_transform import (
- transform_bbox_for_rotation_fitz,
- transform_bbox_for_rotation_pypdfium2,
- pdf_rotation_to_image_rotation,
- )
- from .pdf_text_extraction import (
- detect_pdf_doc_type,
- bbox_overlap,
- extract_text_from_pdf,
- extract_text_from_pdf_pypdfium2,
- extract_text_from_pdf_fitz,
- extract_all_text_blocks,
- extract_all_text_blocks_pypdfium2,
- extract_all_text_blocks_fitz,
- detect_page_type,
- )
- from .pdf_image_rendering import (
- load_images_from_pdf_unified,
- load_images_pypdfium2,
- load_images_fitz,
- )
- # 导入 MinerU 组件
- try:
- from mineru.utils.pdf_classify import classify as pdf_classify
- from mineru.utils.enum_class import ImageType
- MINERU_AVAILABLE = True
- except ImportError:
- raise ImportError("MinerU components not available for PDF processing")
class PDFUtils:
    """
    High-level PDF processing facade (refactored).

    Every method is a ``@staticmethod``. Apart from
    ``load_and_classify_document`` and ``merge_cross_page_tables``, the
    methods are thin backward-compatible wrappers that forward their
    arguments unchanged to the module-level functions imported from:

    - ``file_utils``: page-range parsing
    - ``pdf_coordinate_transform``: coordinate transforms
    - ``pdf_text_extraction``: text extraction
    - ``pdf_image_rendering``: image rendering
    """

    # Single source of truth for the image suffixes accepted both for
    # directory scans and for single-image inputs (compared lower-cased).
    _IMAGE_EXTENSIONS = frozenset(
        {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
    )

    # ========================================================================
    # Page range / document type helpers (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Parse a page-range string (backward-compatible wrapper).

        Delegates to ``file_utils.parse_page_range``; new code should call
        that function directly.

        Supported formats (page numbers are 1-based on input and are
        converted to 0-based indices):
        - "1-5"        -> {0, 1, 2, 3, 4}
        - "3"          -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-"         -> from page 1 to the last page
        - "-5"         -> from page 1 to page 5

        Args:
            page_range: Page-range string (1-based page numbers).
            total_pages: Total number of pages.

        Returns:
            Set of 0-based page indices.
        """
        return parse_page_range(page_range, total_pages)

    @staticmethod
    def _detect_pdf_doc_type(pdf_doc: Any) -> str:
        """
        Detect the type of a PDF document object (backward-compatible wrapper).

        Args:
            pdf_doc: PDF document object.

        Returns:
            'pypdfium2' or 'fitz'.
        """
        return detect_pdf_doc_type(pdf_doc)

    # ========================================================================
    # Document loading
    # ========================================================================

    @staticmethod
    def _open_rgb_image(image_path: Path) -> Any:
        """
        Open an image file and return a fully loaded RGB ``PIL.Image``.

        ``Image.open`` is lazy and keeps the underlying file open until the
        pixel data is read. Loading (or converting) inside the ``with`` block
        guarantees the file handle is released before returning, so scanning
        a large image directory does not accumulate open descriptors.

        Args:
            image_path: Path of the image file.

        Returns:
            The image in 'RGB' mode; the opened image object itself when it is
            already RGB, otherwise the result of ``convert('RGB')``.
        """
        with Image.open(image_path) as img:
            if img.mode != 'RGB':
                # convert() reads the pixel data and returns a detached image.
                return img.convert('RGB')
            # Already RGB: force the lazy read while the file is still open.
            img.load()
            return img

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None,
        renderer: str = "fitz"
    ) -> Tuple[List[Dict], str, Optional[Any], str]:
        """
        Load a document, classify it, and apply optional page filtering.

        Three kinds of input are handled, checked in this order:
        a directory of images, a ``.pdf`` file, or a single image file.

        Args:
            document_path: Path of the document (directory, PDF or image).
            dpi: Rendering DPI for PDF pages.
            page_range: Page-range string such as "1-5,7,9-12".
                - PDF: by page number (1-based).
                - Image directory: by position after sorting the files
                  (1-based).
                Ignored for a single image file. The selection is whatever
                ``parse_page_range`` returns for this value.
            renderer: PDF rendering engine, "fitz" or "pypdfium2".

        Returns:
            (images_list, pdf_type, pdf_doc, renderer_used)
            - images_list: list of dicts with keys 'img_pil', 'scale',
              'source_path', 'page_idx' and 'page_name'.
            - pdf_type: result of ``pdf_classify`` for PDFs (documented
              upstream as 'ocr' or 'txt'); always 'ocr' for image inputs.
            - pdf_doc: PDF document object for PDF input, otherwise None.
            - renderer_used: the ``renderer`` argument, returned as given.

        Raises:
            RuntimeError: PDF input while MinerU components are unavailable.
            ValueError: The path is neither a directory, a PDF, nor a
                supported image file.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # image inputs are always processed in OCR mode
        all_images: List[Dict] = []
        suffix = document_path.suffix.lower()

        if document_path.is_dir():
            # Directory: every supported image, in sorted order, is one page.
            image_files = sorted(
                f for f in document_path.iterdir()
                if f.suffix.lower() in PDFUtils._IMAGE_EXTENSIONS
            )

            total_pages = len(image_files)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': PDFUtils._open_rgb_image(img_file),
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,
                    'page_name': img_file.stem
                })

        elif suffix == '.pdf':
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            pdf_bytes = document_path.read_bytes()

            # Classify the PDF before rendering it.
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # All pages are rendered first; the page filter is applied below.
            images_list, pdf_doc = load_images_from_pdf_unified(
                pdf_bytes,
                dpi=dpi,
                image_type=ImageType.PIL,
                renderer=renderer
            )

            total_pages = len(images_list)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to the DPI-derived scale (PDF points are 1/72 inch).
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx,
                    'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
                })

        elif suffix in PDFUtils._IMAGE_EXTENSIONS:
            # Single image file: treated as a one-page document.
            all_images.append({
                'img_pil': PDFUtils._open_rgb_image(document_path),
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc, renderer

    # ========================================================================
    # Coordinate transforms (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def _transform_bbox_for_rotation_fitz(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox transform for the fitz engine."""
        return transform_bbox_for_rotation_fitz(bbox, rotation, pdf_width, pdf_height, scale)

    @staticmethod
    def _transform_bbox_for_rotation_pypdfium2(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox transform for the pypdfium2 engine."""
        return transform_bbox_for_rotation_pypdfium2(bbox, rotation, pdf_width, pdf_height, scale)

    # ========================================================================
    # Text extraction (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract text from a region of a PDF page."""
        return extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def _extract_text_from_pdf_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract region text using pypdfium2."""
        return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def _extract_text_from_pdf_fitz(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract region text using fitz."""
        return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def extract_all_text_blocks(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks of a page."""
        return extract_all_text_blocks(pdf_doc, page_idx, scale)

    @staticmethod
    def _extract_all_text_blocks_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using pypdfium2."""
        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)

    @staticmethod
    def _extract_all_text_blocks_fitz(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using fitz."""
        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Backward-compatible wrapper: check whether two bboxes overlap."""
        return bbox_overlap(bbox1, bbox2)

    # ========================================================================
    # Image rendering (backward-compatible wrappers)
    # ========================================================================

    @staticmethod
    def load_images_from_pdf_unified(
        pdf_bytes: bytes,
        dpi: int = 200,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        image_type: str = "PIL",
        renderer: str = "pypdfium2",
        timeout: Optional[int] = None,
        threads: int = 4,
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: unified PDF page-image loading entry point."""
        # Arguments are forwarded positionally, in declaration order.
        return load_images_from_pdf_unified(
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, renderer, timeout, threads
        )

    @staticmethod
    def _load_images_pypdfium2(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str,
        timeout: Optional[int],
        threads: int
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render pages using pypdfium2."""
        return load_images_pypdfium2(
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, timeout, threads
        )

    @staticmethod
    def _load_images_fitz(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render pages using fitz."""
        return load_images_fitz(
            pdf_bytes, dpi, start_page_id, end_page_id, image_type
        )

    @staticmethod
    def detect_page_type(
        pdf_doc: Any,
        page_idx: int,
        char_threshold: int = 50
    ) -> str:
        """
        Detect whether a page is a text PDF page or a scanned (OCR) page.

        Delegates to ``pdf_text_extraction.detect_page_type``.

        Returns:
            Page type: 'txt' or 'ocr'.
        """
        return detect_page_type(pdf_doc, page_idx, char_threshold)

    # ========================================================================
    # Other functionality
    # ========================================================================

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge tables that span multiple pages.

        Not implemented yet: the input is currently returned unchanged.
        MinerU's cross_page_table_merge may serve as a reference.

        Args:
            results: Dictionary of processing results.

        Returns:
            The merged results (currently the same ``results`` object).
        """
        # TODO: implement the cross-page table merging logic
        return results
|