zhengchun
/
ocr_platform


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
							"""
PDF处理工具模块（重构版）

提供PDF相关处理功能的统一入口：
- PDF加载与分类
- PDF文本提取（支持 pypdfium2 和 fitz）
- PDF图像渲染（支持多种引擎）
- 坐标转换（PDF坐标 ↔ 图像坐标）
- 跨页表格合并
- 页面范围解析与过滤

本模块已重构为多个子模块：
- pdf_coordinate_transform: 坐标转换功能
- pdf_text_extraction: 文本提取功能
- pdf_image_rendering: 图像渲染功能
- pdf_utils: 高级API和统一入口（本文件）

为保持向后兼容性，所有原有函数都从新模块重新导出。
"""
from typing import Dict, List, Any, Optional, Tuple, Set
from pathlib import Path
from PIL import Image
from loguru import logger

# 导入页面范围解析函数（不依赖 MinerU）
from .file_utils import parse_page_range

# 从子模块导入功能
from .pdf_coordinate_transform import (
    transform_bbox_for_rotation_fitz,
    transform_bbox_for_rotation_pypdfium2,
    pdf_rotation_to_image_rotation,
)

from .pdf_text_extraction import (
    detect_pdf_doc_type,
    bbox_overlap,
    extract_text_from_pdf,
    extract_text_from_pdf_pypdfium2,
    extract_text_from_pdf_fitz,
    extract_all_text_blocks,
    extract_all_text_blocks_pypdfium2,
    extract_all_text_blocks_fitz,
    detect_page_type,
    get_page_rotation,
)

from .pdf_image_rendering import (
    load_images_from_pdf_unified,
    load_images_pypdfium2,
    load_images_fitz,
)

# Import the MinerU components. They are only used when a PDF is loaded;
# image files and image directories are handled without them. When MinerU is
# missing the module must still import, so the names are bound to None and
# MINERU_AVAILABLE is set to False. Code that needs MinerU checks this flag
# and raises at call time instead of at import time.
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.enum_class import ImageType
    MINERU_AVAILABLE = True
except ImportError:
    pdf_classify = None
    ImageType = None
    MINERU_AVAILABLE = False


class PDFUtils:
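    """
    PDF processing utility class (refactored).

    Provides the high-level API for PDF processing and delegates the actual
    work to the refactored submodules. The original interface is kept
    unchanged for backward compatibility.

    Submodules:
    - pdf_coordinate_transform: coordinate transformation
    - pdf_text_extraction: text extraction
    - pdf_image_rendering: image rendering
    """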
    
    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Parse a page-range string (backward-compatible wrapper).

        Thin wrapper around ``file_utils.parse_page_range``; new code should
        call that function directly.

        Documented input formats (page numbers are 1-based, results 0-based):
        - "1-5" -> {0, 1, 2, 3, 4}
        - "3" -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-" -> from page 1 to the last page
        - "-5" -> from page 1 to page 5

        Args:
            page_range: Page-range string (1-based page numbers).
            total_pages: Total number of pages.

        Returns:
            Set of 0-based page indices.
        """
        selected_indices = parse_page_range(page_range, total_pages)
        return selected_indices

    @staticmethod
    def _detect_pdf_doc_type(pdf_doc: Any) -> str:
        """
        Detect which library a PDF document object belongs to
        (backward-compatible wrapper around ``detect_pdf_doc_type``).

        Args:
            pdf_doc: PDF document object.

        Returns:
            'pypdfium2' or 'fitz'.
        """
        doc_type = detect_pdf_doc_type(pdf_doc)
        return doc_type
    
    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None,
        renderer: str = "fitz"
    ) -> Tuple[List[Dict], str, Optional[Any], str]:
        """
        加载文档并分类，支持页面范围过滤
        
        Args:
            document_path: 文档路径
            dpi: PDF渲染DPI
            page_range: 页面范围字符串，如 "1-5,7,9-12"
                       - PDF：按页码（从1开始）
                       - 图片目录：按文件名排序后的位置（从1开始）
            renderer: PDF渲染引擎，"fitz" 或 "pypdfium2"
            
        Returns:
            (images_list, pdf_type, pdf_doc, renderer_used)
            - images_list: 图像列表，每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
            - pdf_type: 'ocr' 或 'txt'
            - pdf_doc: PDF文档对象（如果是PDF）
            - renderer_used: 实际使用的渲染器类型
        """
        pdf_doc = None
        pdf_type = 'ocr'  # 默认使用OCR模式
        all_images = []
        
        if document_path.is_dir():
            # 处理目录：遍历所有图片
            image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
            image_files = sorted([
                f for f in document_path.iterdir() 
                if f.suffix.lower() in image_extensions
            ])
            
            # 解析页面范围
            total_pages = len(image_files)
            selected_pages = parse_page_range(page_range, total_pages)
            
            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张，选择处理 {len(selected_pages)} 张")
            
            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue
                
                img = Image.open(img_file)
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                all_images.append({
                    'img_pil': img,
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,
                    'page_name': img_file.stem
                })
            
            pdf_type = 'ocr'
            
        elif document_path.suffix.lower() == '.pdf':
            # 处理PDF文件
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")
            
            with open(document_path, 'rb') as f:
                pdf_bytes = f.read()
            
            # PDF分类
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")
            
            # 加载图像（使用重构后的函数）
            images_list, pdf_doc = load_images_from_pdf_unified(
                pdf_bytes, 
                dpi=dpi,
                image_type=ImageType.PIL,
                renderer=renderer
            )
            
            # 解析页面范围
            total_pages = len(images_list)
            selected_pages = parse_page_range(page_range, total_pages)
            
            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页，选择处理 {len(selected_pages)} 页")
            
            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue
                
                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx,
                    'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
                })
                
        elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
            # 处理单个图片
            img = Image.open(document_path)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            all_images.append({
                'img_pil': img,
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })
            pdf_type = 'ocr'
            
        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")
        
        return all_images, pdf_type, pdf_doc, renderer
    

    @staticmethod
    def _transform_bbox_for_rotation_fitz(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox coordinate transform for the fitz engine.

        Forwards all arguments, in order, to ``transform_bbox_for_rotation_fitz``.
        """
        forwarded = (bbox, rotation, pdf_width, pdf_height, scale)
        return transform_bbox_for_rotation_fitz(*forwarded)

    @staticmethod
    def _transform_bbox_for_rotation_pypdfium2(
        bbox: List[float],
        rotation: int,
        pdf_width: float,
        pdf_height: float,
        scale: float
    ) -> List[float]:
        """Backward-compatible wrapper: bbox coordinate transform for the pypdfium2 engine.

        Forwards all arguments, in order, to
        ``transform_bbox_for_rotation_pypdfium2``.
        """
        forwarded = (bbox, rotation, pdf_width, pdf_height, scale)
        return transform_bbox_for_rotation_pypdfium2(*forwarded)

    # ========================================================================
    # Text extraction functions (backward-compatible wrappers)
    # ========================================================================
    
    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract text from a region of a PDF page.

        Forwards all arguments, in order, to the module-level
        ``extract_text_from_pdf`` and returns its result unchanged.
        """
        extraction = extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)
        return extraction
    
    @staticmethod
    def _extract_text_from_pdf_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract region text using pypdfium2.

        Forwards all arguments, in order, to
        ``extract_text_from_pdf_pypdfium2`` and returns its result unchanged.
        """
        extraction = extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
        return extraction
    
    @staticmethod
    def _extract_text_from_pdf_fitz(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Backward-compatible wrapper: extract region text using fitz.

        Forwards all arguments, in order, to ``extract_text_from_pdf_fitz``
        and returns its result unchanged.
        """
        extraction = extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
        return extraction

    @staticmethod
    def extract_all_text_blocks(
        pdf_doc: Any,
        page_idx: int,
        scale: float,
        return_upright_coords: bool = True,
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks of a page.

        Forwards all arguments, in order, to the module-level
        ``extract_all_text_blocks``.

        Args:
            return_upright_coords: Whether to return upright coordinates
                (True = upright, False = rotated).
        """
        forwarded = (pdf_doc, page_idx, scale, return_upright_coords)
        return extract_all_text_blocks(*forwarded)

    @staticmethod
    def _extract_all_text_blocks_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using pypdfium2.

        Forwards all arguments, in order, to
        ``extract_all_text_blocks_pypdfium2`` and returns its result unchanged.
        """
        blocks_and_rotation = extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
        return blocks_and_rotation

    @staticmethod
    def _extract_all_text_blocks_fitz(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> Tuple[List[Dict[str, Any]], int]:
        """Backward-compatible wrapper: extract all text blocks using fitz.

        Forwards all arguments, in order, to ``extract_all_text_blocks_fitz``
        and returns its result unchanged.
        """
        blocks_and_rotation = extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
        return blocks_and_rotation

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Backward-compatible wrapper: check whether two bboxes overlap.

        Delegates to ``bbox_overlap`` and returns its result unchanged.
        """
        overlaps = bbox_overlap(bbox1, bbox2)
        return overlaps
    
    # ========================================================================
    # Image rendering functions (backward-compatible wrappers)
    # ========================================================================
    
    @staticmethod
    def load_images_from_pdf_unified(
        pdf_bytes: bytes,
        dpi: int = 200,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        image_type: str = "PIL",
        renderer: str = "pypdfium2",
        timeout: Optional[int] = None,
        threads: int = 4,
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: unified PDF image loading interface.

        Forwards all arguments positionally, in declaration order, to the
        module-level ``load_images_from_pdf_unified`` and returns its result
        unchanged.
        """
        forwarded = (
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, renderer, timeout, threads,
        )
        return load_images_from_pdf_unified(*forwarded)

    @staticmethod
    def _load_images_pypdfium2(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str,
        timeout: Optional[int],
        threads: int
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render PDF pages using pypdfium2.

        Forwards all arguments positionally, in declaration order, to
        ``load_images_pypdfium2`` and returns its result unchanged.
        """
        forwarded = (
            pdf_bytes, dpi, start_page_id, end_page_id,
            image_type, timeout, threads,
        )
        return load_images_pypdfium2(*forwarded)

    @staticmethod
    def _load_images_fitz(
        pdf_bytes: bytes,
        dpi: int,
        start_page_id: int,
        end_page_id: Optional[int],
        image_type: str
    ) -> Tuple[List[Dict[str, Any]], Any]:
        """Backward-compatible wrapper: render PDF pages using fitz.

        Forwards all arguments positionally, in declaration order, to
        ``load_images_fitz`` and returns its result unchanged.
        """
        forwarded = (pdf_bytes, dpi, start_page_id, end_page_id, image_type)
        return load_images_fitz(*forwarded)
    
    @staticmethod
    def detect_page_type(
        pdf_doc: Any, 
        page_idx: int,
        char_threshold: int = 50
    ) -> str:
        """
        Detect the type of a page (text PDF or scanned/OCR).

        Delegates to the module-level ``detect_page_type``.

        Args:
            pdf_doc: PDF document object.
            page_idx: Page index.
            char_threshold: Threshold forwarded to the helper; presumably a
                character count used to tell text pages from scanned ones
                (confirm in ``pdf_text_extraction``).

        Returns:
            Page type: 'txt' or 'ocr'.
        """
        page_type = detect_page_type(pdf_doc, page_idx, char_threshold)
        return page_type

    @staticmethod
    def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
        """
        Return the rotation angle of a PDF page, for use in image rotation.

        Delegates to the module-level ``get_page_rotation``.

        Args:
            pdf_doc: PDF document object (pypdfium2.PdfDocument or fitz.Document).
            page_idx: Page index (0-based).

        Returns:
            Rotation angle: 0/90/180/270, documented as a counter-clockwise
            angle intended for image rotation functions such as PIL's
            ``rotate()``. NOTE(review): the example below negates the angle;
            confirm the sign convention against the helper.

        Examples:
            >>> rotate_angle = PDFUtils.get_page_rotation(pdf_doc, 0)
            >>> if rotate_angle != 0:
            >>>     image = image.rotate(-rotate_angle, expand=True)  # rotate upright
        """
        rotation_angle = get_page_rotation(pdf_doc, page_idx)
        return rotation_angle

    # ========================================================================
    # Other functionality
    # ========================================================================
    
    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        合并跨页表格
        
        TODO: 实现跨页表格合并逻辑
        可以参考 MinerU 的 cross_page_table_merge 实现
        
        Args:
            results: 处理结果字典
            
        Returns:
            合并后的结果
        """
        # TODO: 实现跨页表格合并逻辑
        return results