# zhengchun/ocr_platform
"""
PDF处理工具模块

提供PDF相关处理功能：
- PDF加载与分类
- PDF文本提取
- 跨页表格合并
- 页面范围解析与过滤
"""
from typing import Dict, List, Any, Optional, Tuple, Set
from pathlib import Path
from PIL import Image
from loguru import logger
import re

# 导入页面范围解析函数（不依赖 MinerU）
from .file_utils import parse_page_range

# 导入 MinerU 组件
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.pdf_image_tools import load_images_from_pdf
    from mineru.utils.enum_class import ImageType
    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
    MINERU_AVAILABLE = True
except ImportError:
    raise ImportError("MinerU components not available for PDF processing")

class PDFUtils:
    """Namespace of static helpers for loading documents and reading PDF text.

    Covers loading/classifying an input (PDF, single image, or image
    directory), extracting text from a PDF page through either pypdfium2 or
    PyMuPDF (fitz), and a placeholder for cross-page table merging.
    """

    # Lower-case file suffixes accepted as raster image inputs.
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif')

    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        Backward-compatible wrapper around ``file_utils.parse_page_range``.

        New code should call ``file_utils.parse_page_range`` directly. The
        formats below are the ones documented for the wrapped function
        (page numbers are 1-based in the string, 0-based in the result):

        - "1-5" -> {0, 1, 2, 3, 4}
        - "3" -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-" -> from page 1 to the last page
        - "-5" -> from page 1 to page 5

        Args:
            page_range: Page range string (1-based page numbers), or None.
            total_pages: Total number of pages.

        Returns:
            Set of 0-based page indices.
        """
        return parse_page_range(page_range, total_pages)

    @staticmethod
    def _detect_pdf_doc_type(pdf_doc: Any) -> str:
        """
        Guess which library a PDF document object belongs to.

        The class's module and name are checked first; only when neither is
        conclusive does the method fall back to duck typing.

        Args:
            pdf_doc: PDF document object.

        Returns:
            'pypdfium2' or 'fitz'.
        """
        doc_cls = type(pdf_doc)
        type_name = doc_cls.__name__
        module_name = doc_cls.__module__.lower()

        if 'pdfium' in module_name or 'PdfDocument' in type_name:
            return 'pypdfium2'
        # Recent PyMuPDF releases expose their classes from the ``pymupdf``
        # module, with ``fitz`` kept as an alias.
        if 'fitz' in module_name or 'pymupdf' in module_name or 'Document' in type_name:
            return 'fitz'

        # Duck-typing fallback. ``page_count`` is the fitz.Document attribute.
        # ``get_page`` is deliberately NOT treated as a fitz marker: it is the
        # page accessor of pypdfium2.PdfDocument, so using it here would send
        # pypdfium2-like objects down the fitz code path.
        if hasattr(pdf_doc, 'page_count'):
            return 'fitz'
        return 'pypdfium2'

    @staticmethod
    def _open_rgb_image(image_path: Path) -> Any:
        """
        Load an image file as a detached RGB ``PIL.Image``.

        ``Image.open`` is lazy and keeps the file open until the pixel data is
        read. Converting inside a ``with`` block forces the read and closes the
        handle, so loading a large directory does not accumulate open files.
        """
        with Image.open(image_path) as source_img:
            # convert() returns a new, fully loaded image even when the mode
            # is already RGB, so the result stays valid after the file closes.
            return source_img.convert('RGB')

    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None,
        renderer: str = "fitz"
    ) -> Tuple[List[Dict], str, Optional[Any], str]:
        """
        Load a document as page images and classify it, honouring a page range.

        Args:
            document_path: Path to a PDF, a single image, or a directory of
                images.
            dpi: Rendering DPI used for PDFs.
            page_range: Page range string such as "1-5,7,9-12".
                - PDF: 1-based page numbers.
                - Image directory: 1-based position after sorting the files.
                Not applied to a single image file.
            renderer: PDF rendering engine, "fitz" or "pypdfium2".

        Returns:
            ``(images_list, pdf_type, pdf_doc, renderer)``
            - images_list: one dict per selected page with keys 'img_pil',
              'scale', 'source_path', 'page_idx' (original 0-based index) and
              'page_name'.
            - pdf_type: result of ``pdf_classify`` for PDFs; always 'ocr' for
              image inputs.
            - pdf_doc: the PDF document object for PDFs, otherwise None.
            - renderer: the ``renderer`` argument, returned unchanged (also
              for image inputs, where no renderer is actually used).

        Raises:
            RuntimeError: If the input is a PDF and MinerU is flagged as
                unavailable.
            ValueError: If the file suffix is not supported.
        """
        pdf_doc = None
        pdf_type = 'ocr'  # OCR mode unless the PDF classifier says otherwise
        all_images = []
        suffix = document_path.suffix.lower()

        if document_path.is_dir():
            # Directory input: every supported image file, sorted by path.
            image_files = sorted(
                entry for entry in document_path.iterdir()
                if entry.suffix.lower() in PDFUtils._IMAGE_SUFFIXES
            )

            total_pages = len(image_files)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张，选择处理 {len(selected_pages)} 张")

            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': PDFUtils._open_rgb_image(img_file),
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,  # position in the sorted listing
                    'page_name': img_file.stem  # file name without extension
                })

        elif suffix == '.pdf':
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")

            pdf_bytes = document_path.read_bytes()

            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")

            # load_images_from_pdf_unified takes image_type as the plain string
            # "PIL"/"BASE64" and compares it case-insensitively against those
            # literals. Passing MinerU's ImageType constant instead is only
            # correct if its value happens to spell "PIL", so the documented
            # string is passed here.
            images_list, pdf_doc = load_images_from_pdf_unified(
                pdf_bytes,
                dpi=dpi,
                image_type="PIL",
                renderer=renderer
            )

            total_pages = len(images_list)
            selected_pages = parse_page_range(page_range, total_pages)

            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页，选择处理 {len(selected_pages)} 页")

            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue

                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    # Fall back to the nominal scale if the loader gave none.
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx,  # original 0-based page index
                    'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
                })

        elif suffix in PDFUtils._IMAGE_SUFFIXES:
            # Single image file.
            all_images.append({
                'img_pil': PDFUtils._open_rgb_image(document_path),
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })

        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")

        return all_images, pdf_type, pdf_doc, renderer

    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """
        Extract the text inside a region of a PDF page (pypdfium2 or fitz).

        Args:
            pdf_doc: PDF document object (pypdfium2.PdfDocument or
                fitz.Document).
            page_idx: 0-based page index.
            bbox: Target region in image coordinates.
            scale: Image-to-PDF scale factor; bbox values are divided by it.

        Returns:
            ``(text, success)``; ``success`` is False when no text was found
            or extraction failed.
        """
        if PDFUtils._detect_pdf_doc_type(pdf_doc) == 'fitz':
            return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
        return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)

    @staticmethod
    def _normalize_line_bbox(raw_bbox: Any) -> Optional[Any]:
        """
        Reduce a line's 'bbox' entry to an indexable box of 4+ coordinates.

        Accepts either an object exposing a ``.bbox`` attribute or a
        list/tuple with at least four items.

        Returns:
            The coordinates, or None when the entry is unusable.
        """
        if raw_bbox and hasattr(raw_bbox, 'bbox'):
            return raw_bbox.bbox
        if isinstance(raw_bbox, (list, tuple)) and len(raw_bbox) >= 4:
            return list(raw_bbox)
        return None

    @staticmethod
    def _extract_text_from_pdf_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Extract region text from a pypdfium2 document via MinerU's page parser.

        Spans of every line whose bbox overlaps the region are joined with
        single spaces. Any exception is logged at debug level and reported as
        ``("", False)``.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            logger.error("MinerU pdf_text_tool not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Image coordinates -> PDF coordinates.
            pdf_bbox = [bbox[i] / scale for i in range(4)]

            page_dict = pdf_get_page_text(page)

            text_parts = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    line_bbox = PDFUtils._normalize_line_bbox(line.get('bbox'))
                    if line_bbox is None:
                        continue
                    if not PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
                        continue
                    text_parts.extend(
                        span_text
                        for span_text in (
                            span.get('text', '') for span in line.get('spans', [])
                        )
                        if span_text
                    )

            text = ' '.join(text_parts).strip()
            return text, bool(text)

        except Exception as e:
            import traceback
            logger.debug(f"pypdfium2 text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False

    @staticmethod
    def _extract_text_from_pdf_fitz(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """Extract region text from a fitz document using a clip rectangle.

        Returns ``("", False)`` when PyMuPDF cannot be imported or extraction
        raises; the error is logged.
        """
        try:
            import fitz
        except ImportError:
            logger.error("PyMuPDF (fitz) not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Image coordinates -> PDF coordinates.
            clip_rect = fitz.Rect(*(bbox[i] / scale for i in range(4)))

            text = page.get_text("text", clip=clip_rect).strip()
            return text, bool(text)

        except Exception as e:
            import traceback
            logger.debug(f"fitz text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False

    @staticmethod
    def extract_all_text_blocks(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """
        Extract every non-empty text line of a page (pypdfium2 or fitz).

        Args:
            pdf_doc: PDF document object.
            page_idx: 0-based page index.
            scale: PDF-to-image scale factor applied to each line bbox.

        Returns:
            List of dicts with 'text', 'bbox' (image coordinates) and
            'origin_bbox' (PDF coordinates); empty on failure.
        """
        if PDFUtils._detect_pdf_doc_type(pdf_doc) == 'fitz':
            return PDFUtils._extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
        return PDFUtils._extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)

    @staticmethod
    def _extract_all_text_blocks_pypdfium2(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """Collect all text lines of a pypdfium2 page via MinerU's page parser.

        Returns an empty list when MinerU is unavailable or parsing raises.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            return []

        try:
            page_dict = pdf_get_page_text(pdf_doc[page_idx])

            extracted_blocks = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    line_text = "".join(
                        span.get('text', "") for span in line.get('spans', [])
                    )
                    if not line_text.strip():
                        continue

                    line_bbox = PDFUtils._normalize_line_bbox(line.get('bbox'))
                    if line_bbox is None:
                        continue

                    extracted_blocks.append({
                        'text': line_text,
                        # PDF coordinates -> image coordinates.
                        'bbox': [line_bbox[i] * scale for i in range(4)],
                        'origin_bbox': line_bbox
                    })

            return extracted_blocks

        except Exception as e:
            logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            return []

    @staticmethod
    def _extract_all_text_blocks_fitz(
        pdf_doc: Any,
        page_idx: int,
        scale: float
    ) -> List[Dict[str, Any]]:
        """Collect all text lines of a fitz page from ``get_text("dict")``.

        Returns an empty list when PyMuPDF cannot be imported or parsing
        raises.
        """
        try:
            import fitz  # noqa: F401 - imported only to confirm availability
        except ImportError:
            logger.warning("PyMuPDF (fitz) not available")
            return []

        try:
            text_dict = pdf_doc[page_idx].get_text("dict")

            extracted_blocks = []
            for block in text_dict.get("blocks", []):
                # Only blocks with type == 0 are processed as text.
                if block.get("type") != 0:
                    continue

                for line in block.get("lines", []):
                    line_bbox = line.get("bbox")
                    line_text = "".join(
                        span.get("text", "") for span in line.get("spans", [])
                    )
                    if not line_text.strip() or not line_bbox:
                        continue

                    extracted_blocks.append({
                        'text': line_text,
                        # PDF coordinates -> image coordinates.
                        'bbox': [line_bbox[i] * scale for i in range(4)],
                        'origin_bbox': list(line_bbox)
                    })

            return extracted_blocks

        except Exception as e:
            logger.warning(f"fitz extract_all_text_blocks failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            return []

    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """Return True when two ``[x1, y1, x2, y2]`` boxes overlap or touch.

        Boxes with fewer than four values never overlap.
        """
        if len(bbox1) < 4 or len(bbox2) < 4:
            return False

        ax1, ay1, ax2, ay2 = bbox1[:4]
        bx1, by1, bx2, by2 = bbox2[:4]

        # Separated on either axis means no overlap; shared edges still count.
        separated_x = ax2 < bx1 or bx2 < ax1
        separated_y = ay2 < by1 or by2 < ay1
        return not (separated_x or separated_y)

    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge tables that span multiple pages (not implemented yet).

        TODO: implement the merge; MinerU's cross_page_table_merge can serve
        as a reference.

        Args:
            results: Processing result dictionary.

        Returns:
            Currently the same ``results`` object, unchanged.
        """
        # TODO: implement cross-page table merging.
        return results


# ============================================================================
# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
# ============================================================================

def load_images_from_pdf_unified(
    pdf_bytes: bytes,
    dpi: int = 200,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    image_type: str = "PIL",
    renderer: str = "pypdfium2",
    timeout: Optional[int] = None,
    threads: int = 4,
) -> Tuple[List[Dict[str, Any]], Any]:
    """
    Render PDF pages to images with a selectable rendering engine.

    This function only dispatches on ``renderer`` (case-insensitive) to the
    engine-specific loader.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering DPI, default 200.
        start_page_id: First page to render (0-based), default 0.
        end_page_id: Last page to render (0-based, inclusive); None renders
            through the last page.
        image_type: Kind of image to return, "PIL" or "BASE64".
        renderer: Rendering engine.
            - "pypdfium2" / "pdfium": MinerU's pypdfium2-based loader. The
              original author's notes describe it as the MinerU standard,
              with multi-process rendering and a 3500px size limit handled by
              adjusting the scale.
            - "fitz" / "pymupdf" / "mupdf": PyMuPDF. Pages larger than 4500px
              on either side are re-rendered at 72 DPI.
        timeout: Timeout in seconds; forwarded to the pypdfium2 loader only.
        threads: Worker count; forwarded to the pypdfium2 loader only.

    Returns:
        ``(images_list, pdf_doc)``
        - images_list: one dict per page, ``{'img_pil': ..., 'scale': ...}``
          or ``{'img_base64': ..., 'scale': ...}`` depending on
          ``image_type``.
        - pdf_doc: the document object produced by the chosen engine.

    Raises:
        ImportError: If the selected engine's dependencies are missing.
        ValueError: If ``renderer`` names an unsupported engine.

    Example:
        images, doc = load_images_from_pdf_unified(
            pdf_bytes, dpi=200, renderer="fitz"
        )
        for img_dict in images:
            pil_image = img_dict['img_pil']
            scale = img_dict['scale']

    Notes (from the original author):
        The two engines can produce slightly different images, so stick to
        one engine within a project, and use the same engine when comparing
        against existing reference images.
    """
    engine = renderer.lower()

    if engine in ("pypdfium2", "pdfium"):
        return _load_images_pypdfium2(
            pdf_bytes,
            dpi,
            start_page_id,
            end_page_id,
            image_type,
            timeout,
            threads,
        )

    if engine in ("fitz", "pymupdf", "mupdf"):
        return _load_images_fitz(
            pdf_bytes,
            dpi,
            start_page_id,
            end_page_id,
            image_type,
        )

    # Neither alias set matched: report the (lower-cased) engine name.
    raise ValueError(
        f"不支持的渲染引擎: {engine}. "
        f"请使用 'pypdfium2' 或 'fitz'"
    )


def _load_images_pypdfium2(
    pdf_bytes: bytes,
    dpi: int,
    start_page_id: int,
    end_page_id: Optional[int],
    image_type: str,
    timeout: Optional[int],
    threads: int
) -> Tuple[List[Dict[str, Any]], Any]:
    """
    Render PDF pages through MinerU's pypdfium2-based loader.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering DPI.
        start_page_id: First page to render (0-based).
        end_page_id: Last page to render (0-based, inclusive) or None.
        image_type: "PIL" (case-insensitive) or MinerU's ``ImageType.PIL``
            selects PIL output; any other value selects BASE64.
        timeout: Timeout forwarded to MinerU's loader.
        threads: Worker count forwarded to MinerU's loader.

    Returns:
        ``(images_list, pdf_doc)`` exactly as returned by MinerU's
        ``load_images_from_pdf``.

    Raises:
        ImportError: If pypdfium2 or MinerU cannot be imported.
    """
    try:
        import pypdfium2 as pdfium  # noqa: F401 - imported only to confirm availability
        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
        from mineru.utils.enum_class import ImageType
    except ImportError as e:
        # Chain the cause so the failing import stays visible in tracebacks.
        raise ImportError(
            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
            f"原始错误: {e}"
        ) from e

    # Accept MinerU's own ImageType.PIL constant as well as the string "PIL".
    # Comparing only against the literal "PIL" would map the constant to
    # BASE64 unless its value happens to spell "PIL".
    wants_pil = image_type == ImageType.PIL or str(image_type).upper() == "PIL"
    img_type = ImageType.PIL if wants_pil else ImageType.BASE64

    # Delegate the actual rendering to MinerU.
    images_list, pdf_doc = mineru_load_images(
        pdf_bytes=pdf_bytes,
        dpi=dpi,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        image_type=img_type,
        timeout=timeout,
        threads=threads
    )

    logger.info(
        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
        f"(DPI={dpi}, 多进程={threads})"
    )

    return images_list, pdf_doc


def _load_images_fitz(
    pdf_bytes: bytes,
    dpi: int,
    start_page_id: int,
    end_page_id: Optional[int],
    image_type: str
) -> Tuple[List[Dict[str, Any]], Any]:
    """
    Render PDF pages with PyMuPDF (fitz).

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        dpi: Rendering DPI; pages are rendered with a ``dpi / 72`` zoom.
        start_page_id: First page to render (0-based). Negative values are
            treated as 0.
        end_page_id: Last page to render (0-based, inclusive). None or a
            negative value means the last page; larger values are clamped to
            the last page.
        image_type: "BASE64" (case-insensitive) returns JPEG data encoded as
            base64 text; any other value returns PIL images.

    Returns:
        ``(images_list, doc)``
        - images_list: one dict per rendered page, either
          ``{'img_pil': PIL.Image, 'scale': float}`` or
          ``{'scale': float, 'img_base64': str}``. ``scale`` is the rendered
          width divided by the page width.
        - doc: the opened ``fitz`` document (left open for the caller).

    Raises:
        ImportError: If PyMuPDF cannot be imported.
    """
    try:
        import fitz
    except ImportError as e:
        # Chain the cause so the failing import stays visible in tracebacks.
        raise ImportError(
            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
            f"原始错误: {e}"
        ) from e

    from io import BytesIO
    import base64

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    last_page_id = doc.page_count - 1

    # Resolve the inclusive end of the range.
    if end_page_id is None or end_page_id < 0:
        end_page_id = last_page_id
    end_page_id = min(end_page_id, last_page_id)

    # A negative index passed to doc[...] would address pages from the end of
    # the document, yielding out-of-order or duplicated pages, so clamp the
    # start of the range to the first page.
    first_page_id = max(start_page_id, 0)

    want_base64 = str(image_type).upper() == "BASE64"
    render_matrix = fitz.Matrix(dpi / 72, dpi / 72)
    images_list = []

    for index in range(first_page_id, end_page_id + 1):
        page = doc[index]
        pm = page.get_pixmap(matrix=render_matrix, alpha=False)

        # Oversized render: redo the page at 72 DPI (identity matrix).
        if pm.width > 4500 or pm.height > 4500:
            logger.warning(
                f"⚠️  页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
                f"降低到 72 DPI"
            )
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)

        # Derive the scale from the pixmap actually produced, so it stays
        # correct when the 72 DPI fallback was used.
        actual_scale = pm.width / page.rect.width

        if want_base64:
            buffer = BytesIO()
            pil_img.save(buffer, format="JPEG")
            images_list.append({
                'scale': actual_scale,
                'img_base64': base64.b64encode(buffer.getvalue()).decode('utf-8')
            })
        else:
            images_list.append({
                'img_pil': pil_img,
                'scale': actual_scale
            })

    logger.info(
        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
        f"(DPI={dpi}, 单进程)"
    )

    return images_list, doc