# Source repository: zhengchun/ocr_platform
							"""
PDF处理工具模块

提供PDF相关处理功能：
- PDF加载与分类
- PDF文本提取
- 跨页表格合并
- 页面范围解析与过滤
"""
from typing import Dict, List, Any, Optional, Tuple, Set
from pathlib import Path
from PIL import Image
from loguru import logger
import re

# Import the MinerU components used below for PDF classification, page
# rendering and text extraction.
try:
    from mineru.utils.pdf_classify import classify as pdf_classify
    from mineru.utils.pdf_image_tools import load_images_from_pdf
    from mineru.utils.enum_class import ImageType
    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
    MINERU_AVAILABLE = True
except ImportError:
    # NOTE(review): the error is re-raised, so importing this module fails
    # outright when MinerU is missing and MINERU_AVAILABLE can never be
    # observed as False; the `not MINERU_AVAILABLE` guards further down are
    # therefore unreachable as written. Confirm whether a graceful fallback
    # (flag set to False, image-only processing still usable) was intended.
    raise ImportError("MinerU components not available for PDF processing")

class PDFUtils:
    """PDF处理工具类"""
    
    @staticmethod
    def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
        """
        解析页面范围字符串
        
        支持格式：
        - "1-5" → {0, 1, 2, 3, 4}（页码从1开始，内部转为0-based索引）
        - "3" → {2}
        - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-" → 从第1页到最后
        - "-5" → 从第1页到第5页
        
        Args:
            page_range: 页面范围字符串（页码从1开始）
            total_pages: 总页数
            
        Returns:
            页面索引集合（0-based）
        """
        if not page_range or not page_range.strip():
            return set(range(total_pages))
        
        pages = set()
        parts = page_range.replace(' ', '').split(',')
        
        for part in parts:
            part = part.strip()
            if not part:
                continue
            
            if '-' in part:
                # 范围格式
                match = re.match(r'^(\d*)-(\d*)$', part)
                if match:
                    start_str, end_str = match.groups()
                    start = int(start_str) if start_str else 1
                    end = int(end_str) if end_str else total_pages
                    
                    # 转换为 0-based 索引
                    start = max(0, start - 1)
                    end = min(total_pages, end)
                    
                    pages.update(range(start, end))
            else:
                # 单页
                try:
                    page_num = int(part)
                    if 1 <= page_num <= total_pages:
                        pages.add(page_num - 1)  # 转换为 0-based 索引
                except ValueError:
                    logger.warning(f"Invalid page number: {part}")
        
        return pages
    
    @staticmethod
    def load_and_classify_document(
        document_path: Path,
        dpi: int = 200,
        page_range: Optional[str] = None
    ) -> Tuple[List[Dict], str, Optional[Any]]:
        """
        加载文档并分类，支持页面范围过滤
        
        Args:
            document_path: 文档路径
            dpi: PDF渲染DPI
            page_range: 页面范围字符串，如 "1-5,7,9-12"
                       - PDF：按页码（从1开始）
                       - 图片目录：按文件名排序后的位置（从1开始）
            
        Returns:
            (images_list, pdf_type, pdf_doc)
            - images_list: 图像列表，每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
            - pdf_type: 'ocr' 或 'txt'
            - pdf_doc: PDF文档对象（如果是PDF）
        """
        pdf_doc = None
        pdf_type = 'ocr'  # 默认使用OCR模式
        all_images = []
        
        if document_path.is_dir():
            # 处理目录：遍历所有图片
            image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
            image_files = sorted([
                f for f in document_path.iterdir() 
                if f.suffix.lower() in image_extensions
            ])
            
            # 解析页面范围
            total_pages = len(image_files)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            
            if page_range:
                logger.info(f"📋 图片目录共 {total_pages} 张，选择处理 {len(selected_pages)} 张")
            
            for idx, img_file in enumerate(image_files):
                if idx not in selected_pages:
                    continue
                
                img = Image.open(img_file)
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                all_images.append({
                    'img_pil': img,
                    'scale': 1.0,
                    'source_path': str(img_file),
                    'page_idx': idx,  # 原始索引
                    'page_name': img_file.stem  # 文件名（不含扩展名）
                })
            
            pdf_type = 'ocr'  # 图片目录始终使用OCR模式
            
        elif document_path.suffix.lower() == '.pdf':
            # 处理PDF文件
            if not MINERU_AVAILABLE:
                raise RuntimeError("MinerU components not available for PDF processing")
            
            with open(document_path, 'rb') as f:
                pdf_bytes = f.read()
            
            # PDF分类
            pdf_type = pdf_classify(pdf_bytes)
            logger.info(f"📋 PDF classified as: {pdf_type}")
            
            # 加载图像
            images_list, pdf_doc = load_images_from_pdf(
                pdf_bytes, 
                dpi=dpi,
                image_type=ImageType.PIL
            )
            
            # 解析页面范围
            total_pages = len(images_list)
            selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
            
            if page_range:
                logger.info(f"📋 PDF 共 {total_pages} 页，选择处理 {len(selected_pages)} 页")
            
            for idx, img_dict in enumerate(images_list):
                if idx not in selected_pages:
                    continue
                
                all_images.append({
                    'img_pil': img_dict['img_pil'],
                    'scale': img_dict.get('scale', dpi / 72),
                    'source_path': str(document_path),
                    'page_idx': idx  # 原始页码索引
                })
                
        elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
            # 处理单个图片
            img = Image.open(document_path)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            all_images.append({
                'img_pil': img,
                'scale': 1.0,
                'source_path': str(document_path),
                'page_idx': 0,
                'page_name': document_path.stem
            })
            pdf_type = 'ocr'
            
        else:
            raise ValueError(f"Unsupported file format: {document_path.suffix}")
        
        return all_images, pdf_type, pdf_doc
    
    @staticmethod
    def extract_text_from_pdf(
        pdf_doc: Any,
        page_idx: int,
        bbox: List[float],
        scale: float
    ) -> Tuple[str, bool]:
        """
        Read the text lying inside a region straight from the PDF text layer.

        The page is parsed with MinerU's page-text helper; the spans of every
        line whose box overlaps the requested region are joined with single
        spaces. Any failure is logged at debug level and reported as
        "no text" instead of being raised.

        Args:
            pdf_doc: Document object indexable by page (a pypdfium2
                PdfDocument according to the original documentation).
            page_idx: Index of the page to read.
            bbox: Target region in image coordinates.
            scale: Image-to-PDF scale factor; coordinates are divided by it.

        Returns:
            (text, success): the stripped text and whether it is non-empty.
        """
        if not MINERU_AVAILABLE or pdf_get_page_text is None:
            logger.debug("MinerU pdf_text_tool not available")
            return "", False

        try:
            page = pdf_doc[page_idx]

            # Bring the region from image coordinates back to PDF coordinates.
            pdf_bbox = [bbox[i] / scale for i in range(4)]

            # Text structure of the page as produced by MinerU.
            page_dict = pdf_get_page_text(page)

            collected = []
            for block in page_dict.get('blocks', []):
                for line in block.get('lines', []):
                    raw_bbox = line.get('bbox')
                    if raw_bbox and hasattr(raw_bbox, 'bbox'):
                        # Box object (pdftext BBox) wrapping the coordinates.
                        line_bbox = raw_bbox.bbox
                    elif isinstance(raw_bbox, (list, tuple)) and len(raw_bbox) >= 4:
                        line_bbox = list(raw_bbox)
                    else:
                        # No usable box on this line: nothing to compare.
                        continue

                    # Only lines touching the target region contribute text.
                    if not PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
                        continue

                    for span in line.get('spans', []):
                        span_text = span.get('text', '')
                        if span_text:
                            collected.append(span_text)

            text = ' '.join(collected).strip()
            return text, bool(text)

        except Exception as e:
            # Best effort: report the failure at debug level and signal that
            # no text could be obtained.
            import traceback
            logger.debug(f"PDF text extraction error: {e}")
            logger.debug(traceback.format_exc())
            return "", False
    
    @staticmethod
    def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
        """检查两个 bbox 是否重叠"""
        if len(bbox1) < 4 or len(bbox2) < 4:
            return False
        
        x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
        x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
        
        if x2_1 < x1_2 or x2_2 < x1_1:
            return False
        if y2_1 < y1_2 or y2_2 < y1_1:
            return False
        
        return True
    
    @staticmethod
    def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
        """
        合并跨页表格
        
        TODO: 实现跨页表格合并逻辑
        可以参考 MinerU 的 cross_page_table_merge 实现
        
        Args:
            results: 处理结果字典
            
        Returns:
            合并后的结果
        """
        # TODO: 实现跨页表格合并逻辑
        return results