zhengchun
/
ocr_platform


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
							"""
PDF文本提取模块

提供从PDF文档中提取文本的功能，支持多种PDF引擎：
- pypdfium2: MinerU标准引擎
- fitz (PyMuPDF): 轻量级替代引擎

主要功能：
- 区域文本提取：从指定bbox区域提取文本
- 全页文本提取：提取页面所有文本块及其坐标
- 自动rotation处理：自动应用PDF页面旋转变换
- 返回图片rotation（逆时针定义）：对外统一使用图片处理标准
"""
from typing import Dict, List, Any, Tuple
from loguru import logger

# 导入坐标转换函数
from .pdf_coordinate_transform import (
    transform_bbox_for_rotation_fitz,
    transform_bbox_for_rotation_pypdfium2,
    pdf_rotation_to_image_rotation
)

# 导入 MinerU 组件
try:
    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
    MINERU_AVAILABLE = True
except ImportError:
    pdf_get_page_text = None
    MINERU_AVAILABLE = False


def detect_pdf_doc_type(pdf_doc: Any) -> str:
    """
    Guess which PDF engine a document object comes from.

    The decision is made from the object's class (module path and class
    name) first, and from engine-specific attributes only as a last resort.

    Args:
        pdf_doc: PDF document object (expected to be a pypdfium2 or
            fitz/PyMuPDF document, or a wrapper around one).

    Returns:
        'pypdfium2' or 'fitz'
    """
    doc_cls = type(pdf_doc)
    type_name = doc_cls.__name__
    module_name = doc_cls.__module__.lower()

    # pypdfium2 must be tested first: "PdfDocument" also contains "Document".
    if 'pdfium' in module_name or 'PdfDocument' in type_name:
        return 'pypdfium2'

    # Recent PyMuPDF releases expose their classes from the "pymupdf" module
    # rather than "fitz"; accept both spellings.
    if 'fitz' in module_name or 'pymupdf' in module_name or 'Document' in type_name:
        return 'fitz'

    # Fallback for wrappers/proxies whose class gives no hint: probe
    # attributes that only the fitz API offers. `get_page` is deliberately
    # NOT used as a fitz marker because it is a pypdfium2 PdfDocument method.
    if hasattr(pdf_doc, 'page_count') or hasattr(pdf_doc, 'load_page'):
        return 'fitz'
    return 'pypdfium2'


def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
    """
    Tell whether two axis-aligned boxes intersect.

    Boxes that merely touch along an edge or corner count as overlapping,
    because only a strict gap between them is treated as separation.

    Args:
        bbox1: first box as [x1, y1, x2, y2] (extra items are ignored)
        bbox2: second box as [x1, y1, x2, y2] (extra items are ignored)

    Returns:
        True when the boxes intersect; False when they are separated or
        when either box has fewer than four coordinates.
    """
    if len(bbox1) < 4 or len(bbox2) < 4:
        return False

    left_a, top_a, right_a, bottom_a = bbox1[:4]
    left_b, top_b, right_b, bottom_b = bbox2[:4]

    # The boxes overlap exactly when there is no gap on either axis.
    return not (
        right_a < left_b
        or right_b < left_a
        or bottom_a < top_b
        or bottom_b < top_a
    )


# ============================================================================
# 区域文本提取
# ============================================================================

def extract_text_from_pdf(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region of a PDF page (pypdfium2 or fitz).

    The engine is chosen with detect_pdf_doc_type() and the call is
    forwarded unchanged to the matching engine-specific extractor.

    Args:
        pdf_doc: PDF document object (pypdfium2.PdfDocument or fitz.Document)
        page_idx: page index (0-based)
        bbox: target region in image coordinates [x1, y1, x2, y2]
        scale: scale factor between the image and the PDF

    Returns:
        (text, success)
        - text: the extracted text
        - success: whether any text was extracted
    """
    if detect_pdf_doc_type(pdf_doc) == 'fitz':
        extractor = extract_text_from_pdf_fitz
    else:
        # Anything that is not recognised as fitz is handled by pypdfium2.
        extractor = extract_text_from_pdf_pypdfium2

    return extractor(pdf_doc, page_idx, bbox, scale)


def extract_text_from_pdf_pypdfium2(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region using pypdfium2 (via MinerU).

    The page is parsed with MinerU's page-text helper; the spans of every
    line whose bbox overlaps the requested region are joined with single
    spaces. No page-rotation handling is applied here.

    Args:
        pdf_doc: pypdfium2.PdfDocument object
        page_idx: page index
        bbox: target region in image coordinates
        scale: scale factor; image coordinates are divided by it

    Returns:
        (text, success); ("", False) when MinerU is unavailable or any
        error occurs during extraction.
    """
    if not MINERU_AVAILABLE or pdf_get_page_text is None:
        logger.error("MinerU pdf_text_tool not available")
        return "", False

    try:
        page = pdf_doc[page_idx]

        # Image coordinates -> PDF coordinates.
        region = [bbox[i] / scale for i in range(4)]

        # Page text information in MinerU's dict layout.
        page_info = pdf_get_page_text(page)

        collected = []
        for block in page_info.get('blocks', []):
            for line in block.get('lines', []):
                raw_box = line.get('bbox')
                if raw_box and hasattr(raw_box, 'bbox'):
                    # Wrapper object carrying the coordinates in `.bbox`.
                    line_box = raw_box.bbox
                elif isinstance(raw_box, (list, tuple)) and len(raw_box) >= 4:
                    line_box = list(raw_box)
                else:
                    # No usable bbox for this line.
                    continue

                if not bbox_overlap(region, line_box):
                    continue

                # Keep only the non-empty span texts of the matching line.
                span_texts = (span.get('text', '') for span in line.get('spans', []))
                collected.extend(piece for piece in span_texts if piece)

        joined = ' '.join(collected).strip()
        return joined, bool(joined)

    except Exception as e:
        import traceback
        logger.debug(f"pypdfium2 text extraction error: {e}")
        logger.debug(traceback.format_exc())
        return "", False


def extract_text_from_pdf_fitz(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region using fitz (PyMuPDF).

    Args:
        pdf_doc: fitz.Document object
        page_idx: page index
        bbox: target region in image coordinates
        scale: scale factor; image coordinates are divided by it

    Returns:
        (text, success); ("", False) when fitz cannot be imported or any
        error occurs during extraction.
    """
    try:
        import fitz
    except ImportError:
        logger.error("PyMuPDF (fitz) not available")
        return "", False

    try:
        page = pdf_doc[page_idx]

        # Image coordinates -> PDF coordinates, used as the clip rectangle.
        clip_rect = fitz.Rect(*(bbox[i] / scale for i in range(4)))

        # Plain-text extraction restricted to the clip rectangle.
        extracted = page.get_text("text", clip=clip_rect).strip()
        return extracted, bool(extracted)

    except Exception as e:
        import traceback
        logger.debug(f"fitz text extraction error: {e}")
        logger.debug(traceback.format_exc())
        return "", False


# ============================================================================
# 全页文本提取
# ============================================================================

def extract_all_text_blocks(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract every text block of a page (pypdfium2 or fitz), rotation-aware.

    The engine is chosen with detect_pdf_doc_type() and the call is
    forwarded unchanged to the matching engine-specific implementation.

    Args:
        pdf_doc: PDF document object
        page_idx: page index (0-based)
        scale: scale factor

    Returns:
        (text_blocks, rotation_angle)
        - text_blocks: list of
          {'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]};
          'bbox' is expressed in the rendered-image coordinate system
          (the same one OCR results use)
        - rotation_angle: image rotation (0/90/180/270), counter-clockwise
    """
    if detect_pdf_doc_type(pdf_doc) == 'fitz':
        extractor = extract_all_text_blocks_fitz
    else:
        extractor = extract_all_text_blocks_pypdfium2

    return extractor(pdf_doc, page_idx, scale)


def extract_all_text_blocks_pypdfium2(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract all text lines of a page with pypdfium2 (via MinerU).

    Each non-blank line becomes one entry; its bbox is passed through
    transform_bbox_for_rotation_pypdfium2() together with the page
    rotation, page size and scale.

    Args:
        pdf_doc: pypdfium2.PdfDocument object
        page_idx: page index
        scale: scale factor

    Returns:
        (text_blocks, rotation_angle); ([], 0) when MinerU is unavailable
        or any error occurs.
    """
    if not MINERU_AVAILABLE or pdf_get_page_text is None:
        return [], 0

    try:
        page = pdf_doc[page_idx]
        page_info = pdf_get_page_text(page)

        # Page geometry as reported by MinerU's page dict.
        rotation = page_info.get('rotation', 0)
        pdf_width = page_info.get('width', 0)
        pdf_height = page_info.get('height', 0)

        if rotation != 0:
            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")

        results = []

        for block in page_info.get('blocks', []):
            for line in block.get('lines', []):
                line_text = "".join(span.get('text', "") for span in line.get('spans', []))
                if not line_text.strip():
                    # Whitespace-only lines carry no information.
                    continue

                raw_box = line.get('bbox')
                if raw_box and hasattr(raw_box, 'bbox'):
                    # Wrapper object carrying the coordinates in `.bbox`.
                    line_box = raw_box.bbox
                elif isinstance(raw_box, (list, tuple)) and len(raw_box) >= 4:
                    line_box = list(raw_box)
                else:
                    continue

                # Map the PDF-space bbox into image space, honouring rotation.
                image_box = transform_bbox_for_rotation_pypdfium2(
                    line_box, rotation, pdf_width, pdf_height, scale
                )

                results.append({
                    'text': line_text,
                    'bbox': image_box,
                    'origin_bbox': line_box
                })

        # Report the rotation using the image (counter-clockwise) convention.
        return results, pdf_rotation_to_image_rotation(rotation)

    except Exception as e:
        logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return [], 0


def extract_all_text_blocks_fitz(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract all text lines of a page with fitz (PyMuPDF).

    Each non-blank line of every text block becomes one entry; its bbox is
    passed through transform_bbox_for_rotation_fitz() together with the
    page rotation, the un-rotated page size and the scale.

    Args:
        pdf_doc: fitz.Document object
        page_idx: page index
        scale: scale factor

    Returns:
        (text_blocks, rotation_angle); ([], 0) when fitz cannot be
        imported or any error occurs.
    """
    try:
        import fitz
    except ImportError:
        logger.warning("PyMuPDF (fitz) not available")
        return [], 0

    try:
        page = pdf_doc[page_idx]

        rotation = page.rotation  # 0, 90, 180, 270

        # page.rect reflects the rotated page, so for quarter turns the
        # width and height are swapped back to get the original size.
        page_rect = page.rect
        if rotation in [90, 270]:
            pdf_width, pdf_height = page_rect.height, page_rect.width
        else:
            pdf_width, pdf_height = page_rect.width, page_rect.height

        if rotation != 0:
            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")

        # The "dict" output carries blocks -> lines -> spans with bboxes.
        page_content = page.get_text("dict")

        results = []

        for block in page_content.get("blocks", []):
            if block.get("type") != 0:
                # Only text blocks (type 0) are of interest.
                continue

            for line in block.get("lines", []):
                line_bbox = line.get("bbox")
                line_text = "".join(span.get("text", "") for span in line.get("spans", []))

                if not line_text.strip() or not line_bbox:
                    continue

                # Map the bbox into image space, honouring rotation. A fresh
                # list is handed over so the stored original stays independent.
                image_box = transform_bbox_for_rotation_fitz(
                    list(line_bbox), rotation, pdf_width, pdf_height, scale
                )

                results.append({
                    'text': line_text,
                    'bbox': image_box,
                    'origin_bbox': list(line_bbox)
                })

        # Report the rotation using the image (counter-clockwise) convention.
        return results, pdf_rotation_to_image_rotation(rotation)

    except Exception as e:
        logger.warning(f"fitz extract_all_text_blocks failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return [], 0


def detect_page_type(
    pdf_doc: Any, 
    page_idx: int,
    char_threshold: int = 50
) -> str:
    """
    Classify a PDF page as a text page or an image (scanned) page.

    The decision is a simple character count: all text blocks of the page
    are extracted (at scale 1.0) and the lengths of their texts are summed.

    Args:
        pdf_doc: PDF document object
        page_idx: page index (0-based)
        char_threshold: minimum number of extracted characters for the
            page to be considered a text page

    Returns:
        'txt' when at least `char_threshold` characters were extracted,
        otherwise 'ocr'. Any extraction error also yields 'ocr'.
    """
    try:
        text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
        total_chars = sum(len(block.get('text', '')) for block in text_blocks)

        return 'txt' if total_chars >= char_threshold else 'ocr'
    except Exception as e:
        # Best-effort: fall back to OCR on any ordinary failure, but let
        # KeyboardInterrupt / SystemExit propagate (a bare `except:` would
        # swallow them) and leave a trace of what went wrong.
        logger.debug(f"detect_page_type failed for page {page_idx}, falling back to 'ocr': {e}")
        return 'ocr'