""" PDF文本提取模块 提供从PDF文档中提取文本的功能,支持多种PDF引擎: - pypdfium2: MinerU标准引擎 - fitz (PyMuPDF): 轻量级替代引擎 主要功能: - 区域文本提取:从指定bbox区域提取文本 - 全页文本提取:提取页面所有文本块及其坐标 - 自动rotation处理:自动应用PDF页面旋转变换 - 返回图片rotation(逆时针定义):对外统一使用图片处理标准 """ from typing import Dict, List, Any, Tuple from loguru import logger # 导入坐标转换函数 from .pdf_coordinate_transform import ( transform_bbox_for_rotation_fitz, transform_bbox_for_rotation_pypdfium2, pdf_rotation_to_image_rotation ) # 导入 MinerU 组件 try: from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text MINERU_AVAILABLE = True except ImportError: pdf_get_page_text = None MINERU_AVAILABLE = False def detect_pdf_doc_type(pdf_doc: Any) -> str: """ 检测 PDF 文档对象类型 Args: pdf_doc: PDF 文档对象 Returns: 'pypdfium2' 或 'fitz' """ doc_type_name = type(pdf_doc).__name__ doc_module = type(pdf_doc).__module__ if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name: return 'pypdfium2' elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name: return 'fitz' else: # 尝试通过属性判断 if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'): return 'fitz' else: return 'pypdfium2' def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool: """ 检查两个 bbox 是否重叠 Args: bbox1: 第一个bbox [x1, y1, x2, y2] bbox2: 第二个bbox [x1, y1, x2, y2] Returns: True 如果重叠,否则 False """ if len(bbox1) < 4 or len(bbox2) < 4: return False x1_1, y1_1, x2_1, y2_1 = bbox1[:4] x1_2, y1_2, x2_2, y2_2 = bbox2[:4] # 检查是否不重叠(取反) if x2_1 < x1_2 or x2_2 < x1_1: return False if y2_1 < y1_2 or y2_2 < y1_1: return False return True # ============================================================================ # 区域文本提取 # ============================================================================ def extract_text_from_pdf( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """ 从PDF指定区域提取文本(支持 pypdfium2 和 fitz) Args: pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document) page_idx: 页码索引(0-based) bbox: 目标区域的bbox(图像坐标)[x1, y1, x2, y2] scale: 图像与PDF的缩放比例 Returns: (text, success) - text: 提取的文本 - success: 是否成功提取到文本 """ doc_type = detect_pdf_doc_type(pdf_doc) if doc_type == 'fitz': return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale) else: # pypdfium2 return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale) def extract_text_from_pdf_pypdfium2( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """ 使用 pypdfium2 从指定区域提取文本 Args: pdf_doc: pypdfium2.PdfDocument 对象 page_idx: 页码索引 bbox: 目标区域的bbox(图像坐标) scale: 缩放比例 Returns: (text, success) """ if not MINERU_AVAILABLE or pdf_get_page_text is None: logger.error("MinerU pdf_text_tool not available") return "", False try: page = pdf_doc[page_idx] # 将图像坐标转换为PDF坐标 pdf_bbox = [ bbox[0] / scale, bbox[1] / scale, bbox[2] / scale, bbox[3] / scale ] # 使用 MinerU 的方式获取页面文本信息 page_dict = pdf_get_page_text(page) # 从 blocks 中提取与 bbox 重叠的文本 text_parts = [] for block in page_dict.get('blocks', []): for line in block.get('lines', []): line_bbox = line.get('bbox') if line_bbox and hasattr(line_bbox, 'bbox'): line_bbox = line_bbox.bbox elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4: line_bbox = list(line_bbox) else: continue if bbox_overlap(pdf_bbox, line_bbox): for span in line.get('spans', []): span_text = span.get('text', '') if span_text: text_parts.append(span_text) text = ' '.join(text_parts) return text.strip(), bool(text.strip()) except Exception as e: import traceback logger.debug(f"pypdfium2 text extraction error: {e}") logger.debug(traceback.format_exc()) return "", False def extract_text_from_pdf_fitz( pdf_doc: Any, page_idx: int, bbox: List[float], scale: float ) -> Tuple[str, bool]: """ 使用 fitz 从指定区域提取文本 Args: pdf_doc: fitz.Document 对象 page_idx: 页码索引 bbox: 目标区域的bbox(图像坐标) scale: 缩放比例 Returns: (text, success) """ try: import fitz except ImportError: logger.error("PyMuPDF (fitz) not available") return "", False try: page = pdf_doc[page_idx] # 将图像坐标转换为PDF坐标 pdf_bbox = fitz.Rect( bbox[0] / scale, bbox[1] / scale, bbox[2] / scale, bbox[3] / scale ) # 提取区域内的文本 text = page.get_text("text", clip=pdf_bbox) return text.strip(), bool(text.strip()) except Exception as e: import traceback logger.debug(f"fitz text extraction error: {e}") logger.debug(traceback.format_exc()) return "", False # ============================================================================ # 全页文本提取 # ============================================================================ def extract_all_text_blocks( pdf_doc: Any, page_idx: int, scale: float ) -> Tuple[List[Dict[str, Any]], int]: """ 提取页面所有文本块(支持 pypdfium2 和 fitz)+ PDF rotation处理 Args: pdf_doc: PDF文档对象 page_idx: 页码索引(0-based) scale: 缩放比例 Returns: (text_blocks, rotation_angle) - text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...] bbox坐标已转换为渲染图像坐标系(与OCR坐标系一致) - rotation_angle: 图片旋转角度(0/90/180/270),逆时针定义 """ doc_type = detect_pdf_doc_type(pdf_doc) if doc_type == 'fitz': return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale) else: return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale) def extract_all_text_blocks_pypdfium2( pdf_doc: Any, page_idx: int, scale: float ) -> Tuple[List[Dict[str, Any]], int]: """ 使用 pypdfium2 提取所有文本块并处理rotation Args: pdf_doc: pypdfium2.PdfDocument 对象 page_idx: 页码索引 scale: 缩放比例 Returns: (text_blocks, rotation_angle) """ if not MINERU_AVAILABLE or pdf_get_page_text is None: return [], 0 try: page = pdf_doc[page_idx] page_dict = pdf_get_page_text(page) # 获取页面尺寸和rotation rotation = page_dict.get('rotation', 0) pdf_width = page_dict.get('width', 0) pdf_height = page_dict.get('height', 0) if rotation != 0: logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})") extracted_blocks = [] for block in page_dict.get('blocks', []): for line in block.get('lines', []): line_text = "" for span in line.get('spans', []): line_text += span.get('text', "") if not line_text.strip(): continue line_bbox = line.get('bbox') if line_bbox and hasattr(line_bbox, 'bbox'): line_bbox = line_bbox.bbox elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4: line_bbox = list(line_bbox) else: continue # 应用rotation坐标转换 img_bbox = transform_bbox_for_rotation_pypdfium2( line_bbox, rotation, pdf_width, pdf_height, scale ) extracted_blocks.append({ 'text': line_text, 'bbox': img_bbox, 'origin_bbox': line_bbox }) # 转换为图片rotation(逆时针定义) image_rotation = pdf_rotation_to_image_rotation(rotation) return extracted_blocks, image_rotation except Exception as e: logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}") import traceback logger.debug(traceback.format_exc()) return [], 0 def extract_all_text_blocks_fitz( pdf_doc: Any, page_idx: int, scale: float ) -> Tuple[List[Dict[str, Any]], int]: """ 使用 fitz 提取所有文本块并处理rotation Args: pdf_doc: fitz.Document 对象 page_idx: 页码索引 scale: 缩放比例 Returns: (text_blocks, rotation_angle) """ try: import fitz except ImportError: logger.warning("PyMuPDF (fitz) not available") return [], 0 try: page = pdf_doc[page_idx] # 获取页面rotation rotation = page.rotation # 0, 90, 180, 270 # 获取页面尺寸(原始方向,未旋转) # page.rect 是旋转后的尺寸,我们需要原始尺寸 if rotation in [90, 270]: # 宽高互换回来 pdf_width = page.rect.height pdf_height = page.rect.width else: pdf_width = page.rect.width pdf_height = page.rect.height if rotation != 0: logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})") # 使用 get_text("dict") 获取详细的文本信息 text_dict = page.get_text("dict") extracted_blocks = [] # 遍历所有 blocks for block in text_dict.get("blocks", []): # 只处理文本块(type=0) if block.get("type") != 0: continue # 遍历所有 lines for line in block.get("lines", []): line_text = "" line_bbox = line.get("bbox") # 提取 line 中的所有 span 文本 for span in line.get("spans", []): line_text += span.get("text", "") if not line_text.strip() or not line_bbox: continue # 应用rotation坐标转换 img_bbox = transform_bbox_for_rotation_fitz( list(line_bbox), rotation, pdf_width, pdf_height, scale ) extracted_blocks.append({ 'text': line_text, 'bbox': img_bbox, 'origin_bbox': list(line_bbox) }) # 转换为图片rotation(逆时针定义) image_rotation = pdf_rotation_to_image_rotation(rotation) return extracted_blocks, image_rotation except Exception as e: logger.warning(f"fitz extract_all_text_blocks failed: {e}") import traceback logger.debug(traceback.format_exc()) return [], 0 def detect_page_type( pdf_doc: Any, page_idx: int, char_threshold: int = 50 ) -> str: """ 检测PDF指定页是文字页还是图片页 基于字符密度的简单可靠方法 """ try: text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0) total_chars = sum(len(block.get('text', '')) for block in text_blocks) return 'txt' if total_chars >= char_threshold else 'ocr' except: return 'ocr'