| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432 |
- """
- PDF文本提取模块
- 提供从PDF文档中提取文本的功能,支持多种PDF引擎:
- - pypdfium2: MinerU标准引擎
- - fitz (PyMuPDF): 轻量级替代引擎
- 主要功能:
- - 区域文本提取:从指定bbox区域提取文本
- - 全页文本提取:提取页面所有文本块及其坐标
- - 自动rotation处理:自动应用PDF页面旋转变换
- - 返回图片rotation(逆时针定义):对外统一使用图片处理标准
- """
- from typing import Dict, List, Any, Tuple
- from loguru import logger
- # 导入坐标转换函数
- from .pdf_coordinate_transform import (
- transform_bbox_for_rotation_fitz,
- transform_bbox_for_rotation_pypdfium2,
- pdf_rotation_to_image_rotation
- )
- # 导入 MinerU 组件
- try:
- from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
- MINERU_AVAILABLE = True
- except ImportError:
- pdf_get_page_text = None
- MINERU_AVAILABLE = False
def detect_pdf_doc_type(pdf_doc: Any) -> str:
    """
    Detect which PDF engine a document object belongs to.

    The module path and class name of the object's type are inspected first.
    Only when neither is conclusive does the function fall back to duck
    typing on the object's attributes.

    Args:
        pdf_doc: PDF document object.

    Returns:
        'pypdfium2' or 'fitz'.
    """
    doc_cls = type(pdf_doc)
    type_name = doc_cls.__name__
    module_name = doc_cls.__module__.lower()

    # 'PdfDocument' must be tested before the generic 'Document' substring,
    # otherwise every pypdfium2 document would be classified as fitz.
    if 'pdfium' in module_name or 'PdfDocument' in type_name:
        return 'pypdfium2'
    if 'fitz' in module_name or 'Document' in type_name:
        return 'fitz'

    # Duck-typing fallback for wrappers/proxies with unrelated names.
    # `page_count` is a fitz.Document attribute. `get_page` is deliberately
    # NOT used as a fitz marker: it is a pypdfium2.PdfDocument method, so
    # treating it as evidence for fitz would misroute wrapped pypdfium2
    # documents to the fitz code path.
    if hasattr(pdf_doc, 'page_count'):
        return 'fitz'
    return 'pypdfium2'
def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
    """
    Tell whether two axis-aligned bounding boxes overlap.

    Boxes that merely touch along an edge or at a corner count as
    overlapping, because separation is tested with strict inequalities.

    Args:
        bbox1: First box as [x1, y1, x2, y2]; extra elements are ignored.
        bbox2: Second box as [x1, y1, x2, y2]; extra elements are ignored.

    Returns:
        True if the boxes overlap, False otherwise (including when either
        box has fewer than four elements).
    """
    if not (len(bbox1) >= 4 and len(bbox2) >= 4):
        return False

    a_left, a_top, a_right, a_bottom = bbox1[:4]
    b_left, b_top, b_right, b_bottom = bbox2[:4]

    # Separated along the x axis: one box lies entirely left of the other.
    apart_horizontally = a_right < b_left or b_right < a_left
    if apart_horizontally:
        return False

    # Separated along the y axis: one box lies entirely above the other.
    apart_vertically = a_bottom < b_top or b_bottom < a_top
    return not apart_vertically
- # ============================================================================
- # 区域文本提取
- # ============================================================================
def extract_text_from_pdf(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region of a PDF page (pypdfium2 or fitz).

    The engine is chosen by inspecting ``pdf_doc`` and the call is forwarded
    unchanged to the matching engine-specific extractor.

    Args:
        pdf_doc: PDF document object (pypdfium2.PdfDocument or fitz.Document).
        page_idx: Zero-based page index.
        bbox: Target region in image coordinates, [x1, y1, x2, y2].
        scale: Scale factor between the image and the PDF.

    Returns:
        (text, success)
        - text: The extracted text.
        - success: Whether any text was extracted.
    """
    # Anything not recognised as fitz is handled by the pypdfium2 extractor.
    extractor = (
        extract_text_from_pdf_fitz
        if detect_pdf_doc_type(pdf_doc) == 'fitz'
        else extract_text_from_pdf_pypdfium2
    )
    return extractor(pdf_doc, page_idx, bbox, scale)
def extract_text_from_pdf_pypdfium2(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region of a page using pypdfium2.

    The page is parsed through MinerU's ``pdf_text_tool``. Every line whose
    bbox overlaps the target region contributes all of its non-empty span
    texts, and the pieces are joined with single spaces.

    Args:
        pdf_doc: pypdfium2.PdfDocument object.
        page_idx: Page index.
        bbox: Target region in image coordinates.
        scale: Scale factor; image coordinates are divided by it to obtain
            PDF coordinates.

    Returns:
        (text, success). ``("", False)`` is returned when MinerU is
        unavailable or when any exception occurs during extraction.
    """
    if not MINERU_AVAILABLE or pdf_get_page_text is None:
        logger.error("MinerU pdf_text_tool not available")
        return "", False

    try:
        page = pdf_doc[page_idx]

        # Image coordinates -> PDF coordinates.
        region = [bbox[i] / scale for i in range(4)]

        # Page text structure as produced by MinerU.
        page_info = pdf_get_page_text(page)

        collected = []
        for blk in page_info.get('blocks', []):
            for ln in blk.get('lines', []):
                raw_bbox = ln.get('bbox')
                # A line bbox is either an object exposing `.bbox` or a
                # plain sequence of at least four numbers; skip anything else.
                if raw_bbox and hasattr(raw_bbox, 'bbox'):
                    ln_bbox = raw_bbox.bbox
                elif isinstance(raw_bbox, (list, tuple)) and len(raw_bbox) >= 4:
                    ln_bbox = list(raw_bbox)
                else:
                    continue

                if not bbox_overlap(region, ln_bbox):
                    continue

                span_texts = (sp.get('text', '') for sp in ln.get('spans', []))
                collected.extend(piece for piece in span_texts if piece)

        cleaned = ' '.join(collected).strip()
        return cleaned, bool(cleaned)

    except Exception as e:
        import traceback
        logger.debug(f"pypdfium2 text extraction error: {e}")
        logger.debug(traceback.format_exc())
        return "", False
def extract_text_from_pdf_fitz(
    pdf_doc: Any,
    page_idx: int,
    bbox: List[float],
    scale: float
) -> Tuple[str, bool]:
    """
    Extract the text inside a region of a page using fitz (PyMuPDF).

    Args:
        pdf_doc: fitz.Document object.
        page_idx: Page index.
        bbox: Target region in image coordinates.
        scale: Scale factor; image coordinates are divided by it to obtain
            PDF coordinates.

    Returns:
        (text, success). ``("", False)`` is returned when PyMuPDF cannot be
        imported or when any exception occurs during extraction.
    """
    try:
        import fitz
    except ImportError:
        logger.error("PyMuPDF (fitz) not available")
        return "", False

    try:
        target_page = pdf_doc[page_idx]

        # Image coordinates -> PDF coordinates, used as the clip rectangle.
        clip_rect = fitz.Rect(*(bbox[i] / scale for i in range(4)))

        # Only text inside the clip rectangle is returned.
        stripped = target_page.get_text("text", clip=clip_rect).strip()
        return stripped, bool(stripped)

    except Exception as e:
        import traceback
        logger.debug(f"fitz text extraction error: {e}")
        logger.debug(traceback.format_exc())
        return "", False
- # ============================================================================
- # 全页文本提取
- # ============================================================================
def extract_all_text_blocks(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract every text block of a page (pypdfium2 or fitz) with PDF
    rotation handling.

    Args:
        pdf_doc: PDF document object.
        page_idx: Zero-based page index.
        scale: Scale factor.

    Returns:
        (text_blocks, rotation_angle)
        - text_blocks: List of
          {'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}.
          'bbox' is already converted to the rendered-image coordinate
          system (the same one OCR results use).
        - rotation_angle: Image rotation (0/90/180/270), counter-clockwise.
    """
    # Anything not recognised as fitz is handled by the pypdfium2 extractor.
    block_extractor = (
        extract_all_text_blocks_fitz
        if detect_pdf_doc_type(pdf_doc) == 'fitz'
        else extract_all_text_blocks_pypdfium2
    )
    return block_extractor(pdf_doc, page_idx, scale)
def extract_all_text_blocks_pypdfium2(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract all text lines of a page with pypdfium2 and apply rotation.

    Each non-blank line becomes one entry whose 'bbox' has been passed
    through ``transform_bbox_for_rotation_pypdfium2`` and whose
    'origin_bbox' keeps the bbox as reported by MinerU.

    Args:
        pdf_doc: pypdfium2.PdfDocument object.
        page_idx: Page index.
        scale: Scale factor.

    Returns:
        (text_blocks, rotation_angle). ``([], 0)`` is returned when MinerU
        is unavailable or when any exception occurs.
    """
    if not MINERU_AVAILABLE or pdf_get_page_text is None:
        return [], 0

    try:
        page_info = pdf_get_page_text(pdf_doc[page_idx])

        # Page rotation and dimensions as reported by MinerU.
        rotation = page_info.get('rotation', 0)
        pdf_width = page_info.get('width', 0)
        pdf_height = page_info.get('height', 0)

        if rotation != 0:
            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")

        results = []
        for blk in page_info.get('blocks', []):
            for ln in blk.get('lines', []):
                content = "".join(sp.get('text', "") for sp in ln.get('spans', []))
                if not content.strip():
                    continue

                raw_bbox = ln.get('bbox')
                # A line bbox is either an object exposing `.bbox` or a
                # plain sequence of at least four numbers; skip anything else.
                if raw_bbox and hasattr(raw_bbox, 'bbox'):
                    source_bbox = raw_bbox.bbox
                elif isinstance(raw_bbox, (list, tuple)) and len(raw_bbox) >= 4:
                    source_bbox = list(raw_bbox)
                else:
                    continue

                # Map the bbox through the rotation-aware coordinate transform.
                results.append({
                    'text': content,
                    'bbox': transform_bbox_for_rotation_pypdfium2(
                        source_bbox, rotation, pdf_width, pdf_height, scale
                    ),
                    'origin_bbox': source_bbox
                })

        # Report the rotation using the image (counter-clockwise) convention.
        return results, pdf_rotation_to_image_rotation(rotation)

    except Exception as e:
        logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return [], 0
def extract_all_text_blocks_fitz(
    pdf_doc: Any,
    page_idx: int,
    scale: float
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Extract all text lines of a page with fitz (PyMuPDF) and apply rotation.

    Each non-blank line of a text block becomes one entry whose 'bbox' has
    been passed through ``transform_bbox_for_rotation_fitz`` and whose
    'origin_bbox' keeps the bbox as reported by fitz.

    Args:
        pdf_doc: fitz.Document object.
        page_idx: Page index.
        scale: Scale factor.

    Returns:
        (text_blocks, rotation_angle). ``([], 0)`` is returned when PyMuPDF
        cannot be imported or when any exception occurs.
    """
    try:
        import fitz
    except ImportError:
        logger.warning("PyMuPDF (fitz) not available")
        return [], 0

    try:
        page = pdf_doc[page_idx]
        rotation = page.rotation  # 0, 90, 180, 270

        # page.rect has the rotated dimensions; the transform needs the
        # original (unrotated) ones, so swap width/height back for 90/270.
        page_rect = page.rect
        if rotation in (90, 270):
            pdf_width, pdf_height = page_rect.height, page_rect.width
        else:
            pdf_width, pdf_height = page_rect.width, page_rect.height

        if rotation != 0:
            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")

        # The "dict" output carries blocks -> lines -> spans with bboxes.
        page_content = page.get_text("dict")

        results = []
        for blk in page_content.get("blocks", []):
            # Only text blocks (type == 0) are processed.
            if blk.get("type") != 0:
                continue

            for ln in blk.get("lines", []):
                raw_bbox = ln.get("bbox")
                content = "".join(sp.get("text", "") for sp in ln.get("spans", []))

                if not (content.strip() and raw_bbox):
                    continue

                # Map the bbox through the rotation-aware coordinate transform.
                converted = transform_bbox_for_rotation_fitz(
                    list(raw_bbox), rotation, pdf_width, pdf_height, scale
                )
                results.append({
                    'text': content,
                    'bbox': converted,
                    'origin_bbox': list(raw_bbox)
                })

        # Report the rotation using the image (counter-clockwise) convention.
        return results, pdf_rotation_to_image_rotation(rotation)

    except Exception as e:
        logger.warning(f"fitz extract_all_text_blocks failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return [], 0
def detect_page_type(
    pdf_doc: Any,
    page_idx: int,
    char_threshold: int = 50
) -> str:
    """
    Classify a PDF page as a text page or an image page.

    The decision is based on character count: the lengths of all extracted
    text blocks are summed and compared against ``char_threshold``.

    Args:
        pdf_doc: PDF document object.
        page_idx: Zero-based page index.
        char_threshold: Minimum number of extracted characters for the page
            to be considered a text page.

    Returns:
        'txt' if at least ``char_threshold`` characters were extracted,
        otherwise 'ocr'. Any failure during extraction also yields 'ocr'.
    """
    try:
        text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
        total_chars = sum(len(block.get('text', '')) for block in text_blocks)

        return 'txt' if total_chars >= char_threshold else 'ocr'
    except Exception as e:
        # Best-effort: fall back to OCR on any ordinary error, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare `except:` would,
        # and leave a trace of why the fallback happened.
        logger.debug(f"detect_page_type failed for page {page_idx}, falling back to 'ocr': {e}")
        return 'ocr'
|