|
|
@@ -11,7 +11,7 @@ PDF文本提取模块
|
|
|
- 自动rotation处理:自动应用PDF页面旋转变换
|
|
|
- 返回图片rotation(逆时针定义):对外统一使用图片处理标准
|
|
|
"""
|
|
|
-from typing import Dict, List, Any, Tuple
|
|
|
+from typing import Dict, List, Any, Tuple, Optional
|
|
|
from loguru import logger
|
|
|
|
|
|
# 导入坐标转换函数
|
|
|
@@ -231,34 +231,43 @@ def extract_text_from_pdf_fitz(
|
|
|
def extract_all_text_blocks(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
- scale: float
|
|
|
+ scale: float,
|
|
|
+ return_upright_coords: bool = True,
|
|
|
) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
"""
|
|
|
- 提取页面所有文本块(支持 pypdfium2 和 fitz)+ PDF rotation处理
|
|
|
+ 提取页面所有文本块(支持 pypdfium2 和 fitz)
|
|
|
+
|
|
|
+ **统一对外输出逻辑**:
|
|
|
+ - return_upright_coords=True: 返回正视坐标(推荐,与OCR坐标系一致)
|
|
|
+ - return_upright_coords=False: 返回旋转后坐标(与PDF rotation匹配)
|
|
|
|
|
|
Args:
|
|
|
pdf_doc: PDF文档对象
|
|
|
page_idx: 页码索引(0-based)
|
|
|
scale: 缩放比例
|
|
|
+ return_upright_coords: 是否返回正视坐标
|
|
|
+ True=正视坐标(默认,推荐)
|
|
|
+ False=旋转后坐标
|
|
|
|
|
|
Returns:
|
|
|
(text_blocks, rotation_angle)
|
|
|
- text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...]
|
|
|
- bbox坐标已转换为渲染图像坐标系(与OCR坐标系一致)
|
|
|
- rotation_angle: 图片旋转角度(0/90/180/270),逆时针定义
|
|
|
"""
|
|
|
+
|
|
|
doc_type = detect_pdf_doc_type(pdf_doc)
|
|
|
|
|
|
if doc_type == 'fitz':
|
|
|
- return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
|
|
|
+ return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale, return_upright_coords)
|
|
|
else:
|
|
|
- return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
|
|
|
+ return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale, return_upright_coords)
|
|
|
|
|
|
|
|
|
def extract_all_text_blocks_pypdfium2(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
- scale: float
|
|
|
+ scale: float,
|
|
|
+ return_upright_coords: bool = True
|
|
|
) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
"""
|
|
|
使用 pypdfium2 提取所有文本块并处理rotation
|
|
|
@@ -267,6 +276,7 @@ def extract_all_text_blocks_pypdfium2(
|
|
|
pdf_doc: pypdfium2.PdfDocument 对象
|
|
|
page_idx: 页码索引
|
|
|
scale: 缩放比例
|
|
|
+ return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后)
|
|
|
|
|
|
Returns:
|
|
|
(text_blocks, rotation_angle)
|
|
|
@@ -284,7 +294,7 @@ def extract_all_text_blocks_pypdfium2(
|
|
|
pdf_height = page_dict.get('height', 0)
|
|
|
|
|
|
if rotation != 0:
|
|
|
- logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")
|
|
|
+ logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
|
|
|
|
|
|
extracted_blocks = []
|
|
|
|
|
|
@@ -305,9 +315,9 @@ def extract_all_text_blocks_pypdfium2(
|
|
|
else:
|
|
|
continue
|
|
|
|
|
|
- # 应用rotation坐标转换
|
|
|
+ # pypdfium2返回旋转后坐标,根据 return_upright_coords 决定是否转换为正视坐标
|
|
|
img_bbox = transform_bbox_for_rotation_pypdfium2(
|
|
|
- line_bbox, rotation, pdf_width, pdf_height, scale
|
|
|
+ line_bbox, rotation, pdf_width, pdf_height, scale, to_upright=return_upright_coords
|
|
|
)
|
|
|
|
|
|
extracted_blocks.append({
|
|
|
@@ -330,7 +340,8 @@ def extract_all_text_blocks_pypdfium2(
|
|
|
def extract_all_text_blocks_fitz(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
- scale: float
|
|
|
+ scale: float,
|
|
|
+ return_upright_coords: bool = True
|
|
|
) -> Tuple[List[Dict[str, Any]], int]:
|
|
|
"""
|
|
|
使用 fitz 提取所有文本块并处理rotation
|
|
|
@@ -339,6 +350,7 @@ def extract_all_text_blocks_fitz(
|
|
|
pdf_doc: fitz.Document 对象
|
|
|
page_idx: 页码索引
|
|
|
scale: 缩放比例
|
|
|
+ return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后)
|
|
|
|
|
|
Returns:
|
|
|
(text_blocks, rotation_angle)
|
|
|
@@ -366,7 +378,7 @@ def extract_all_text_blocks_fitz(
|
|
|
pdf_height = page.rect.height
|
|
|
|
|
|
if rotation != 0:
|
|
|
- logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")
|
|
|
+ logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
|
|
|
|
|
|
# 使用 get_text("dict") 获取详细的文本信息
|
|
|
text_dict = page.get_text("dict")
|
|
|
@@ -391,9 +403,10 @@ def extract_all_text_blocks_fitz(
|
|
|
if not line_text.strip() or not line_bbox:
|
|
|
continue
|
|
|
|
|
|
- # 应用rotation坐标转换
|
|
|
+ # fitz返回正视坐标,根据 return_upright_coords 决定是否转换为旋转后坐标
|
|
|
+ to_rotated = not return_upright_coords # 反转逻辑
|
|
|
img_bbox = transform_bbox_for_rotation_fitz(
|
|
|
- list(line_bbox), rotation, pdf_width, pdf_height, scale
|
|
|
+ list(line_bbox), rotation, pdf_width, pdf_height, scale, to_rotated=to_rotated
|
|
|
)
|
|
|
|
|
|
extracted_blocks.append({
|
|
|
@@ -413,6 +426,44 @@ def extract_all_text_blocks_fitz(
|
|
|
return [], 0
|
|
|
|
|
|
|
|
|
+def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
|
|
|
+ """
|
|
|
+ Get the rotation angle of a PDF page (counter-clockwise definition, for image rotation).
|
|
|
+
|
|
|
+ The returned angle is meant for image-rotation functions such as PIL.rotate(). NOTE(review): the example below negates it; confirm the sign convention.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ pdf_doc: PDF document object (pypdfium2.PdfDocument or fitz.Document)
|
|
|
+ page_idx: Page index (0-based)
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Rotation angle: 0/90/180/270 (counter-clockwise); 0 is also returned when the lookup fails
|
|
|
+
|
|
|
+ Examples:
|
|
|
+ >>> pdf_doc = fitz.open("test.pdf")
|
|
|
+ >>> rotate_angle = get_page_rotation(pdf_doc, 0)
|
|
|
+ >>> if rotate_angle != 0:
|
|
|
+ ... image = image.rotate(-rotate_angle, expand=True) # rotate to upright
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ doc_type = detect_pdf_doc_type(pdf_doc)
|
|
|
+
|
|
|
+ # Read the PDF rotation attribute (clockwise definition)
|
|
|
+ if doc_type == "pypdfium2":
|
|
|
+ pdf_rotation = pdf_doc[page_idx].get_rotation()
|
|
|
+ else: # fitz
|
|
|
+ pdf_rotation = pdf_doc[page_idx].rotation
|
|
|
+
|
|
|
+ # Convert to image rotation (counter-clockwise definition)
|
|
|
+ image_rotation = pdf_rotation_to_image_rotation(pdf_rotation)
|
|
|
+
|
|
|
+ return image_rotation
|
|
|
+
|
|
|
+ except Exception as e: # best-effort: any failure is logged and treated as no rotation
|
|
|
+ logger.warning(f"Failed to get page rotation for page {page_idx}: {e}")
|
|
|
+ return 0
|
|
|
+
|
|
|
+
|
|
|
def detect_page_type(
|
|
|
pdf_doc: Any,
|
|
|
page_idx: int,
|
|
|
@@ -424,6 +475,7 @@ def detect_page_type(
|
|
|
基于字符密度的简单可靠方法
|
|
|
"""
|
|
|
try:
|
|
|
+ # Uses the default return_upright_coords=True; only characters are counted here, so the coordinate system does not matter
|
|
|
text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
|
|
|
total_chars = sum(len(block.get('text', '')) for block in text_blocks)
|
|
|
|