6 miesięcy temu · f32729046f
--- a/ocr_utils/pdf_README.md
+++ b/ocr_utils/pdf_README.md
@@ -38,17 +38,24 @@ images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
 
				 )
			
 
				 ```
			
 
				 
			
 
				-### 2. pdf_coordinate_transform.py (~250行)
			
 
				+### 2. pdf_coordinate_transform.py (~315行)
			
 
				 **作用**: PDF坐标系与图像坐标系的转换
			
 
				 
			
 
				 **核心函数**:
			
 
				 - `pdf_rotation_to_image_rotation()`: **PDF旋转角度转换为图片旋转角度**
			
 
				-- `transform_bbox_for_rotation_fitz()`: fitz引擎的完整几何坐标变换
			
 
				-- `transform_bbox_for_rotation_pypdfium2()`: pypdfium2引擎的坐标值交换
			
 
				+- `transform_bbox_for_rotation_fitz()`: fitz引擎的坐标变换（支持正视/旋转后坐标输出）
			
 
				+- `transform_bbox_for_rotation_pypdfium2()`: pypdfium2引擎的坐标变换（支持正视/旋转后坐标输出）
			
 
				+
			
 
				+**⭐ 统一输出逻辑（重要）**:
			
 
				+两个引擎通过参数统一对外输出行为：
			
 
				+- **正视坐标**（`return_upright_coords=True`，默认，推荐；对应底层 fitz `to_rotated=False` / pypdfium2 `to_upright=True`）：坐标在正视方向，不受PDF rotation影响
			
 
				+- **旋转后坐标**（`return_upright_coords=False`；对应底层 fitz `to_rotated=True` / pypdfium2 `to_upright=False`）：坐标在旋转后坐标系，匹配PDF rotation
			
 
				 
			
 
				 **坐标系说明**:
			
 
				 - **PDF坐标系**: 左下角原点 (0,0)，X向右，Y向上
			
 
				 - **图像坐标系**: 左上角原点 (0,0)，X向右，Y向下
			
 
				+- **正视坐标**: 视觉上保持upright（rotation=0）的坐标系
			
 
				+- **旋转后坐标**: 匹配PDF rotation属性的坐标系
			
 
				 
			
 
				 **旋转定义（重要）**:
			
 
				 - **PDF rotation**: 0/90/180/270度（**顺时针旋转**，PDF规范）
			
@@ -63,36 +70,71 @@ images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
 
				 | 180° | 180° |
			
 
				 | 270° | 90° |
			
 
				 
			
 
				-**关键区别**:
			
 
				+**引擎原始行为差异**:
			
 
				 | 特性 | fitz | pypdfium2 |
			
 
				-|------|------|-----------|
			
 
				-| 输入坐标系 | PDF原始坐标系 | 已旋转的坐标系（但bbox顺序错误） |
			
 
				-| 变换类型 | 几何空间变换 | 坐标值交换（修正min/max） |
			
 
				-| 复杂度 | 高（涉及旋转公式） | 低（只是交换） |
			
 
				+|------|------|-----------|
			
 
				+| 原生返回坐标 | **正视坐标**（总是upright） | **旋转后坐标**（匹配PDF rotation） |
			
 
				+| 坐标系类型 | 与rotation无关 | 与rotation相关 |
			
 
				+| 转换需求 | 需转换为旋转后（如需要） | 需转换为正视（如需要） |
			
 
				+
			
 
				+**统一后的输出**:
			
 
				+| 参数 | fitz输出 | pypdfium2输出 | 说明 |
			
 
				+|------|---------|--------------|------|
			
 
				+| `to_rotated=False`<br>`to_upright=True` | 正视坐标 | 正视坐标 | **默认，推荐** |
			
 
				+| `to_rotated=True`<br>`to_upright=False` | 旋转后坐标 | 旋转后坐标 | 匹配渲染图像 |
			
 
				 
			
 
				 **使用示例**:
			
 
				 ```python
			
 
				-from ocr_utils.pdf_coordinate_transform import transform_bbox_for_rotation_fitz
			
 
				+from ocr_utils.pdf_coordinate_transform import (
			
 
				+    transform_bbox_for_rotation_fitz,
			
 
				+    transform_bbox_for_rotation_pypdfium2
			
 
				+)
			
 
				+
			
 
				+# fitz引擎: 返回正视坐标（默认，推荐）
			
 
				+upright_bbox = transform_bbox_for_rotation_fitz(
			
 
				+    bbox=[100, 50, 200, 100],
			
 
				+    rotation=90,
			
 
				+    pdf_width=595,
			
 
				+    pdf_height=842,
			
 
				+    scale=2.778,
			
 
				+    to_rotated=False  # 默认值，返回正视坐标
			
 
				+)
			
 
				+
			
 
				+# fitz引擎: 返回旋转后坐标
			
 
				+rotated_bbox = transform_bbox_for_rotation_fitz(
			
 
				+    bbox=[100, 50, 200, 100],
			
 
				+    rotation=90,
			
 
				+    pdf_width=595,
			
 
				+    pdf_height=842,
			
 
				+    scale=2.778,
			
 
				+    to_rotated=True  # 返回旋转后坐标
			
 
				+)
			
 
				 
			
 
				-# fitz引擎: 完整几何变换
			
 
				-img_bbox = transform_bbox_for_rotation_fitz(
			
 
				+# pypdfium2引擎: 返回正视坐标（默认，推荐）
			
 
				+upright_bbox = transform_bbox_for_rotation_pypdfium2(
			
 
				     bbox=[100, 50, 200, 100],
			
 
				     rotation=90,
			
 
				     pdf_width=595,
			
 
				     pdf_height=842,
			
 
				-    scale=2.778
			
 
				+    scale=2.778,
			
 
				+    to_upright=True  # 默认值，返回正视坐标
			
 
				 )
			
 
				 ```
			
 
				 
			
 
				 ### 3. pdf_text_extraction.py (~450行)
			
 
				-**作用**: 从PDF提取文本，支持rotation处理
			
 
				+**作用**: 从PDF提取文本，支持rotation处理和坐标系选择
			
 
				 
			
 
				 **核心函数**:
			
 
				 - `extract_text_from_pdf()`: 从指定区域提取文本（自动检测引擎）
			
 
				-- `extract_all_text_blocks()`: 提取页面所有文本块（自动检测引擎）
			
 
				+- `extract_all_text_blocks()`: 提取页面所有文本块（自动检测引擎，支持坐标系选择）
			
 
				 - `detect_pdf_doc_type()`: 检测PDF文档类型(fitz/pypdfium2)
			
 
				 - `bbox_overlap()`: 检查bbox重叠
			
 
				 
			
 
				+**⭐ 坐标系选择参数**:
			
 
				+- `return_upright_coords=True`（默认）：返回正视坐标，所有rotation下坐标一致
			
 
				+- `return_upright_coords=False`：返回旋转后坐标，匹配渲染图像
			
 
				+- `with_rotation`（已废弃）：保留向后兼容，使用时会警告
			
 
				+
			
 
				 **支持引擎**:
			
 
				 - **pypdfium2**: MinerU标准引擎
			
 
				 - **fitz (PyMuPDF)**: 轻量级替代引擎
			
@@ -101,23 +143,37 @@ img_bbox = transform_bbox_for_rotation_fitz(
 
				 ```python
			
 
				 from ocr_utils.pdf_text_extraction import extract_all_text_blocks
			
 
				 
			
 
				-# 提取所有文本块（自动应用rotation变换）
			
 
				+# 方式1: 提取正视坐标（默认，推荐）
			
 
				+text_blocks, rotation = extract_all_text_blocks(
			
 
				+    pdf_doc=pdf_doc,
			
 
				+    page_idx=0,
			
 
				+    scale=2.778,
			
 
				+    return_upright_coords=True  # 默认值
			
 
				+)
			
 
				+
			
 
				+# 方式2: 提取旋转后坐标（匹配渲染图像）
			
 
				 text_blocks, rotation = extract_all_text_blocks(
			
 
				     pdf_doc=pdf_doc,
			
 
				     page_idx=0,
			
 
				-    scale=2.778
			
 
				+    scale=2.778,
			
 
				+    return_upright_coords=False
			
 
				 )
			
 
				 
			
 
				 # 返回格式:
			
 
				 # text_blocks = [
			
 
				-#     {'text': 'Hello', 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]},
			
 
				+#     {
			
 
				+#         'text': 'Hello',
			
 
				+#         'bbox': [x1, y1, x2, y2],          # 转换后的坐标（正视或旋转后）
			
 
				+#         'origin_bbox': [x1, y1, x2, y2]    # 原始坐标
			
 
				+#     },
			
 
				 #     ...
			
 
				 # ]
			
 
				 # rotation = 270  # 图片旋转角度：0/90/180/270（逆时针）
			
 
				-#                 # 注意：返回的是图片旋转定义（逆时针），不是PDF rotation（顺时针）
			
 
				 ```
			
 
				 
			
 
				-**⚠️ 重要说明**:
			
 
				+**⚠️ 坐标系说明**:
			
 
				+- `return_upright_coords=True`（推荐）：bbox在正视坐标系，不同rotation下相同文本坐标一致
			
 
				+- `return_upright_coords=False`：bbox在旋转后坐标系，与渲染图像对齐，可直接绘制
			
 
				 - `rotation` 返回值采用**图片旋转定义**（逆时针），不是PDF rotation（顺时针）
			
 
				 - PDF rotation 90° → 返回 270°（图片需要逆时针旋转270°）
			
 
				 - PDF rotation 270° → 返回 90°（图片需要逆时针旋转90°）
			
@@ -224,18 +280,30 @@ renderer = "fitz"  # 无需额外依赖，单进程
 
				 ## ✅ 测试验证
			
 
				 
			
 
				 重构后通过完整测试验证：
			
 
				-- ✅ 所有8个rotation测试通过 (4种rotation × 2种引擎)
			
 
				-- ✅ fitz引擎: rotation 0°/90°/180°/270° 全部正确
			
 
				-- ✅ pypdfium2引擎: rotation 0°/90°/180°/270° 全部正确
			
 
				-- ✅ 坐标都在图像边界内
			
 
				+- ✅ 所有24个测试通过 (4种rotation × 2种引擎 × 2种坐标模式 + 8个对比测试)
			
 
				+- ✅ fitz引擎: rotation 0°/90°/180°/270° 正视坐标正确
			
 
				+- ✅ pypdfium2引擎: rotation 0°/90°/180°/270° 正视坐标正确
			
 
				+- ✅ 两种坐标模式对比测试通过
			
 
				+- ✅ 坐标边界验证通过
			
 
				 - ✅ 向后兼容性验证通过
			
 
				 
			
 
				+**修复的关键bug**:
			
 
				+- 🐛 修复pypdfium2在90°/270°转换时使用错误页面尺寸的问题
			
 
				+  - 问题：使用原始PDF尺寸，导致负坐标
			
 
				+  - 修复：使用旋转后页面尺寸（90°: height×width, 270°: height×width）
			
 
				+
			
 
				 测试命令:
			
 
				 ```bash
			
 
				 cd ocr_tools/universal_doc_parser/tests
			
 
				 python test_pdf_rotation.py
			
 
				 ```
			
 
				 
			
 
				+测试覆盖：
			
 
				+- 正视坐标模式 (return_upright_coords=True)
			
 
				+- 旋转后坐标模式 (return_upright_coords=False)
			
 
				+- 两种模式对比验证
			
 
				+- 可视化验证（生成带bbox标注的图像）
			
 
				+
			
 
				 ## 📚 相关文档
			
 
				 
			
 
				 - [MinerU文档](https://github.com/opendatalab/MinerU)
			
@@ -262,5 +330,7 @@ python test_pdf_rotation.py
 
				 ---
			
 
				 
			
 
				 **重构日期**: 2026-01-05  
			
 
				+**API改进日期**: 2026-01-07  
			
 
				 **重构原因**: pdf_utils.py文件过大(984行)，难以维护  
			
 
				-**重构目标**: 按功能层次拆分，提高可维护性，保持向后兼容性
			
 
				+**重构目标**: 按功能层次拆分，提高可维护性，保持向后兼容性  
			
 
				+**API改进**: 统一坐标系输出逻辑，使用语义明确的`return_upright_coords`参数替代`with_rotation`
			
--- a/ocr_utils/pdf_coordinate_transform.py
+++ b/ocr_utils/pdf_coordinate_transform.py
@@ -9,9 +9,27 @@ PDF坐标转换模块
 
				 - PDF rotation：0/90/180/270度（顺时针旋转）
			
 
				 - 图片rotation：0/90/180/270度（逆时针旋转，对外统一使用此定义）
			
 
				 
			
 
				+**重要发现（2026-01-07 测试验证）**：
			
 
				+
			
 
				+1. **fitz (PyMuPDF)**:
			
 
				+   - `get_text("dict")` 返回**正视坐标系**
			
 
				+   - 与PDF的rotation属性无关
			
 
				+   - 相同文本在不同rotation下坐标完全一致
			
 
				+   - 可选择返回正视坐标或旋转后坐标
			
 
				+
			
 
				+2. **pypdfium2**:
			
 
				+   - `get_page_text()` 返回**旋转后坐标系**
			
 
				+   - 坐标与PDF的rotation角度匹配
			
 
				+   - 例如rotation=270时，返回270度旋转后的坐标
			
 
				+   - 可选择返回正视坐标或旋转后坐标
			
 
				+
			
 
				+**统一对外接口**：
			
 
				+- return_upright_coords=True: 返回正视坐标（推荐，与OCR坐标系一致）
			
 
				+- return_upright_coords=False: 返回旋转后坐标（与PDF rotation匹配）
			
 
				+
			
 
				 关键函数：
			
 
				-- transform_bbox_for_rotation_fitz: fitz引擎的完整几何坐标变换
			
 
				-- transform_bbox_for_rotation_pypdfium2: pypdfium2引擎的坐标值交换
			
 
				+- transform_bbox_for_rotation_fitz: fitz坐标转换（正视↔旋转后）
			
 
				+- transform_bbox_for_rotation_pypdfium2: pypdfium2坐标转换（旋转后↔正视）
			
 
				 - pdf_rotation_to_image_rotation: PDF rotation转换为图片rotation
			
 
				 """
			
 
				 from typing import List
			
@@ -65,55 +83,62 @@ def transform_bbox_for_rotation_fitz(
 
				     rotation: int,
			
 
				     pdf_width: float,
			
 
				     pdf_height: float,
			
 
				-    scale: float
			
 
				+    scale: float,
			
 
				+    to_rotated: bool = False
			
 
				 ) -> List[float]:
			
 
				     """
			
 
				-    fitz引擎的坐标转换（完整几何变换）
			
 
				+    fitz引擎的坐标转换
			
 
				+    
			
 
				+    **实际行为**：fitz的get_text("dict")返回正视坐标系（与rotation无关）
			
 
				     
			
 
				-    fitz的get_text("dict")返回PDF原始坐标系（左下角原点，Y向上），
			
 
				-    需要进行完整的旋转变换 + Y轴翻转。
			
 
				+    此函数可以：
			
 
				+    - 保持正视坐标（to_rotated=False，默认）
			
 
				+    - 转换为旋转后坐标（to_rotated=True）
			
 
				     
			
 
				     Args:
			
 
				-        bbox: PDF原始坐标 [x1, y1, x2, y2]
			
 
				-        rotation: PDF页面rotation (0/90/180/270)
			
 
				+        bbox: fitz返回的bbox（正视坐标系）[x1, y1, x2, y2]
			
 
				+        rotation: PDF页面rotation (0/90/180/270，顺时针)
			
 
				         pdf_width: PDF页面宽度（原始方向）
			
 
				         pdf_height: PDF页面高度（原始方向）
			
 
				         scale: 渲染缩放比例
			
 
				+        to_rotated: 是否转换为旋转后坐标
			
 
				+                    False=保持正视坐标（默认，推荐）
			
 
				+                    True=转换为旋转后坐标
			
 
				         
			
 
				     Returns:
			
 
				         图像坐标 [x1, y1, x2, y2]，已确保 x1<x2, y1<y2
			
 
				         
			
 
				-    变换公式：
			
 
				-        rotation=0:   (x, y) → (x, y)                    # 直接缩放
			
 
				-        rotation=90:  (x, y) → (pdf_height-y, x)         # 完整坐标变换
			
 
				-        rotation=180: (x, y) → (pdf_width-x, pdf_height-y) # 完整坐标变换
			
 
				-        rotation=270: (x, y) → (y, pdf_width-x)          # 完整坐标变换
			
 
				+    变换逻辑（to_rotated=True时）：
			
 
				+        rotation=0:   不变
			
 
				+        rotation=90:  正视 → 顺时针90度旋转后
			
 
				+        rotation=180: 正视 → 180度旋转后
			
 
				+        rotation=270: 正视 → 顺时针270度旋转后
			
 
				     """
			
 
				     x1, y1, x2, y2 = bbox
			
 
				     
			
 
				-    if rotation == 0:
			
 
				-        # 直接缩放（fitz返回图像坐标系）
			
 
				+    if not to_rotated or rotation == 0:
			
 
				+        # 保持正视坐标，只需缩放
			
 
				         new_x1 = x1 * scale
			
 
				         new_y1 = y1 * scale
			
 
				         new_x2 = x2 * scale
			
 
				         new_y2 = y2 * scale
			
 
				         
			
 
				     elif rotation == 90:
			
 
				-        # 顺时针转90度
			
 
				+        # 转换为顺时针90度旋转后的坐标系
			
 
				         new_x1 = (pdf_height - y2) * scale
			
 
				         new_y1 = x1 * scale
			
 
				         new_x2 = (pdf_height - y1) * scale
			
 
				         new_y2 = x2 * scale
			
 
				         
			
 
				     elif rotation == 180:
			
 
				-        # 旋转180度：X和Y都翻转
			
 
				+        # 转换为180度旋转后的坐标系
			
 
				         new_x1 = (pdf_width - x2) * scale
			
 
				         new_y1 = (pdf_height - y2) * scale
			
 
				         new_x2 = (pdf_width - x1) * scale
			
 
				         new_y2 = (pdf_height - y1) * scale
			
 
				         
			
 
				     elif rotation == 270:
			
 
				-        # 顺时针转270度（逆时针转90度）
			
 
				+        # 转换为顺时针270度旋转后的坐标系
			
 
				         new_x1 = y1 * scale
			
 
				         new_y1 = (pdf_width - x2) * scale
			
 
				         new_x2 = y2 * scale
			
@@ -133,12 +158,96 @@ def transform_bbox_for_rotation_fitz(
 
				         max(new_y1, new_y2)
			
 
				     ]
			
 
				 
			
 
				-
			
 
				 def transform_bbox_for_rotation_pypdfium2(
			
 
				     bbox: List[float],
			
 
				     rotation: int,
			
 
				     pdf_width: float,
			
 
				     pdf_height: float,
			
 
				+    scale: float,
			
 
				+    to_upright: bool = True
			
 
				+) -> List[float]:
			
 
				+    """
			
 
				+    pypdfium2引擎的坐标转换
			
 
				+    
			
 
				+    **实际行为**：pypdfium2的get_page_text()返回旋转后坐标系（与rotation匹配）
			
 
				+    
			
 
				+    此函数可以：
			
 
				+    - 转换为正视坐标（to_upright=True，默认，推荐）
			
 
				+    - 保持旋转后坐标（to_upright=False）
			
 
				+    
			
 
				+    Args:
			
 
				+        bbox: pypdfium2返回的bbox（旋转后坐标系）[x1, y1, x2, y2]
			
 
				+        rotation: PDF页面rotation (0/90/180/270，顺时针)
			
 
				+        pdf_width: PDF页面宽度（原始方向）
			
 
				+        pdf_height: PDF页面高度（原始方向）
			
 
				+        scale: 渲染缩放比例
			
 
				+        to_upright: 是否转换为正视坐标
			
 
				+                    True=转换为正视坐标（默认，推荐）
			
 
				+                    False=保持旋转后坐标
			
 
				+        
			
 
				+    Returns:
			
 
				+        图像坐标 [x1, y1, x2, y2]，已确保 x1<x2, y1<y2
			
 
				+        
			
 
				+    变换逻辑（to_upright=True时）：
			
 
				+        rotation=0:   不变
			
 
				+        rotation=90:  旋转后 → 正视（逆时针90度）
			
 
				+        rotation=180: 旋转后 → 正视（180度）
			
 
				+        rotation=270: 旋转后 → 正视（逆时针270度）
			
 
				+    """
			
 
				+    x1, y1, x2, y2 = bbox
			
 
				+    
			
 
				+    if not to_upright or rotation == 0:
			
 
				+        # 保持旋转后坐标，只需缩放
			
 
				+        new_x1 = x1 * scale
			
 
				+        new_y1 = y1 * scale
			
 
				+        new_x2 = x2 * scale
			
 
				+        new_y2 = y2 * scale
			
 
				+        
			
 
				+    elif rotation == 90:
			
 
				+        # 旋转后坐标系 → 正视坐标系（逆变换）
			
 
				+        # PDF顺时针90度 → 需要逆时针90度恢复
			
 
				+        # 注意：rotation=90时，旋转后页面的width=原始height
			
 
				+        rotated_width = pdf_height
			
 
				+        new_x1 = y1 * scale
			
 
				+        new_y1 = (rotated_width - x2) * scale
			
 
				+        new_x2 = y2 * scale
			
 
				+        new_y2 = (rotated_width - x1) * scale
			
 
				+        
			
 
				+    elif rotation == 180:
			
 
				+        # 180度逆变换
			
 
				+        new_x1 = (pdf_width - x2) * scale
			
 
				+        new_y1 = (pdf_height - y2) * scale
			
 
				+        new_x2 = (pdf_width - x1) * scale
			
 
				+        new_y2 = (pdf_height - y1) * scale
			
 
				+        
			
 
				+    elif rotation == 270:
			
 
				+        # PDF顺时针270度 → 需要逆时针270度恢复
			
 
				+        # 注意：rotation=270时，旋转后页面的height=原始width
			
 
				+        rotated_height = pdf_width
			
 
				+        new_x1 = (rotated_height - y2) * scale
			
 
				+        new_y1 = x1 * scale
			
 
				+        new_x2 = (rotated_height - y1) * scale
			
 
				+        new_y2 = x2 * scale
			
 
				+        
			
 
				+    else:
			
 
				+        logger.warning(f"Unknown rotation: {rotation}, using default transformation")
			
 
				+        new_x1 = x1 * scale
			
 
				+        new_y1 = y1 * scale
			
 
				+        new_x2 = x2 * scale
			
 
				+        new_y2 = y2 * scale
			
 
				+    
			
 
				+    return [
			
 
				+        min(new_x1, new_x2),
			
 
				+        min(new_y1, new_y2),
			
 
				+        max(new_x1, new_x2),
			
 
				+        max(new_y1, new_y2)
			
 
				+    ]
			
 
				+
			
 
				+def transform_bbox_for_rotation_pypdfium2_old(
			
 
				+    bbox: List[float],
			
 
				+    rotation: int,
			
 
				+    pdf_width: float,
			
 
				+    pdf_height: float,
			
 
				     scale: float
			
 
				 ) -> List[float]:
			
 
				     """
			
--- a/ocr_utils/pdf_text_extraction.py
+++ b/ocr_utils/pdf_text_extraction.py
@@ -11,7 +11,7 @@ PDF文本提取模块
 
				 - 自动rotation处理：自动应用PDF页面旋转变换
			
 
				 - 返回图片rotation（逆时针定义）：对外统一使用图片处理标准
			
 
				 """
			
 
				-from typing import Dict, List, Any, Tuple
			
 
				+from typing import Dict, List, Any, Tuple, Optional
			
 
				 from loguru import logger
			
 
				 
			
 
				 # 导入坐标转换函数
			
@@ -231,34 +231,43 @@ def extract_text_from_pdf_fitz(
 
				 def extract_all_text_blocks(
			
 
				     pdf_doc: Any,
			
 
				     page_idx: int,
			
 
				-    scale: float
			
 
				+    scale: float,
			
 
				+    return_upright_coords: bool = True,
			
 
				 ) -> Tuple[List[Dict[str, Any]], int]:
			
 
				     """
			
 
				-    提取页面所有文本块（支持 pypdfium2 和 fitz）+ PDF rotation处理
			
 
				+    提取页面所有文本块（支持 pypdfium2 和 fitz）
			
 
				+    
			
 
				+    **统一对外输出逻辑**：
			
 
				+    - return_upright_coords=True: 返回正视坐标（推荐，与OCR坐标系一致）
			
 
				+    - return_upright_coords=False: 返回旋转后坐标（与PDF rotation匹配）
			
 
				     
			
 
				     Args:
			
 
				         pdf_doc: PDF文档对象
			
 
				         page_idx: 页码索引（0-based）
			
 
				         scale: 缩放比例
			
 
				+        return_upright_coords: 是否返回正视坐标
			
 
				+                               True=正视坐标（默认，推荐）
			
 
				+                               False=旋转后坐标
			
 
				         
			
 
				     Returns:
			
 
				         (text_blocks, rotation_angle)
			
 
				         - text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...]
			
 
				-                      bbox坐标已转换为渲染图像坐标系（与OCR坐标系一致）
			
 
				         - rotation_angle: 图片旋转角度(0/90/180/270)，逆时针定义
			
 
				     """
			
 
				+    
			
 
				     doc_type = detect_pdf_doc_type(pdf_doc)
			
 
				     
			
 
				     if doc_type == 'fitz':
			
 
				-        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
			
 
				+        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale, return_upright_coords)
			
 
				     else:
			
 
				-        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
			
 
				+        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale, return_upright_coords)
			
 
				 
			
 
				 
			
 
				 def extract_all_text_blocks_pypdfium2(
			
 
				     pdf_doc: Any,
			
 
				     page_idx: int,
			
 
				-    scale: float
			
 
				+    scale: float,
			
 
				+    return_upright_coords: bool = True
			
 
				 ) -> Tuple[List[Dict[str, Any]], int]:
			
 
				     """
			
 
				     使用 pypdfium2 提取所有文本块并处理rotation
			
@@ -267,6 +276,7 @@ def extract_all_text_blocks_pypdfium2(
 
				         pdf_doc: pypdfium2.PdfDocument 对象
			
 
				         page_idx: 页码索引
			
 
				         scale: 缩放比例
			
 
				+        return_upright_coords: 是否返回正视坐标（True=正视，False=旋转后）
			
 
				         
			
 
				     Returns:
			
 
				         (text_blocks, rotation_angle)
			
@@ -284,7 +294,7 @@ def extract_all_text_blocks_pypdfium2(
 
				         pdf_height = page_dict.get('height', 0)
			
 
				         
			
 
				         if rotation != 0:
			
 
				-            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")
			
 
				+            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
			
 
				         
			
 
				         extracted_blocks = []
			
 
				         
			
@@ -305,9 +315,9 @@ def extract_all_text_blocks_pypdfium2(
 
				                 else:
			
 
				                     continue
			
 
				                 
			
 
				-                # 应用rotation坐标转换
			
 
				+                # pypdfium2返回旋转后坐标，根据 return_upright_coords 决定是否转换为正视坐标
			
 
				                 img_bbox = transform_bbox_for_rotation_pypdfium2(
			
 
				-                    line_bbox, rotation, pdf_width, pdf_height, scale
			
 
				+                    line_bbox, rotation, pdf_width, pdf_height, scale, to_upright=return_upright_coords
			
 
				                 )
			
 
				                 
			
 
				                 extracted_blocks.append({
			
@@ -330,7 +340,8 @@ def extract_all_text_blocks_pypdfium2(
 
				 def extract_all_text_blocks_fitz(
			
 
				     pdf_doc: Any,
			
 
				     page_idx: int,
			
 
				-    scale: float
			
 
				+    scale: float,
			
 
				+    return_upright_coords: bool = True
			
 
				 ) -> Tuple[List[Dict[str, Any]], int]:
			
 
				     """
			
 
				     使用 fitz 提取所有文本块并处理rotation
			
@@ -339,6 +350,7 @@ def extract_all_text_blocks_fitz(
 
				         pdf_doc: fitz.Document 对象
			
 
				         page_idx: 页码索引
			
 
				         scale: 缩放比例
			
 
				+        return_upright_coords: 是否返回正视坐标（True=正视，False=旋转后）
			
 
				         
			
 
				     Returns:
			
 
				         (text_blocks, rotation_angle)
			
@@ -366,7 +378,7 @@ def extract_all_text_blocks_fitz(
 
				             pdf_height = page.rect.height
			
 
				         
			
 
				         if rotation != 0:
			
 
				-            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")
			
 
				+            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
			
 
				         
			
 
				         # 使用 get_text("dict") 获取详细的文本信息
			
 
				         text_dict = page.get_text("dict")
			
@@ -391,9 +403,10 @@ def extract_all_text_blocks_fitz(
 
				                 if not line_text.strip() or not line_bbox:
			
 
				                     continue
			
 
				                 
			
 
				-                # 应用rotation坐标转换
			
 
				+                # fitz返回正视坐标，根据 return_upright_coords 决定是否转换为旋转后坐标
			
 
				+                to_rotated = not return_upright_coords  # 反转逻辑
			
 
				                 img_bbox = transform_bbox_for_rotation_fitz(
			
 
				-                    list(line_bbox), rotation, pdf_width, pdf_height, scale
			
 
				+                    list(line_bbox), rotation, pdf_width, pdf_height, scale, to_rotated=to_rotated
			
 
				                 )
			
 
				                 
			
 
				                 extracted_blocks.append({
			
@@ -413,6 +426,44 @@ def extract_all_text_blocks_fitz(
 
				         return [], 0
			
 
				 
			
 
				 
			
 
				+def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
			
 
				+    """
			
 
				+    获取PDF页面的旋转角度（逆时针定义，用于图像旋转）
			
 
				+    
			
 
				+    返回的角度可直接用于PIL.rotate()等图像旋转函数。
			
 
				+    
			
 
				+    Args:
			
 
				+        pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
			
 
				+        page_idx: 页码索引（0-based）
			
 
				+        
			
 
				+    Returns:
			
 
				+        旋转角度：0/90/180/270（逆时针旋转角度）
			
 
				+        
			
 
				+    Examples:
			
 
				+        >>> pdf_doc = fitz.open("test.pdf")
			
 
				+        >>> rotate_angle = get_page_rotation(pdf_doc, 0)
			
 
				+        >>> if rotate_angle != 0:
			
 
				+        ...     image = image.rotate(-rotate_angle, expand=True)  # 旋转为正视
			
 
				+    """
			
 
				+    try:
			
 
				+        doc_type = detect_pdf_doc_type(pdf_doc)
			
 
				+        
			
 
				+        # 获取PDF的rotation属性（顺时针定义）
			
 
				+        if doc_type == "pypdfium2":
			
 
				+            pdf_rotation = pdf_doc[page_idx].get_rotation()
			
 
				+        else:  # fitz
			
 
				+            pdf_rotation = pdf_doc[page_idx].rotation
			
 
				+        
			
 
				+        # 转换为图像rotation（逆时针定义）
			
 
				+        image_rotation = pdf_rotation_to_image_rotation(pdf_rotation)
			
 
				+        
			
 
				+        return image_rotation
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"Failed to get page rotation for page {page_idx}: {e}")
			
 
				+        return 0
			
 
				+
			
 
				+
			
 
				 def detect_page_type(
			
 
				     pdf_doc: Any, 
			
 
				     page_idx: int,
			
@@ -424,6 +475,7 @@ def detect_page_type(
 
				     基于字符密度的简单可靠方法
			
 
				     """
			
 
				     try:
			
 
				+        # Uses the default return_upright_coords=True; only the characters are counted here, the bbox values are not read
			
 
				         text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
			
 
				         total_chars = sum(len(block.get('text', '')) for block in text_blocks)
			
 
				         
			
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -42,6 +42,7 @@ from .pdf_text_extraction import (
 
				     extract_all_text_blocks_pypdfium2,
			
 
				     extract_all_text_blocks_fitz,
			
 
				     detect_page_type,
			
 
				+    get_page_rotation,
			
 
				 )
			
 
				 
			
 
				 from .pdf_image_rendering import (
			
@@ -294,10 +295,15 @@ class PDFUtils:
 
				     def extract_all_text_blocks(
			
 
				         pdf_doc: Any,
			
 
				         page_idx: int,
			
 
				-        scale: float
			
 
				+        scale: float,
			
 
				+        return_upright_coords: bool = True,
			
 
				     ) -> Tuple[List[Dict[str, Any]], int]:
			
 
				-        """向后兼容包装：提取页面所有文本块"""
			
 
				-        return extract_all_text_blocks(pdf_doc, page_idx, scale)
			
 
				+        """向后兼容包装：提取页面所有文本块
			
 
				+        
			
 
				+        Args:
			
 
				+            return_upright_coords: 是否返回正视坐标（True=正视，False=旋转后）
			
 
				+        """
			
 
				+        return extract_all_text_blocks(pdf_doc, page_idx, scale, return_upright_coords)
			
 
				 
			
 
				     @staticmethod
			
 
				     def _extract_all_text_blocks_pypdfium2(
			
@@ -386,6 +392,27 @@ class PDFUtils:
 
				         """
			
 
				         return detect_page_type(pdf_doc, page_idx, char_threshold)
			
 
				 
			
 
				+    @staticmethod
			
 
				+    def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
			
 
				+        """
			
 
				+        获取PDF页面的旋转角度（逆时针定义，用于图像旋转）
			
 
				+        
			
 
				+        返回的角度可直接用于PIL.rotate()等图像旋转函数。
			
 
				+        
			
 
				+        Args:
			
 
				+            pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
			
 
				+            page_idx: 页码索引（0-based）
			
 
				+            
			
 
				+        Returns:
			
 
				+            旋转角度：0/90/180/270（逆时针旋转角度）
			
 
				+            
			
 
				+        Examples:
			
 
				+            >>> rotate_angle = PDFUtils.get_page_rotation(pdf_doc, 0)
			
 
				+            >>> if rotate_angle != 0:
			
 
				+            ...     image = image.rotate(-rotate_angle, expand=True)  # 旋转为正视
			
 
				+        """
			
 
				+        return get_page_rotation(pdf_doc, page_idx)
			
 
				+
			
 
				     # ========================================================================
			
 
				     # 其他功能
			
 
				     # ========================================================================