5 месяцев назад · 58d9568b0f
--- a/ocr_utils/pdf_README.md
+++ b/ocr_utils/pdf_README.md
@@ -0,0 +1,266 @@
 
															+# PDF处理模块重构说明
														
 
															+
														
 
															+## 📦 模块结构
														
 
															+
														
 
															+PDF处理功能已重构为4个模块，便于维护和理解：
														
 
															+
														
 
															+```
														
 
															+ocr_utils/
														
 
															+├── pdf_utils.py                    # 主入口 - 高级API和向后兼容包装
														
 
															+├── pdf_coordinate_transform.py     # 坐标转换功能
														
 
															+├── pdf_text_extraction.py          # 文本提取功能  
														
 
															+├── pdf_image_rendering.py          # 图像渲染功能
														
 
															+└── pdf_README.md                   # 本文档
														
 
															+```
														
 
															+
														
 
															+## 🔧 各模块功能
														
 
															+
														
 
															+### 1. pdf_utils.py (主入口，~400行)
														
 
															+**作用**: 高级API和统一入口，保持向后兼容性
														
 
															+
														
 
															+**主要类**: `PDFUtils`
														
 
															+
														
 
															+**核心功能**:
														
 
															+- `load_and_classify_document()`: 加载文档(PDF/图片)并分类
														
 
															+- `merge_cross_page_tables()`: 跨页表格合并(TODO)
														
 
															+- 所有子模块函数的重新导出（向后兼容）
														
 
															+
														
 
															+**使用示例**:
														
 
															+```python
														
 
															+from ocr_utils.pdf_utils import PDFUtils
														
 
															+
														
 
															+# 加载PDF并分类
														
 
															+images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
														
 
															+    document_path=Path("test.pdf"),
														
 
															+    dpi=200,
														
 
															+    page_range="1-5",
														
 
															+    renderer="fitz"
														
 
															+)
														
 
															+```
														
 
															+
														
 
															+### 2. pdf_coordinate_transform.py (~250行)
														
 
															+**作用**: PDF坐标系与图像坐标系的转换
														
 
															+
														
 
															+**核心函数**:
														
 
															+- `pdf_rotation_to_image_rotation()`: **PDF旋转角度转换为图片旋转角度**
														
 
															+- `transform_bbox_for_rotation_fitz()`: fitz引擎的完整几何坐标变换
														
 
															+- `transform_bbox_for_rotation_pypdfium2()`: pypdfium2引擎的坐标值交换
														
 
															+
														
 
															+**坐标系说明**:
														
 
															+- **PDF坐标系**: 左下角原点 (0,0)，X向右，Y向上
														
 
															+- **图像坐标系**: 左上角原点 (0,0)，X向右，Y向下
														
 
															+
														
 
															+**旋转定义（重要）**:
														
 
															+- **PDF rotation**: 0/90/180/270度（**顺时针旋转**，PDF规范）
														
 
															+- **图片rotation**: 0/90/180/270度（**逆时针旋转**，图像处理标准）
														
 
															+- **对外API统一返回图片rotation**（逆时针定义）
														
 
															+
														
 
															+**旋转转换映射**:
														
 
															+| PDF Rotation (顺时针) | 图片 Rotation (逆时针) |
														
 
															+|---------------------|---------------------|
														
 
															+| 0° | 0° |
														
 
															+| 90° | 270° |
														
 
															+| 180° | 180° |
														
 
															+| 270° | 90° |
														
 
															+
														
 
															+**关键区别**:
														
 
															+| 特性 | fitz | pypdfium2 |
														
 
															+|------|------|-----------|
														
 
															+| 输入坐标系 | PDF原始坐标系 | 已旋转的坐标系（但bbox顺序错误） |
														
 
															+| 变换类型 | 几何空间变换 | 坐标值交换（修正min/max） |
														
 
															+| 复杂度 | 高（涉及旋转公式） | 低（只是交换） |
														
 
															+
														
 
															+**使用示例**:
														
 
															+```python
														
 
															+from ocr_utils.pdf_coordinate_transform import transform_bbox_for_rotation_fitz
														
 
															+
														
 
															+# fitz引擎: 完整几何变换
														
 
															+img_bbox = transform_bbox_for_rotation_fitz(
														
 
															+    bbox=[100, 50, 200, 100],
														
 
															+    rotation=90,
														
 
															+    pdf_width=595,
														
 
															+    pdf_height=842,
														
 
															+    scale=2.778
														
 
															+)
														
 
															+```
														
 
															+
														
 
															+### 3. pdf_text_extraction.py (~450行)
														
 
															+**作用**: 从PDF提取文本，支持rotation处理
														
 
															+
														
 
															+**核心函数**:
														
 
															+- `extract_text_from_pdf()`: 从指定区域提取文本（自动检测引擎）
														
 
															+- `extract_all_text_blocks()`: 提取页面所有文本块（自动检测引擎）
														
 
															+- `detect_pdf_doc_type()`: 检测PDF文档类型(fitz/pypdfium2)
														
 
															+- `bbox_overlap()`: 检查bbox重叠
														
 
															+
														
 
															+**支持引擎**:
														
 
															+- **pypdfium2**: MinerU标准引擎
														
 
															+- **fitz (PyMuPDF)**: 轻量级替代引擎
														
 
															+
														
 
															+**使用示例**:
														
 
															+```python
														
 
															+from ocr_utils.pdf_text_extraction import extract_all_text_blocks
														
 
															+
														
 
															+# 提取所有文本块（自动应用rotation变换）
														
 
															+text_blocks, rotation = extract_all_text_blocks(
														
 
															+    pdf_doc=pdf_doc,
														
 
															+    page_idx=0,
														
 
															+    scale=2.778
														
 
															+)
														
 
															+
														
 
															+# 返回格式:
														
 
															+# text_blocks = [
														
 
															+#     {'text': 'Hello', 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]},
														
 
															+#     ...
														
 
															+# ]
														
 
															+# rotation = 270  # 图片旋转角度：0/90/180/270（逆时针）
														
 
															+#                 # 注意：返回的是图片旋转定义（逆时针），不是PDF rotation（顺时针）
														
 
															+```
														
 
															+
														
 
															+**⚠️ 重要说明**:
														
 
															+- `rotation` 返回值采用**图片旋转定义**（逆时针），不是PDF rotation（顺时针）
														
 
															+- PDF rotation 90° → 返回 270°（图片需要逆时针旋转270°）
														
 
															+- PDF rotation 270° → 返回 90°（图片需要逆时针旋转90°）
														
 
															+
														
 
															+
														
 
															+### 4. pdf_image_rendering.py (~300行)
														
 
															+**作用**: PDF页面渲染为图像
														
 
															+
														
 
															+**核心函数**:
														
 
															+- `load_images_from_pdf_unified()`: 统一的PDF图像加载接口
														
 
															+- `load_images_pypdfium2()`: 使用pypdfium2渲染
														
 
															+- `load_images_fitz()`: 使用fitz渲染
														
 
															+
														
 
															+**渲染引擎对比**:
														
 
															+| 特性 | pypdfium2 | fitz |
														
 
															+|------|-----------|------|
														
 
															+| 渲染引擎 | Chrome PDFium | MuPDF |
														
 
															+| 多进程加速 | ✅ (非Windows) | ❌ |
														
 
															+| 超时控制 | ✅ | ❌ |
														
 
															+| 尺寸限制 | 3500px | 4500px |
														
 
															+| 超限处理 | 动态调整scale | 降到72 DPI |
														
 
															+| MinerU标准 | ✅ | ❌ |
														
 
															+
														
 
															+**使用示例**:
														
 
															+```python
														
 
															+from ocr_utils.pdf_image_rendering import load_images_from_pdf_unified
														
 
															+
														
 
															+# 使用pypdfium2（推荐）
														
 
															+images, doc = load_images_from_pdf_unified(
														
 
															+    pdf_bytes=pdf_bytes,
														
 
															+    dpi=200,
														
 
															+    renderer="pypdfium2",
														
 
															+    threads=4
														
 
															+)
														
 
															+
														
 
															+# 使用fitz
														
 
															+images, doc = load_images_from_pdf_unified(
														
 
															+    pdf_bytes=pdf_bytes,
														
 
															+    dpi=200,
														
 
															+    renderer="fitz"
														
 
															+)
														
 
															+
														
 
															+# 返回格式:
														
 
															+# images = [
														
 
															+#     {'img_pil': PIL.Image, 'scale': 2.778},
														
 
															+#     ...
														
 
															+# ]
														
 
															+```
														
 
															+
														
 
															+## 🔄 向后兼容性
														
 
															+
														
 
															+**所有原有代码无需修改！**
														
 
															+
														
 
															+`PDFUtils`类保留了所有原有方法作为包装函数，内部调用新模块的功能：
														
 
															+
														
 
															+```python
														
 
															+# 旧代码继续工作
														
 
															+from ocr_utils.pdf_utils import PDFUtils
														
 
															+
														
 
															+# 所有这些方法仍然可用:
														
 
															+PDFUtils.extract_text_from_pdf(...)
														
 
															+PDFUtils.extract_all_text_blocks(...)
														
 
															+PDFUtils.load_images_from_pdf_unified(...)
														
 
															+PDFUtils._transform_bbox_for_rotation_fitz(...)
														
 
															+# ... 等等
														
 
															+```
														
 
															+
														
 
															+## 📝 最佳实践
														
 
															+
														
 
															+### 1. **新代码**: 直接导入子模块
														
 
															+```python
														
 
															+# 推荐: 直接从子模块导入
														
 
															+from ocr_utils.pdf_text_extraction import extract_all_text_blocks
														
 
															+from ocr_utils.pdf_coordinate_transform import transform_bbox_for_rotation_fitz
														
 
															+
														
 
															+text_blocks, rotation = extract_all_text_blocks(pdf_doc, 0, 2.778)
														
 
															+```
														
 
															+
														
 
															+### 2. **旧代码**: 继续使用PDFUtils
														
 
															+```python
														
 
															+# 兼容: 通过PDFUtils类使用
														
 
															+from ocr_utils.pdf_utils import PDFUtils
														
 
															+
														
 
															+text_blocks, rotation = PDFUtils.extract_all_text_blocks(pdf_doc, 0, 2.778)
														
 
															+```
														
 
															+
														
 
															+### 3. **渲染引擎选择**
														
 
															+```python
														
 
															+# 生产环境推荐: pypdfium2 (MinerU标准)
														
 
															+renderer = "pypdfium2"  # 多进程加速，更好的细节保留
														
 
															+
														
 
															+# 开发/测试: fitz (简单轻量)
														
 
															+renderer = "fitz"  # 无需额外依赖，单进程
														
 
															+```
														
 
															+
														
 
															+## 🎯 重构优势
														
 
															+
														
 
															+1. **代码组织**: 从单个984行文件 → 4个模块，每个200-450行
														
 
															+2. **职责清晰**: 坐标变换、文本提取、图像渲染各自独立
														
 
															+3. **易于测试**: 各模块可独立测试
														
 
															+4. **向后兼容**: 现有代码无需修改
														
 
															+5. **易于扩展**: 新功能可加入对应模块，不影响其他部分
														
 
															+
														
 
															+## ✅ 测试验证
														
 
															+
														
 
															+重构后通过完整测试验证：
														
 
															+- ✅ 所有8个rotation测试通过 (4种rotation × 2种引擎)
														
 
															+- ✅ fitz引擎: rotation 0°/90°/180°/270° 全部正确
														
 
															+- ✅ pypdfium2引擎: rotation 0°/90°/180°/270° 全部正确
														
 
															+- ✅ 坐标都在图像边界内
														
 
															+- ✅ 向后兼容性验证通过
														
 
															+
														
 
															+测试命令:
														
 
															+```bash
														
 
															+cd ocr_tools/universal_doc_parser/tests
														
 
															+python test_pdf_rotation.py
														
 
															+```
														
 
															+
														
 
															+## 📚 相关文档
														
 
															+
														
 
															+- [MinerU文档](https://github.com/opendatalab/MinerU)
														
 
															+- [PyMuPDF文档](https://pymupdf.readthedocs.io/)
														
 
															+- [pypdfium2文档](https://pypdfium2.readthedocs.io/)
														
 
															+
														
 
															+## 🔧 维护指南
														
 
															+
														
 
															+### 添加新功能
														
 
															+
														
 
															+根据功能类型选择合适的模块：
														
 
															+
														
 
															+1. **坐标转换相关** → `pdf_coordinate_transform.py`
														
 
															+2. **文本提取相关** → `pdf_text_extraction.py`
														
 
															+3. **图像渲染相关** → `pdf_image_rendering.py`
														
 
															+4. **高级API/工作流** → `pdf_utils.py`
														
 
															+
														
 
															+### 修改现有功能
														
 
															+
														
 
															+1. 在对应子模块中修改实现
														
 
															+2. 如果改变了函数签名，需在`pdf_utils.py`中更新包装函数
														
 
															+3. 运行测试验证: `python test_pdf_rotation.py`
														
 
															+
														
 
															+---
														
 
															+
														
 
															+**重构日期**: 2026-01-05  
														
 
															+**重构原因**: pdf_utils.py文件过大(984行)，难以维护  
														
 
															+**重构目标**: 按功能层次拆分，提高可维护性，保持向后兼容性
														
--- a/ocr_utils/pdf_coordinate_transform.py
+++ b/ocr_utils/pdf_coordinate_transform.py
@@ -0,0 +1,209 @@
 
															+"""
														
 
															+PDF坐标转换模块
														
 
															+
														
 
															+提供PDF坐标系与图像坐标系之间的转换功能。
														
 
															+
														
 
															+坐标系说明：
														
 
															+- PDF坐标系：左下角原点 (0,0)，X向右，Y向上
														
 
															+- 图像坐标系：左上角原点 (0,0)，X向右，Y向下
														
 
															+- PDF rotation：0/90/180/270度（顺时针旋转）
														
 
															+- 图片rotation：0/90/180/270度（逆时针旋转，对外统一使用此定义）
														
 
															+
														
 
															+关键函数：
														
 
															+- transform_bbox_for_rotation_fitz: fitz引擎的完整几何坐标变换
														
 
															+- transform_bbox_for_rotation_pypdfium2: pypdfium2引擎的坐标值交换
														
 
															+- pdf_rotation_to_image_rotation: PDF rotation转换为图片rotation
														
 
															+"""
														
 
															+from typing import List
														
 
															+from loguru import logger
														
 
															+
														
 
															+
														
 
															+def pdf_rotation_to_image_rotation(pdf_rotation: int) -> int:
														
 
															+    """
														
 
															+    将PDF rotation（顺时针）转换为图片rotation（逆时针）
														
 
															+    
														
 
															+    PDF的rotation表示页面顺时针旋转的角度（0/90/180/270）
														
 
															+    图片处理中通常使用逆时针旋转角度
														
 
															+    
														
 
															+    转换规则：
														
 
															+    - PDF rotation=0   → 图片rotation=0   (无旋转)
														
 
															+    - PDF rotation=90  → 图片rotation=270 (相当于逆时针270°)
														
 
															+    - PDF rotation=180 → 图片rotation=180 (180度两个方向相同)
														
 
															+    - PDF rotation=270 → 图片rotation=90  (相当于逆时针90°)
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_rotation: PDF页面rotation角度 (0/90/180/270)
														
 
															+        
														
 
															+    Returns:
														
 
															+        图片rotation角度 (0/90/180/270)，逆时针定义
														
 
															+        
														
 
															+    示例：
														
 
															+        >>> pdf_rotation_to_image_rotation(0)
														
 
															+        0
														
 
															+        >>> pdf_rotation_to_image_rotation(90)
														
 
															+        270
														
 
															+        >>> pdf_rotation_to_image_rotation(180)
														
 
															+        180
														
 
															+        >>> pdf_rotation_to_image_rotation(270)
														
 
															+        90
														
 
															+    """
														
 
															+    if pdf_rotation == 0:
														
 
															+        return 0
														
 
															+    elif pdf_rotation == 90:
														
 
															+        return 270
														
 
															+    elif pdf_rotation == 180:
														
 
															+        return 180
														
 
															+    elif pdf_rotation == 270:
														
 
															+        return 90
														
 
															+    else:
														
 
															+        logger.warning(f"Unknown PDF rotation: {pdf_rotation}, returning 0")
														
 
															+        return 0
														
 
															+
														
 
															+
														
 
															+def transform_bbox_for_rotation_fitz(
														
 
															+    bbox: List[float],
														
 
															+    rotation: int,
														
 
															+    pdf_width: float,
														
 
															+    pdf_height: float,
														
 
															+    scale: float
														
 
															+) -> List[float]:
														
 
															+    """
														
 
															+    fitz引擎的坐标转换（完整几何变换）
														
 
															+    
														
 
															+    fitz的get_text("dict")返回PDF原始坐标系（左下角原点，Y向上），
														
 
															+    需要进行完整的旋转变换 + Y轴翻转。
														
 
															+    
														
 
															+    Args:
														
 
															+        bbox: PDF原始坐标 [x1, y1, x2, y2]
														
 
															+        rotation: PDF页面rotation (0/90/180/270)
														
 
															+        pdf_width: PDF页面宽度（原始方向）
														
 
															+        pdf_height: PDF页面高度（原始方向）
														
 
															+        scale: 渲染缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        图像坐标 [x1, y1, x2, y2]，已确保 x1<x2, y1<y2
														
 
															+        
														
 
															+    变换公式：
														
 
															+        rotation=0:   (x, y) → (x, y)                    # 直接缩放
														
 
															+        rotation=90:  (x, y) → (pdf_height-y, x)         # 完整坐标变换
														
 
															+        rotation=180: (x, y) → (pdf_width-x, pdf_height-y) # 完整坐标变换
														
 
															+        rotation=270: (x, y) → (y, pdf_width-x)          # 完整坐标变换
														
 
															+    """
														
 
															+    x1, y1, x2, y2 = bbox
														
 
															+    
														
 
															+    if rotation == 0:
														
 
															+        # 直接缩放（fitz返回图像坐标系）
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y1 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y2 * scale
														
 
															+        
														
 
															+    elif rotation == 90:
														
 
															+        # 顺时针转90度
														
 
															+        new_x1 = (pdf_height - y2) * scale
														
 
															+        new_y1 = x1 * scale
														
 
															+        new_x2 = (pdf_height - y1) * scale
														
 
															+        new_y2 = x2 * scale
														
 
															+        
														
 
															+    elif rotation == 180:
														
 
															+        # 旋转180度：X和Y都翻转
														
 
															+        new_x1 = (pdf_width - x2) * scale
														
 
															+        new_y1 = (pdf_height - y2) * scale
														
 
															+        new_x2 = (pdf_width - x1) * scale
														
 
															+        new_y2 = (pdf_height - y1) * scale
														
 
															+        
														
 
															+    elif rotation == 270:
														
 
															+        # 顺时针转270度（逆时针转90度）
														
 
															+        new_x1 = y1 * scale
														
 
															+        new_y1 = (pdf_width - x2) * scale
														
 
															+        new_x2 = y2 * scale
														
 
															+        new_y2 = (pdf_width - x1) * scale
														
 
															+        
														
 
															+    else:
														
 
															+        logger.warning(f"Unknown rotation: {rotation}, using default transformation")
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y1 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y2 * scale
														
 
															+
														
 
															+    return [
														
 
															+        min(new_x1, new_x2),
														
 
															+        min(new_y1, new_y2),
														
 
															+        max(new_x1, new_x2),
														
 
															+        max(new_y1, new_y2)
														
 
															+    ]
														
 
															+
														
 
															+
														
 
															+def transform_bbox_for_rotation_pypdfium2(
														
 
															+    bbox: List[float],
														
 
															+    rotation: int,
														
 
															+    pdf_width: float,
														
 
															+    pdf_height: float,
														
 
															+    scale: float
														
 
															+) -> List[float]:
														
 
															+    """
														
 
															+    pypdfium2引擎的坐标转换（坐标值交换）
														
 
															+    
														
 
															+    pypdfium2的pdftext返回的坐标已经过部分处理（已旋转到正确位置），
														
 
															+    但bbox的(x1,y1)和(x2,y2)的大小关系可能出错，只需交换坐标值即可。
														
 
															+    
														
 
															+    Args:
														
 
															+        bbox: 已旋转的坐标 [x1, y1, x2, y2]（但顺序可能错误）
														
 
															+        rotation: PDF页面rotation (0/90/180/270)
														
 
															+        pdf_width: PDF页面宽度（原始方向，本函数中未使用）
														
 
															+        pdf_height: PDF页面高度（原始方向，本函数中未使用）
														
 
															+        scale: 渲染缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        图像坐标 [x1, y1, x2, y2]，已确保 x1<x2, y1<y2
														
 
															+        
														
 
															+    变换规则：
														
 
															+        rotation=0:   (x1,y1,x2,y2) → (x1,y1,x2,y2)     # 不变
														
 
															+        rotation=90:  (x1,y1,x2,y2) → (x1,y2,x2,y1)     # y坐标交换
														
 
															+        rotation=180: (x1,y1,x2,y2) → (x1,y2,x2,y1)     # y坐标交换
														
 
															+        rotation=270: (x1,y1,x2,y2) → (x2,y1,x1,y2)     # x坐标交换
														
 
															+    """
														
 
															+    x1, y1, x2, y2 = bbox
														
 
															+    
														
 
															+    if rotation == 0:
														
 
															+        # rotation=0时，直接缩放
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y1 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y2 * scale
														
 
															+        
														
 
															+    elif rotation == 90:
														
 
															+        # 顺时针转90度：交换y坐标
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y2 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y1 * scale
														
 
															+
														
 
															+    elif rotation == 180:
														
 
															+        # 旋转180度：交换y坐标
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y2 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y1 * scale
														
 
															+        
														
 
															+    elif rotation == 270:
														
 
															+        # 顺时针转270度：交换x坐标
														
 
															+        new_x1 = x2 * scale
														
 
															+        new_y1 = y1 * scale
														
 
															+        new_x2 = x1 * scale
														
 
															+        new_y2 = y2 * scale
														
 
															+        
														
 
															+    else:
														
 
															+        logger.warning(f"Unknown rotation: {rotation}, using default transformation")
														
 
															+        new_x1 = x1 * scale
														
 
															+        new_y1 = y1 * scale
														
 
															+        new_x2 = x2 * scale
														
 
															+        new_y2 = y2 = y2 * scale
														
 
															+
														
 
															+    return [
														
 
															+        min(new_x1, new_x2),
														
 
															+        min(new_y1, new_y2),
														
 
															+        max(new_x1, new_x2),
														
 
															+        max(new_y1, new_y2)
														
 
															+    ]
														
 
															+
														
--- a/ocr_utils/pdf_image_rendering.py
+++ b/ocr_utils/pdf_image_rendering.py
@@ -0,0 +1,271 @@
 
															+"""
														
 
															+PDF图像渲染模块
														
 
															+
														
 
															+提供PDF页面渲染为图像的功能，支持多种渲染引擎：
														
 
															+- pypdfium2: MinerU标准引擎（Chrome PDFium），支持多进程加速
														
 
															+- fitz (PyMuPDF): 轻量级替代引擎（MuPDF）
														
 
															+
														
 
															+主要功能：
														
 
															+- 统一的PDF图像加载接口
														
 
															+- 多渲染引擎支持
														
 
															+- 自动处理PDF rotation
														
 
															+- DPI缩放和尺寸限制处理
														
 
															+"""
														
 
															+from typing import List, Dict, Any, Optional, Tuple
														
 
															+from PIL import Image
														
 
															+from loguru import logger
														
 
															+
														
 
															+
														
 
															+def load_images_from_pdf_unified(
														
 
															+    pdf_bytes: bytes,
														
 
															+    dpi: int = 200,
														
 
															+    start_page_id: int = 0,
														
 
															+    end_page_id: Optional[int] = None,
														
 
															+    image_type: str = "PIL",
														
 
															+    renderer: str = "pypdfium2",
														
 
															+    timeout: Optional[int] = None,
														
 
															+    threads: int = 4,
														
 
															+) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+    """
														
 
															+    从 PDF 加载图像，支持两种渲染引擎
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_bytes: PDF 文件的字节数据
														
 
															+        dpi: 渲染 DPI，默认 200
														
 
															+        start_page_id: 起始页码（0-based），默认 0
														
 
															+        end_page_id: 结束页码（0-based，包含），默认 None（处理到最后）
														
 
															+        image_type: 返回图像类型，"PIL" 或 "BASE64"
														
 
															+        renderer: 渲染引擎选择
														
 
															+            - "pypdfium2": 使用 MinerU 标准的 pypdfium2（推荐）
														
 
															+            * 优势: Chrome PDFium 引擎，多进程加速，更好的细节保留
														
 
															+            * 尺寸限制: 3500px，超过则动态调整 scale
														
 
															+            - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
														
 
															+            * 优势: MuPDF 引擎，简单直接，无需额外依赖
														
 
															+            * 尺寸限制: 4500px，超过则降到 72 DPI
														
 
															+        timeout: 超时时间（秒），仅 pypdfium2 支持
														
 
															+        threads: 进程数，仅 pypdfium2 支持多进程加速（Windows 下自动禁用）
														
 
															+        
														
 
															+    Returns:
														
 
															+        (images_list, pdf_doc)
														
 
															+        - images_list: 图像列表，每个元素为 {'img_pil': PIL.Image, 'scale': float}
														
 
															+                    或 {'img_base64': str, 'scale': float}（取决于 image_type）
														
 
															+        - pdf_doc: PDF 文档对象（pypdfium2.PdfDocument 或 fitz.Document）
														
 
															+        
														
 
															+    Raises:
														
 
															+        ImportError: 如果选择的渲染引擎不可用
														
 
															+        ValueError: 如果参数无效
														
 
															+        TimeoutError: 如果转换超时（仅 pypdfium2）
														
 
															+    
														
 
															+    渲染引擎对比:
														
 
															+        ┌─────────────┬──────────────┬──────────────┐
														
 
															+        │   特性      │  pypdfium2   │    fitz      │
														
 
															+        ├─────────────┼──────────────┼──────────────┤
														
 
															+        │ 渲染引擎    │ Chrome PDFium│ MuPDF        │
														
 
															+        │ 多进程加速  │ ✅ (非Windows)│ ❌           │
														
 
															+        │ 超时控制    │ ✅           │ ❌           │
														
 
															+        │ 尺寸限制    │ 3500px       │ 4500px       │
														
 
															+        │ 超限处理    │ 动态调整scale│ 降到72 DPI   │
														
 
															+        │ 细节保留    │ 更好         │ 良好         │
														
 
															+        │ MinerU标准  │ ✅           │ ❌           │
														
 
															+        └─────────────┴──────────────┴──────────────┘
														
 
															+    
														
 
															+    示例:
														
 
															+        # 使用 pypdfium2（推荐，MinerU 标准）
														
 
															+        images, doc = load_images_from_pdf_unified(
														
 
															+            pdf_bytes, 
														
 
															+            dpi=200, 
														
 
															+            renderer="pypdfium2",
														
 
															+            threads=4
														
 
															+        )
														
 
															+        
														
 
															+        # 使用 PyMuPDF (fitz)
														
 
															+        images, doc = load_images_from_pdf_unified(
														
 
															+            pdf_bytes, 
														
 
															+            dpi=200, 
														
 
															+            renderer="fitz"
														
 
															+        )
														
 
															+        
														
 
															+        # 访问图像
														
 
															+        for img_dict in images:
														
 
															+            pil_image = img_dict['img_pil']
														
 
															+            scale = img_dict['scale']
														
 
															+            # 处理图像...
														
 
															+    
														
 
															+    注意事项:
														
 
															+        1. pypdfium2 在生产环境中更推荐，因为它是 MinerU 的标准实现
														
 
															+        2. 两种渲染引擎可能产生略有不同的图像（SSIM ≈ 0.945）
														
 
															+        3. 建议在同一项目中保持使用同一渲染引擎，避免不一致
														
 
															+        4. 如果需要与现有测试图像对比，使用相同的渲染引擎
														
 
															+        5. 渲染时会自动应用PDF页面的rotation属性（0/90/180/270）
														
 
															+    """
														
 
															+    renderer = renderer.lower()
														
 
															+    
														
 
															+    if renderer in ["pypdfium2", "pdfium"]:
														
 
															+        return load_images_pypdfium2(
														
 
															+            pdf_bytes, dpi, start_page_id, end_page_id, 
														
 
															+            image_type, timeout, threads
														
 
															+        )
														
 
															+    elif renderer in ["fitz", "pymupdf", "mupdf"]:
														
 
															+        return load_images_fitz(
														
 
															+            pdf_bytes, dpi, start_page_id, end_page_id, image_type
														
 
															+        )
														
 
															+    else:
														
 
															+        raise ValueError(
														
 
															+            f"不支持的渲染引擎: {renderer}. "
														
 
															+            f"请使用 'pypdfium2' 或 'fitz'"
														
 
															+        )
														
 
															+
														
 
															+
														
 
															+def load_images_pypdfium2(
														
 
															+    pdf_bytes: bytes,
														
 
															+    dpi: int,
														
 
															+    start_page_id: int,
														
 
															+    end_page_id: Optional[int],
														
 
															+    image_type: str,
														
 
															+    timeout: Optional[int],
														
 
															+    threads: int
														
 
															+) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+    """
														
 
															+    使用 pypdfium2 渲染引擎（MinerU 标准）
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_bytes: PDF字节数据
														
 
															+        dpi: 渲染DPI
														
 
															+        start_page_id: 起始页码
														
 
															+        end_page_id: 结束页码
														
 
															+        image_type: 图像类型 ("PIL" 或 "BASE64")
														
 
															+        timeout: 超时时间（秒）
														
 
															+        threads: 进程数
														
 
															+        
														
 
															+    Returns:
														
 
															+        (images_list, pdf_doc)
														
 
															+        
														
 
															+    Raises:
														
 
															+        ImportError: 如果 MinerU 未安装
														
 
															+    """
														
 
															+    try:
														
 
															+        import pypdfium2 as pdfium
														
 
															+        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
														
 
															+        from mineru.utils.enum_class import ImageType
														
 
															+    except ImportError as e:
														
 
															+        raise ImportError(
														
 
															+            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
														
 
															+            f"原始错误: {e}"
														
 
															+        )
														
 
															+    
														
 
															+    # 转换 image_type
														
 
															+    img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
														
 
															+    
														
 
															+    # 使用 MinerU 的实现
														
 
															+    # 渲染时自动应用rotation
														
 
															+    images_list, pdf_doc = mineru_load_images(
														
 
															+        pdf_bytes=pdf_bytes,
														
 
															+        dpi=dpi,
														
 
															+        start_page_id=start_page_id,
														
 
															+        end_page_id=end_page_id,
														
 
															+        image_type=img_type,
														
 
															+        timeout=timeout,
														
 
															+        threads=threads
														
 
															+    )
														
 
															+    
														
 
															+    logger.info(
														
 
															+        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
														
 
															+        f"(DPI={dpi}, 多进程={threads})"
														
 
															+    )
														
 
															+    
														
 
															+    return images_list, pdf_doc
														
 
															+
														
 
															+
														
 
															+def load_images_fitz(
														
 
															+    pdf_bytes: bytes,
														
 
															+    dpi: int,
														
 
															+    start_page_id: int,
														
 
															+    end_page_id: Optional[int],
														
 
															+    image_type: str
														
 
															+) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+    """
														
 
															+    使用 PyMuPDF (fitz) 渲染引擎
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_bytes: PDF字节数据
														
 
															+        dpi: 渲染DPI
														
 
															+        start_page_id: 起始页码
														
 
															+        end_page_id: 结束页码
														
 
															+        image_type: 图像类型 ("PIL" 或 "BASE64")
														
 
															+        
														
 
															+    Returns:
														
 
															+        (images_list, pdf_doc)
														
 
															+        
														
 
															+    Raises:
														
 
															+        ImportError: 如果 PyMuPDF 未安装
														
 
															+    """
														
 
															+    try:
														
 
															+        import fitz
														
 
															+    except ImportError as e:
														
 
															+        raise ImportError(
														
 
															+            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
														
 
															+            f"原始错误: {e}"
														
 
															+        )
														
 
															+    
														
 
															+    from io import BytesIO
														
 
															+    import base64
														
 
															+    
														
 
															+    # 打开 PDF
														
 
															+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
														
 
															+    pdf_page_num = doc.page_count
														
 
															+    
														
 
															+    # 处理 end_page_id
														
 
															+    if end_page_id is None or end_page_id < 0:
														
 
															+        end_page_id = pdf_page_num - 1
														
 
															+    end_page_id = min(end_page_id, pdf_page_num - 1)
														
 
															+    
														
 
															+    # 渲染图像
														
 
															+    images_list = []
														
 
															+    mat = fitz.Matrix(dpi / 72, dpi / 72)
														
 
															+    
														
 
															+    for index in range(start_page_id, end_page_id + 1):
														
 
															+        page = doc[index]
														
 
															+        
														
 
															+        # 渲染为 pixmap
														
 
															+        # get_pixmap 自动应用 page.rotation
														
 
															+        pm = page.get_pixmap(matrix=mat, alpha=False)  # type: ignore
														
 
															+        
														
 
															+        # 如果超过尺寸限制，降低到 72 DPI
														
 
															+        if pm.width > 4500 or pm.height > 4500:
														
 
															+            logger.warning(
														
 
															+                f"⚠️  页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
														
 
															+                f"降低到 72 DPI"
														
 
															+            )
														
 
															+            mat_fallback = fitz.Matrix(1, 1)  # 72 DPI
														
 
															+            pm = page.get_pixmap(matrix=mat_fallback, alpha=False)  # type: ignore
														
 
															+        
														
 
															+        # 转换为 PIL Image
														
 
															+        pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
														
 
															+        
														
 
															+        # 计算实际 scale
														
 
															+        page_rect = page.rect
														
 
															+        actual_scale = pm.width / page_rect.width
														
 
															+        
														
 
															+        # 构建返回字典
														
 
															+        image_dict = {
														
 
															+            'img_pil': pil_img,
														
 
															+            'scale': actual_scale
														
 
															+        }
														
 
															+        
														
 
															+        # 如果需要 BASE64
														
 
															+        if image_type.upper() == "BASE64":
														
 
															+            buffer = BytesIO()
														
 
															+            pil_img.save(buffer, format="JPEG")
														
 
															+            img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
														
 
															+            image_dict['img_base64'] = img_base64
														
 
															+            # 移除 img_pil 以节省内存
														
 
															+            del image_dict['img_pil']
														
 
															+        
														
 
															+        images_list.append(image_dict)
														
 
															+    
														
 
															+    logger.info(
														
 
															+        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
														
 
															+        f"(DPI={dpi}, 单进程)"
														
 
															+    )
														
 
															+    
														
 
															+    return images_list, doc
														
--- a/ocr_utils/pdf_text_extraction.py
+++ b/ocr_utils/pdf_text_extraction.py
@@ -0,0 +1,413 @@
 
															+"""
														
 
															+PDF文本提取模块
														
 
															+
														
 
															+提供从PDF文档中提取文本的功能，支持多种PDF引擎：
														
 
															+- pypdfium2: MinerU标准引擎
														
 
															+- fitz (PyMuPDF): 轻量级替代引擎
														
 
															+
														
 
															+主要功能：
														
 
															+- 区域文本提取：从指定bbox区域提取文本
														
 
															+- 全页文本提取：提取页面所有文本块及其坐标
														
 
															+- 自动rotation处理：自动应用PDF页面旋转变换
														
 
															+- 返回图片rotation（逆时针定义）：对外统一使用图片处理标准
														
 
															+"""
														
 
															+from typing import Dict, List, Any, Tuple
														
 
															+from loguru import logger
														
 
															+
														
 
															+# 导入坐标转换函数
														
 
															+from .pdf_coordinate_transform import (
														
 
															+    transform_bbox_for_rotation_fitz,
														
 
															+    transform_bbox_for_rotation_pypdfium2,
														
 
															+    pdf_rotation_to_image_rotation
														
 
															+)
														
 
															+
														
 
															+# 导入 MinerU 组件
														
 
															+try:
														
 
															+    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
														
 
															+    MINERU_AVAILABLE = True
														
 
															+except ImportError:
														
 
															+    pdf_get_page_text = None
														
 
															+    MINERU_AVAILABLE = False
														
 
															+
														
 
															+
														
 
															+def detect_pdf_doc_type(pdf_doc: Any) -> str:
														
 
															+    """
														
 
															+    检测 PDF 文档对象类型
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: PDF 文档对象
														
 
															+        
														
 
															+    Returns:
														
 
															+        'pypdfium2' 或 'fitz'
														
 
															+    """
														
 
															+    doc_type_name = type(pdf_doc).__name__
														
 
															+    doc_module = type(pdf_doc).__module__
														
 
															+    
														
 
															+    if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
														
 
															+        return 'pypdfium2'
														
 
															+    elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
														
 
															+        return 'fitz'
														
 
															+    else:
														
 
															+        # 尝试通过属性判断
														
 
															+        if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
														
 
															+            return 'fitz'
														
 
															+        else:
														
 
															+            return 'pypdfium2'
														
 
															+
														
 
															+
														
 
															+def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
														
 
															+    """
														
 
															+    检查两个 bbox 是否重叠
														
 
															+    
														
 
															+    Args:
														
 
															+        bbox1: 第一个bbox [x1, y1, x2, y2]
														
 
															+        bbox2: 第二个bbox [x1, y1, x2, y2]
														
 
															+        
														
 
															+    Returns:
														
 
															+        True 如果重叠，否则 False
														
 
															+    """
														
 
															+    if len(bbox1) < 4 or len(bbox2) < 4:
														
 
															+        return False
														
 
															+    
														
 
															+    x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
														
 
															+    x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
														
 
															+    
														
 
															+    # 检查是否不重叠（取反）
														
 
															+    if x2_1 < x1_2 or x2_2 < x1_1:
														
 
															+        return False
														
 
															+    if y2_1 < y1_2 or y2_2 < y1_1:
														
 
															+        return False
														
 
															+    
														
 
															+    return True
														
 
															+
														
 
															+
														
 
															+# ============================================================================
														
 
															+# 区域文本提取
														
 
															+# ============================================================================
														
 
															+
														
 
															+def extract_text_from_pdf(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    bbox: List[float],
														
 
															+    scale: float
														
 
															+) -> Tuple[str, bool]:
														
 
															+    """
														
 
															+    从PDF指定区域提取文本（支持 pypdfium2 和 fitz）
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
														
 
															+        page_idx: 页码索引（0-based）
														
 
															+        bbox: 目标区域的bbox（图像坐标）[x1, y1, x2, y2]
														
 
															+        scale: 图像与PDF的缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text, success)
														
 
															+        - text: 提取的文本
														
 
															+        - success: 是否成功提取到文本
														
 
															+    """
														
 
															+    doc_type = detect_pdf_doc_type(pdf_doc)
														
 
															+    
														
 
															+    if doc_type == 'fitz':
														
 
															+        return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
														
 
															+    else:  # pypdfium2
														
 
															+        return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
														
 
															+
														
 
															+
														
 
															+def extract_text_from_pdf_pypdfium2(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    bbox: List[float],
														
 
															+    scale: float
														
 
															+) -> Tuple[str, bool]:
														
 
															+    """
														
 
															+    使用 pypdfium2 从指定区域提取文本
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: pypdfium2.PdfDocument 对象
														
 
															+        page_idx: 页码索引
														
 
															+        bbox: 目标区域的bbox（图像坐标）
														
 
															+        scale: 缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text, success)
														
 
															+    """
														
 
															+    if not MINERU_AVAILABLE or pdf_get_page_text is None:
														
 
															+        logger.error("MinerU pdf_text_tool not available")
														
 
															+        return "", False
														
 
															+        
														
 
															+    try:
														
 
															+        page = pdf_doc[page_idx]
														
 
															+        
														
 
															+        # 将图像坐标转换为PDF坐标
														
 
															+        pdf_bbox = [
														
 
															+            bbox[0] / scale,
														
 
															+            bbox[1] / scale,
														
 
															+            bbox[2] / scale,
														
 
															+            bbox[3] / scale
														
 
															+        ]
														
 
															+        
														
 
															+        # 使用 MinerU 的方式获取页面文本信息
														
 
															+        page_dict = pdf_get_page_text(page)
														
 
															+        
														
 
															+        # 从 blocks 中提取与 bbox 重叠的文本
														
 
															+        text_parts = []
														
 
															+        for block in page_dict.get('blocks', []):
														
 
															+            for line in block.get('lines', []):
														
 
															+                line_bbox = line.get('bbox')
														
 
															+                if line_bbox and hasattr(line_bbox, 'bbox'):
														
 
															+                    line_bbox = line_bbox.bbox
														
 
															+                elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
														
 
															+                    line_bbox = list(line_bbox)
														
 
															+                else:
														
 
															+                    continue
														
 
															+                
														
 
															+                if bbox_overlap(pdf_bbox, line_bbox):
														
 
															+                    for span in line.get('spans', []):
														
 
															+                        span_text = span.get('text', '')
														
 
															+                        if span_text:
														
 
															+                            text_parts.append(span_text)
														
 
															+        
														
 
															+        text = ' '.join(text_parts)
														
 
															+        return text.strip(), bool(text.strip())
														
 
															+        
														
 
															+    except Exception as e:
														
 
															+        import traceback
														
 
															+        logger.debug(f"pypdfium2 text extraction error: {e}")
														
 
															+        logger.debug(traceback.format_exc())
														
 
															+        return "", False
														
 
															+
														
 
															+
														
 
															+def extract_text_from_pdf_fitz(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    bbox: List[float],
														
 
															+    scale: float
														
 
															+) -> Tuple[str, bool]:
														
 
															+    """
														
 
															+    使用 fitz 从指定区域提取文本
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: fitz.Document 对象
														
 
															+        page_idx: 页码索引
														
 
															+        bbox: 目标区域的bbox（图像坐标）
														
 
															+        scale: 缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text, success)
														
 
															+    """
														
 
															+    try:
														
 
															+        import fitz
														
 
															+    except ImportError:
														
 
															+        logger.error("PyMuPDF (fitz) not available")
														
 
															+        return "", False
														
 
															+    
														
 
															+    try:
														
 
															+        page = pdf_doc[page_idx]
														
 
															+        
														
 
															+        # 将图像坐标转换为PDF坐标
														
 
															+        pdf_bbox = fitz.Rect(
														
 
															+            bbox[0] / scale,
														
 
															+            bbox[1] / scale,
														
 
															+            bbox[2] / scale,
														
 
															+            bbox[3] / scale
														
 
															+        )
														
 
															+        
														
 
															+        # 提取区域内的文本
														
 
															+        text = page.get_text("text", clip=pdf_bbox)
														
 
															+        
														
 
															+        return text.strip(), bool(text.strip())
														
 
															+        
														
 
															+    except Exception as e:
														
 
															+        import traceback
														
 
															+        logger.debug(f"fitz text extraction error: {e}")
														
 
															+        logger.debug(traceback.format_exc())
														
 
															+        return "", False
														
 
															+
														
 
															+
														
 
															+# ============================================================================
														
 
															+# 全页文本提取
														
 
															+# ============================================================================
														
 
															+
														
 
															+def extract_all_text_blocks(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    scale: float
														
 
															+) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+    """
														
 
															+    提取页面所有文本块（支持 pypdfium2 和 fitz）+ PDF rotation处理
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: PDF文档对象
														
 
															+        page_idx: 页码索引（0-based）
														
 
															+        scale: 缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text_blocks, rotation_angle)
														
 
															+        - text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...]
														
 
															+                      bbox坐标已转换为渲染图像坐标系（与OCR坐标系一致）
														
 
															+        - rotation_angle: 图片旋转角度(0/90/180/270)，逆时针定义
														
 
															+    """
														
 
															+    doc_type = detect_pdf_doc_type(pdf_doc)
														
 
															+    
														
 
															+    if doc_type == 'fitz':
														
 
															+        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
														
 
															+    else:
														
 
															+        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
														
 
															+
														
 
															+
														
 
															+def extract_all_text_blocks_pypdfium2(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    scale: float
														
 
															+) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+    """
														
 
															+    使用 pypdfium2 提取所有文本块并处理rotation
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: pypdfium2.PdfDocument 对象
														
 
															+        page_idx: 页码索引
														
 
															+        scale: 缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text_blocks, rotation_angle)
														
 
															+    """
														
 
															+    if not MINERU_AVAILABLE or pdf_get_page_text is None:
														
 
															+        return [], 0
														
 
															+        
														
 
															+    try:
														
 
															+        page = pdf_doc[page_idx]
														
 
															+        page_dict = pdf_get_page_text(page)
														
 
															+        
														
 
															+        # 获取页面尺寸和rotation
														
 
															+        rotation = page_dict.get('rotation', 0)
														
 
															+        pdf_width = page_dict.get('width', 0)
														
 
															+        pdf_height = page_dict.get('height', 0)
														
 
															+        
														
 
															+        if rotation != 0:
														
 
															+            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")
														
 
															+        
														
 
															+        extracted_blocks = []
														
 
															+        
														
 
															+        for block in page_dict.get('blocks', []):
														
 
															+            for line in block.get('lines', []):
														
 
															+                line_text = ""
														
 
															+                for span in line.get('spans', []):
														
 
															+                    line_text += span.get('text', "")
														
 
															+                
														
 
															+                if not line_text.strip():
														
 
															+                    continue
														
 
															+                    
														
 
															+                line_bbox = line.get('bbox')
														
 
															+                if line_bbox and hasattr(line_bbox, 'bbox'):
														
 
															+                    line_bbox = line_bbox.bbox
														
 
															+                elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
														
 
															+                    line_bbox = list(line_bbox)
														
 
															+                else:
														
 
															+                    continue
														
 
															+                
														
 
															+                # 应用rotation坐标转换
														
 
															+                img_bbox = transform_bbox_for_rotation_pypdfium2(
														
 
															+                    line_bbox, rotation, pdf_width, pdf_height, scale
														
 
															+                )
														
 
															+                
														
 
															+                extracted_blocks.append({
														
 
															+                    'text': line_text,
														
 
															+                    'bbox': img_bbox,
														
 
															+                    'origin_bbox': line_bbox
														
 
															+                })
														
 
															+        
														
 
															+        # 转换为图片rotation（逆时针定义）
														
 
															+        image_rotation = pdf_rotation_to_image_rotation(rotation)
														
 
															+        return extracted_blocks, image_rotation
														
 
															+        
														
 
															+    except Exception as e:
														
 
															+        logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
														
 
															+        import traceback
														
 
															+        logger.debug(traceback.format_exc())
														
 
															+        return [], 0
														
 
															+
														
 
															+
														
 
															+def extract_all_text_blocks_fitz(
														
 
															+    pdf_doc: Any,
														
 
															+    page_idx: int,
														
 
															+    scale: float
														
 
															+) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+    """
														
 
															+    使用 fitz 提取所有文本块并处理rotation
														
 
															+    
														
 
															+    Args:
														
 
															+        pdf_doc: fitz.Document 对象
														
 
															+        page_idx: 页码索引
														
 
															+        scale: 缩放比例
														
 
															+        
														
 
															+    Returns:
														
 
															+        (text_blocks, rotation_angle)
														
 
															+    """
														
 
															+    try:
														
 
															+        import fitz
														
 
															+    except ImportError:
														
 
															+        logger.warning("PyMuPDF (fitz) not available")
														
 
															+        return [], 0
														
 
															+    
														
 
															+    try:
														
 
															+        page = pdf_doc[page_idx]
														
 
															+        
														
 
															+        # 获取页面rotation
														
 
															+        rotation = page.rotation  # 0, 90, 180, 270
														
 
															+        
														
 
															+        # 获取页面尺寸（原始方向，未旋转）
														
 
															+        # page.rect 是旋转后的尺寸，我们需要原始尺寸
														
 
															+        if rotation in [90, 270]:
														
 
															+            # 宽高互换回来
														
 
															+            pdf_width = page.rect.height
														
 
															+            pdf_height = page.rect.width
														
 
															+        else:
														
 
															+            pdf_width = page.rect.width
														
 
															+            pdf_height = page.rect.height
														
 
															+        
														
 
															+        if rotation != 0:
														
 
															+            logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")
														
 
															+        
														
 
															+        # 使用 get_text("dict") 获取详细的文本信息
														
 
															+        text_dict = page.get_text("dict")
														
 
															+        
														
 
															+        extracted_blocks = []
														
 
															+        
														
 
															+        # 遍历所有 blocks
														
 
															+        for block in text_dict.get("blocks", []):
														
 
															+            # 只处理文本块（type=0）
														
 
															+            if block.get("type") != 0:
														
 
															+                continue
														
 
															+            
														
 
															+            # 遍历所有 lines
														
 
															+            for line in block.get("lines", []):
														
 
															+                line_text = ""
														
 
															+                line_bbox = line.get("bbox")
														
 
															+                
														
 
															+                # 提取 line 中的所有 span 文本
														
 
															+                for span in line.get("spans", []):
														
 
															+                    line_text += span.get("text", "")
														
 
															+                
														
 
															+                if not line_text.strip() or not line_bbox:
														
 
															+                    continue
														
 
															+                
														
 
															+                # 应用rotation坐标转换
														
 
															+                img_bbox = transform_bbox_for_rotation_fitz(
														
 
															+                    list(line_bbox), rotation, pdf_width, pdf_height, scale
														
 
															+                )
														
 
															+                
														
 
															+                extracted_blocks.append({
														
 
															+                    'text': line_text,
														
 
															+                    'bbox': img_bbox,
														
 
															+                    'origin_bbox': list(line_bbox)
														
 
															+                })
														
 
															+        
														
 
															+        # 转换为图片rotation（逆时针定义）
														
 
															+        image_rotation = pdf_rotation_to_image_rotation(rotation)
														
 
															+        return extracted_blocks, image_rotation
														
 
															+        
														
 
															+    except Exception as e:
														
 
															+        logger.warning(f"fitz extract_all_text_blocks failed: {e}")
														
 
															+        import traceback
														
 
															+        logger.debug(traceback.format_exc())
														
 
															+        return [], 0
														
--- a/ocr_utils/pdf_utils.py
+++ b/ocr_utils/pdf_utils.py
@@ -1,33 +1,76 @@
 
															 """
														
 
															-PDF处理工具模块
														
 
															+PDF处理工具模块（重构版）
														
 
															-提供PDF相关处理功能：
														
 
															+提供PDF相关处理功能的统一入口：
														
 
															 - PDF加载与分类
														
 
															-- PDF文本提取
														
 
															+- PDF文本提取（支持 pypdfium2 和 fitz）
														
 
															+- PDF图像渲染（支持多种引擎）
														
 
															+- 坐标转换（PDF坐标 ↔ 图像坐标）
														
 
															 - 跨页表格合并
														
 
															 - 页面范围解析与过滤
														
 
															+
														
 
															+本模块已重构为多个子模块：
														
 
															+- pdf_coordinate_transform: 坐标转换功能
														
 
															+- pdf_text_extraction: 文本提取功能
														
 
															+- pdf_image_rendering: 图像渲染功能
														
 
															+- pdf_utils: 高级API和统一入口（本文件）
														
 
															+
														
 
															+为保持向后兼容性，所有原有函数都从新模块重新导出。
														
 
															 """
														
 
															 from typing import Dict, List, Any, Optional, Tuple, Set
														
 
															 from pathlib import Path
														
 
															 from PIL import Image
														
 
															 from loguru import logger
														
 
															-import re
														
 
															 # 导入页面范围解析函数（不依赖 MinerU）
														
 
															 from .file_utils import parse_page_range
														
 
															+# 从子模块导入功能
														
 
															+from .pdf_coordinate_transform import (
														
 
															+    transform_bbox_for_rotation_fitz,
														
 
															+    transform_bbox_for_rotation_pypdfium2,
														
 
															+    pdf_rotation_to_image_rotation,
														
 
															+)
														
 
															+
														
 
															+from .pdf_text_extraction import (
														
 
															+    detect_pdf_doc_type,
														
 
															+    bbox_overlap,
														
 
															+    extract_text_from_pdf,
														
 
															+    extract_text_from_pdf_pypdfium2,
														
 
															+    extract_text_from_pdf_fitz,
														
 
															+    extract_all_text_blocks,
														
 
															+    extract_all_text_blocks_pypdfium2,
														
 
															+    extract_all_text_blocks_fitz,
														
 
															+)
														
 
															+
														
 
															+from .pdf_image_rendering import (
														
 
															+    load_images_from_pdf_unified,
														
 
															+    load_images_pypdfium2,
														
 
															+    load_images_fitz,
														
 
															+)
														
 
															+
														
 
															 # 导入 MinerU 组件
														
 
															 try:
														
 
															     from mineru.utils.pdf_classify import classify as pdf_classify
														
 
															-    from mineru.utils.pdf_image_tools import load_images_from_pdf
														
 
															     from mineru.utils.enum_class import ImageType
														
 
															-    from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
														
 
															     MINERU_AVAILABLE = True
														
 
															 except ImportError:
														
 
															     raise ImportError("MinerU components not available for PDF processing")
														
 
															+
														
 
															+
														
 
															 class PDFUtils:
														
 
															-    """PDF处理工具类"""
														
 
															+    """
														
 
															+    PDF处理工具类（重构版）
														
 
															+    
														
 
															+    本类提供PDF处理的高级API，内部调用已重构的子模块功能。
														
 
															+    保持原有接口不变，确保向后兼容性。
														
 
															+    
														
 
															+    子模块：
														
 
															+    - pdf_coordinate_transform: 坐标转换
														
 
															+    - pdf_text_extraction: 文本提取
														
 
															+    - pdf_image_rendering: 图像渲染
														
 
															+    """
														
 
															     @staticmethod
														
 
															     def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
														
@@ -56,7 +99,7 @@ class PDFUtils:
 
															     @staticmethod
														
 
															     def _detect_pdf_doc_type(pdf_doc: Any) -> str:
														
 
															         """
														
 
															-        检测 PDF 文档对象类型
														
 
															+        检测 PDF 文档对象类型（向后兼容包装）
														
 
															         Args:
														
 
															             pdf_doc: PDF 文档对象
														
@@ -64,28 +107,14 @@ class PDFUtils:
 
															         Returns:
														
 
															             'pypdfium2' 或 'fitz'
														
 
															         """
														
 
															-        doc_type_name = type(pdf_doc).__name__
														
 
															-        doc_module = type(pdf_doc).__module__
														
 
															-        
														
 
															-        if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
														
 
															-            return 'pypdfium2'
														
 
															-        elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
														
 
															-            return 'fitz'
														
 
															-        else:
														
 
															-            # 尝试通过属性判断
														
 
															-            if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
														
 
															-                # fitz.Document 有 page_count 属性
														
 
															-                return 'fitz'
														
 
															-            else:
														
 
															-                # pypdfium2 通过索引访问
														
 
															-                return 'pypdfium2'
														
 
															+        return detect_pdf_doc_type(pdf_doc)
														
 
															     @staticmethod
														
 
															     def load_and_classify_document(
														
 
															         document_path: Path,
														
 
															         dpi: int = 200,
														
 
															         page_range: Optional[str] = None,
														
 
															-        renderer: str = "fitz"  # 新增参数，默认 fitz
														
 
															+        renderer: str = "fitz"
														
 
															     ) -> Tuple[List[Dict], str, Optional[Any], str]:
														
 
															         """
														
 
															         加载文档并分类，支持页面范围过滤
														
@@ -99,10 +128,10 @@ class PDFUtils:
 
															             renderer: PDF渲染引擎，"fitz" 或 "pypdfium2"
														
 
															         Returns:
														
 
															-            (images_list, pdf_type, pdf_doc)
														
 
															+            (images_list, pdf_type, pdf_doc, renderer_used)
														
 
															             - images_list: 图像列表，每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
														
 
															             - pdf_type: 'ocr' 或 'txt'
														
 
															-            - pdf_doc: PDF文档对象（如果PDF）
														
 
															+            - pdf_doc: PDF文档对象（如果是PDF）
														
 
															             - renderer_used: 实际使用的渲染器类型
														
 
															         """
														
 
															         pdf_doc = None
														
@@ -135,11 +164,11 @@ class PDFUtils:
 
															                     'img_pil': img,
														
 
															                     'scale': 1.0,
														
 
															                     'source_path': str(img_file),
														
 
															-                    'page_idx': idx,  # 原始索引
														
 
															-                    'page_name': img_file.stem  # 文件名（不含扩展名）
														
 
															+                    'page_idx': idx,
														
 
															+                    'page_name': img_file.stem
														
 
															                 })
														
 
															-            pdf_type = 'ocr'  # 图片目录始终使用OCR模式
														
 
															+            pdf_type = 'ocr'
														
 
															         elif document_path.suffix.lower() == '.pdf':
														
 
															             # 处理PDF文件
														
@@ -153,12 +182,12 @@ class PDFUtils:
 
															             pdf_type = pdf_classify(pdf_bytes)
														
 
															             logger.info(f"📋 PDF classified as: {pdf_type}")
														
 
															-            # 加载图像
														
 
															+            # 加载图像（使用重构后的函数）
														
 
															             images_list, pdf_doc = load_images_from_pdf_unified(
														
 
															                 pdf_bytes, 
														
 
															                 dpi=dpi,
														
 
															                 image_type=ImageType.PIL,
														
 
															-                renderer=renderer   # 使用指定的渲染引擎
														
 
															+                renderer=renderer
														
 
															             )
														
 
															             # 解析页面范围
														
@@ -176,7 +205,7 @@ class PDFUtils:
 
															                     'img_pil': img_dict['img_pil'],
														
 
															                     'scale': img_dict.get('scale', dpi / 72),
														
 
															                     'source_path': str(document_path),
														
 
															-                    'page_idx': idx,  # 原始页码索引
														
 
															+                    'page_idx': idx,
														
 
															                     'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
														
 
															                 })
														
@@ -199,6 +228,37 @@ class PDFUtils:
 
															         return all_images, pdf_type, pdf_doc, renderer
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _transform_bbox_for_rotation_fitz(
														
 
															+        bbox: List[float],
														
 
															+        rotation: int,
														
 
															+        pdf_width: float,
														
 
															+        pdf_height: float,
														
 
															+        scale: float
														
 
															+    ) -> List[float]:
														
 
															+        """向后兼容包装：fitz引擎坐标转换"""
														
 
															+        return transform_bbox_for_rotation_fitz(bbox, rotation, pdf_width, pdf_height, scale)
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _transform_bbox_for_rotation_pypdfium2(
														
 
															+        bbox: List[float],
														
 
															+        rotation: int,
														
 
															+        pdf_width: float,
														
 
															+        pdf_height: float,
														
 
															+        scale: float
														
 
															+    ) -> List[float]:
														
 
															+        """向后兼容包装：pypdfium2引擎坐标转换"""
														
 
															+        return transform_bbox_for_rotation_pypdfium2(bbox, rotation, pdf_width, pdf_height, scale)
														
 
															+
														
 
															+    # ========================================================================
														
 
															+    # 文本提取函数（向后兼容包装）
														
 
															+    # ========================================================================
														
 
															+
														
 
															+    # ========================================================================
														
 
															+    # 文本提取函数（向后兼容包装）
														
 
															+    # ========================================================================
														
 
															+    
														
 
															     @staticmethod
														
 
															     def extract_text_from_pdf(
														
 
															         pdf_doc: Any,
														
@@ -206,25 +266,8 @@ class PDFUtils:
 
															         bbox: List[float],
														
 
															         scale: float
														
 
															     ) -> Tuple[str, bool]:
														
 
															-        """
														
 
															-        从PDF直接提取文本（支持 pypdfium2 和 fitz）
														
 
															-        
														
 
															-        Args:
														
 
															-            pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
														
 
															-            page_idx: 页码索引
														
 
															-            bbox: 目标区域的bbox（图像坐标）
														
 
															-            scale: 图像与PDF的缩放比例
														
 
															-            
														
 
															-        Returns:
														
 
															-            (text, success)
														
 
															-        """
														
 
															-        # 检测 PDF 文档类型
														
 
															-        doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
														
 
															-        
														
 
															-        if doc_type == 'fitz':
														
 
															-            return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
														
 
															-        else:  # pypdfium2
														
 
															-            return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
														
 
															+        """向后兼容包装：从PDF指定区域提取文本"""
														
 
															+        return extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)
														
 
															     @staticmethod
														
 
															     def _extract_text_from_pdf_pypdfium2(
														
@@ -233,51 +276,8 @@ class PDFUtils:
 
															         bbox: List[float],
														
 
															         scale: float
														
 
															     ) -> Tuple[str, bool]:
														
 
															-        """使用 pypdfium2 提取文本（原有实现）"""
														
 
															-        if not MINERU_AVAILABLE or pdf_get_page_text is None:
														
 
															-            logger.error("MinerU pdf_text_tool not available")
														
 
															-            return "", False
														
 
															-            
														
 
															-        try:
														
 
															-            page = pdf_doc[page_idx]
														
 
															-            
														
 
															-            # 将图像坐标转换为PDF坐标
														
 
															-            pdf_bbox = [
														
 
															-                bbox[0] / scale,
														
 
															-                bbox[1] / scale,
														
 
															-                bbox[2] / scale,
														
 
															-                bbox[3] / scale
														
 
															-            ]
														
 
															-            
														
 
															-            # 使用 MinerU 的方式获取页面文本信息
														
 
															-            page_dict = pdf_get_page_text(page)
														
 
															-            
														
 
															-            # 从 blocks 中提取与 bbox 重叠的文本
														
 
															-            text_parts = []
														
 
															-            for block in page_dict.get('blocks', []):
														
 
															-                for line in block.get('lines', []):
														
 
															-                    line_bbox = line.get('bbox')
														
 
															-                    if line_bbox and hasattr(line_bbox, 'bbox'):
														
 
															-                        line_bbox = line_bbox.bbox
														
 
															-                    elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
														
 
															-                        line_bbox = list(line_bbox)
														
 
															-                    else:
														
 
															-                        continue
														
 
															-                    
														
 
															-                    if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
														
 
															-                        for span in line.get('spans', []):
														
 
															-                            span_text = span.get('text', '')
														
 
															-                            if span_text:
														
 
															-                                text_parts.append(span_text)
														
 
															-            
														
 
															-            text = ' '.join(text_parts)
														
 
															-            return text.strip(), bool(text.strip())
														
 
															-            
														
 
															-        except Exception as e:
														
 
															-            import traceback
														
 
															-            logger.debug(f"pypdfium2 text extraction error: {e}")
														
 
															-            logger.debug(traceback.format_exc())
														
 
															-            return "", False
														
 
															+        """向后兼容包装：使用pypdfium2提取文本"""
														
 
															+        return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
														
 
															     @staticmethod
														
 
															     def _extract_text_from_pdf_fitz(
														
@@ -286,190 +286,96 @@ class PDFUtils:
 
															         bbox: List[float],
														
 
															         scale: float
														
 
															     ) -> Tuple[str, bool]:
														
 
															-        """使用 fitz 提取文本"""
														
 
															-        try:
														
 
															-            import fitz
														
 
															-        except ImportError:
														
 
															-            logger.error("PyMuPDF (fitz) not available")
														
 
															-            return "", False
														
 
															-        
														
 
															-        try:
														
 
															-            page = pdf_doc[page_idx]
														
 
															-            
														
 
															-            # 将图像坐标转换为PDF坐标
														
 
															-            pdf_bbox = fitz.Rect(
														
 
															-                bbox[0] / scale,
														
 
															-                bbox[1] / scale,
														
 
															-                bbox[2] / scale,
														
 
															-                bbox[3] / scale
														
 
															-            )
														
 
															-            
														
 
															-            # 提取区域内的文本
														
 
															-            text = page.get_text("text", clip=pdf_bbox)
														
 
															-            
														
 
															-            return text.strip(), bool(text.strip())
														
 
															-            
														
 
															-        except Exception as e:
														
 
															-            import traceback
														
 
															-            logger.debug(f"fitz text extraction error: {e}")
														
 
															-            logger.debug(traceback.format_exc())
														
 
															-            return "", False
														
 
															-    
														
 
															+        """向后兼容包装：使用fitz提取文本"""
														
 
															+        return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
														
 
															+
														
 
															     @staticmethod
														
 
															     def extract_all_text_blocks(
														
 
															         pdf_doc: Any,
														
 
															         page_idx: int,
														
 
															         scale: float
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        提取页面所有文本块（支持 pypdfium2 和 fitz）
														
 
															-        
														
 
															-        Args:
														
 
															-            pdf_doc: PDF文档对象
														
 
															-            page_idx: 页码
														
 
															-            scale: 缩放比例
														
 
															-            
														
 
															-        Returns:
														
 
															-            文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2]}, ...]
														
 
															-        """
														
 
															-        # 检测 PDF 文档类型
														
 
															-        doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
														
 
															-        
														
 
															-        if doc_type == 'fitz':
														
 
															-            return PDFUtils._extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
														
 
															-        else:  # pypdfium2
														
 
															-            return PDFUtils._extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
														
 
															-    
														
 
															+    ) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+        """向后兼容包装：提取页面所有文本块"""
														
 
															+        return extract_all_text_blocks(pdf_doc, page_idx, scale)
														
 
															+
														
 
															     @staticmethod
														
 
															     def _extract_all_text_blocks_pypdfium2(
														
 
															         pdf_doc: Any,
														
 
															         page_idx: int,
														
 
															         scale: float
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """使用 pypdfium2 提取所有文本块（原有实现）"""
														
 
															-        if not MINERU_AVAILABLE or pdf_get_page_text is None:
														
 
															-            return []
														
 
															-            
														
 
															-        try:
														
 
															-            page = pdf_doc[page_idx]
														
 
															-            page_dict = pdf_get_page_text(page)
														
 
															-            
														
 
															-            extracted_blocks = []
														
 
															-            
														
 
															-            for block in page_dict.get('blocks', []):
														
 
															-                for line in block.get('lines', []):
														
 
															-                    line_text = ""
														
 
															-                    for span in line.get('spans', []):
														
 
															-                        line_text += span.get('text', "")
														
 
															-                    
														
 
															-                    if not line_text.strip():
														
 
															-                        continue
														
 
															-                        
														
 
															-                    line_bbox = line.get('bbox')
														
 
															-                    if line_bbox and hasattr(line_bbox, 'bbox'):
														
 
															-                        line_bbox = line_bbox.bbox
														
 
															-                    elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
														
 
															-                        line_bbox = list(line_bbox)
														
 
															-                    else:
														
 
															-                        continue
														
 
															-                        
														
 
															-                    img_bbox = [
														
 
															-                        line_bbox[0] * scale,
														
 
															-                        line_bbox[1] * scale,
														
 
															-                        line_bbox[2] * scale,
														
 
															-                        line_bbox[3] * scale
														
 
															-                    ]
														
 
															-                    
														
 
															-                    extracted_blocks.append({
														
 
															-                        'text': line_text,
														
 
															-                        'bbox': img_bbox,
														
 
															-                        'origin_bbox': line_bbox
														
 
															-                    })
														
 
															-            
														
 
															-            return extracted_blocks
														
 
															-            
														
 
															-        except Exception as e:
														
 
															-            logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
														
 
															-            import traceback
														
 
															-            logger.debug(traceback.format_exc())
														
 
															-            return []
														
 
															-    
														
 
															+    ) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+        """向后兼容包装：使用pypdfium2提取所有文本块"""
														
 
															+        return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
														
 
															+
														
 
															     @staticmethod
														
 
															     def _extract_all_text_blocks_fitz(
														
 
															         pdf_doc: Any,
														
 
															         page_idx: int,
														
 
															         scale: float
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """使用 fitz 提取所有文本块"""
														
 
															-        try:
														
 
															-            import fitz
														
 
															-        except ImportError:
														
 
															-            logger.warning("PyMuPDF (fitz) not available")
														
 
															-            return []
														
 
															-        
														
 
															-        try:
														
 
															-            page = pdf_doc[page_idx]
														
 
															-            
														
 
															-            # 使用 get_text("dict") 获取详细的文本信息
														
 
															-            text_dict = page.get_text("dict")
														
 
															-            
														
 
															-            extracted_blocks = []
														
 
															-            
														
 
															-            # 遍历所有 blocks
														
 
															-            for block in text_dict.get("blocks", []):
														
 
															-                # 只处理文本块（type=0）
														
 
															-                if block.get("type") != 0:
														
 
															-                    continue
														
 
															-                
														
 
															-                # 遍历所有 lines
														
 
															-                for line in block.get("lines", []):
														
 
															-                    line_text = ""
														
 
															-                    line_bbox = line.get("bbox")
														
 
															-                    
														
 
															-                    # 提取 line 中的所有 span 文本
														
 
															-                    for span in line.get("spans", []):
														
 
															-                        line_text += span.get("text", "")
														
 
															-                    
														
 
															-                    if not line_text.strip() or not line_bbox:
														
 
															-                        continue
														
 
															-                    
														
 
															-                    # PDF 坐标转换为图像坐标
														
 
															-                    img_bbox = [
														
 
															-                        line_bbox[0] * scale,
														
 
															-                        line_bbox[1] * scale,
														
 
															-                        line_bbox[2] * scale,
														
 
															-                        line_bbox[3] * scale
														
 
															-                    ]
														
 
															-                    
														
 
															-                    extracted_blocks.append({
														
 
															-                        'text': line_text,
														
 
															-                        'bbox': img_bbox,
														
 
															-                        'origin_bbox': list(line_bbox)
														
 
															-                    })
														
 
															-            
														
 
															-            return extracted_blocks
														
 
															-            
														
 
															-        except Exception as e:
														
 
															-            logger.warning(f"fitz extract_all_text_blocks failed: {e}")
														
 
															-            import traceback
														
 
															-            logger.debug(traceback.format_exc())
														
 
															-            return []    
														
 
															+    ) -> Tuple[List[Dict[str, Any]], int]:
														
 
															+        """向后兼容包装：使用fitz提取所有文本块"""
														
 
															+        return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
														
 
															     @staticmethod
														
 
															     def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
														
 
															-        """检查两个 bbox 是否重叠"""
														
 
															-        if len(bbox1) < 4 or len(bbox2) < 4:
														
 
															-            return False
														
 
															-        
														
 
															-        x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
														
 
															-        x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
														
 
															-        
														
 
															-        if x2_1 < x1_2 or x2_2 < x1_1:
														
 
															-            return False
														
 
															-        if y2_1 < y1_2 or y2_2 < y1_1:
														
 
															-            return False
														
 
															-        
														
 
															-        return True
														
 
															+        """向后兼容包装：检查两个bbox是否重叠"""
														
 
															+        return bbox_overlap(bbox1, bbox2)
														
 
															+    
														
 
															+    # ========================================================================
														
 
															+    # 图像渲染函数（向后兼容包装）
														
 
															+    # ========================================================================
														
 
															+    
														
 
															+    @staticmethod
														
 
															+    def load_images_from_pdf_unified(
														
 
															+        pdf_bytes: bytes,
														
 
															+        dpi: int = 200,
														
 
															+        start_page_id: int = 0,
														
 
															+        end_page_id: Optional[int] = None,
														
 
															+        image_type: str = "PIL",
														
 
															+        renderer: str = "pypdfium2",
														
 
															+        timeout: Optional[int] = None,
														
 
															+        threads: int = 4,
														
 
															+    ) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+        """向后兼容包装：统一的PDF图像加载接口"""
														
 
															+        return load_images_from_pdf_unified(
														
 
															+            pdf_bytes, dpi, start_page_id, end_page_id,
														
 
															+            image_type, renderer, timeout, threads
														
 
															+        )
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _load_images_pypdfium2(
														
 
															+        pdf_bytes: bytes,
														
 
															+        dpi: int,
														
 
															+        start_page_id: int,
														
 
															+        end_page_id: Optional[int],
														
 
															+        image_type: str,
														
 
															+        timeout: Optional[int],
														
 
															+        threads: int
														
 
															+    ) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+        """向后兼容包装：使用pypdfium2渲染"""
														
 
															+        return load_images_pypdfium2(
														
 
															+            pdf_bytes, dpi, start_page_id, end_page_id,
														
 
															+            image_type, timeout, threads
														
 
															+        )
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _load_images_fitz(
														
 
															+        pdf_bytes: bytes,
														
 
															+        dpi: int,
														
 
															+        start_page_id: int,
														
 
															+        end_page_id: Optional[int],
														
 
															+        image_type: str
														
 
															+    ) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															+        """向后兼容包装：使用fitz渲染"""
														
 
															+        return load_images_fitz(
														
 
															+            pdf_bytes, dpi, start_page_id, end_page_id, image_type
														
 
															+        )
														
 
															+    
														
 
															+    # ========================================================================
														
 
															+    # 其他功能
														
 
															+    # ========================================================================
														
 
															+    # 其他功能
														
 
															+    # ========================================================================
														
 
															     @staticmethod
														
 
															     def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
														
@@ -488,226 +394,3 @@ class PDFUtils:
 
															         # TODO: 实现跨页表格合并逻辑
														
 
															         return results
														
 
															-
														
 
															-# ============================================================================
														
 
															-# 统一的 PDF 图像加载函数 - 支持多种渲染引擎
														
 
															-# ============================================================================
														
 
															-
														
 
															-def load_images_from_pdf_unified(
														
 
															-    pdf_bytes: bytes,
														
 
															-    dpi: int = 200,
														
 
															-    start_page_id: int = 0,
														
 
															-    end_page_id: Optional[int] = None,
														
 
															-    image_type: str = "PIL",
														
 
															-    renderer: str = "pypdfium2",
														
 
															-    timeout: Optional[int] = None,
														
 
															-    threads: int = 4,
														
 
															-) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															-    """
														
 
															-    从 PDF 加载图像，支持两种渲染引擎
														
 
															-    
														
 
															-    Args:
														
 
															-        pdf_bytes: PDF 文件的字节数据
														
 
															-        dpi: 渲染 DPI，默认 200
														
 
															-        start_page_id: 起始页码（0-based），默认 0
														
 
															-        end_page_id: 结束页码（0-based，包含），默认 None（处理到最后）
														
 
															-        image_type: 返回图像类型，"PIL" 或 "BASE64"
														
 
															-        renderer: 渲染引擎选择
														
 
															-            - "pypdfium2": 使用 MinerU 标准的 pypdfium2（推荐）
														
 
															-              * 优势: Chrome PDFium 引擎，多进程加速，更好的细节保留
														
 
															-              * 尺寸限制: 3500px，超过则动态调整 scale
														
 
															-            - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
														
 
															-              * 优势: MuPDF 引擎，简单直接，无需额外依赖
														
 
															-              * 尺寸限制: 4500px，超过则降到 72 DPI
														
 
															-        timeout: 超时时间（秒），仅 pypdfium2 支持
														
 
															-        threads: 进程数，仅 pypdfium2 支持多进程加速（Windows 下自动禁用）
														
 
															-        
														
 
															-    Returns:
														
 
															-        (images_list, pdf_doc)
														
 
															-        - images_list: 图像列表，每个元素为 {'img_pil': PIL.Image, 'scale': float}
														
 
															-                      或 {'img_base64': str, 'scale': float}（取决于 image_type）
														
 
															-        - pdf_doc: PDF 文档对象（pypdfium2.PdfDocument 或 fitz.Document）
														
 
															-        
														
 
															-    Raises:
														
 
															-        ImportError: 如果选择的渲染引擎不可用
														
 
															-        ValueError: 如果参数无效
														
 
															-        TimeoutError: 如果转换超时（仅 pypdfium2）
														
 
															-    
														
 
															-    渲染引擎对比:
														
 
															-        ┌─────────────┬──────────────┬──────────────┐
														
 
															-        │   特性      │  pypdfium2   │    fitz      │
														
 
															-        ├─────────────┼──────────────┼──────────────┤
														
 
															-        │ 渲染引擎    │ Chrome PDFium│ MuPDF        │
														
 
															-        │ 多进程加速  │ ✅ (非Windows)│ ❌           │
														
 
															-        │ 超时控制    │ ✅           │ ❌           │
														
 
															-        │ 尺寸限制    │ 3500px       │ 4500px       │
														
 
															-        │ 超限处理    │ 动态调整scale│ 降到72 DPI   │
														
 
															-        │ 细节保留    │ 更好         │ 良好         │
														
 
															-        │ MinerU标准  │ ✅           │ ❌           │
														
 
															-        └─────────────┴──────────────┴──────────────┘
														
 
															-    
														
 
															-    示例:
														
 
															-        # 使用 pypdfium2（推荐，MinerU 标准）
														
 
															-        images, doc = load_images_from_pdf_unified(
														
 
															-            pdf_bytes, 
														
 
															-            dpi=200, 
														
 
															-            renderer="pypdfium2",
														
 
															-            threads=4
														
 
															-        )
														
 
															-        
														
 
															-        # 使用 PyMuPDF (fitz)
														
 
															-        images, doc = load_images_from_pdf_unified(
														
 
															-            pdf_bytes, 
														
 
															-            dpi=200, 
														
 
															-            renderer="fitz"
														
 
															-        )
														
 
															-        
														
 
															-        # 访问图像
														
 
															-        for img_dict in images:
														
 
															-            pil_image = img_dict['img_pil']
														
 
															-            scale = img_dict['scale']
														
 
															-            # 处理图像...
														
 
															-    
														
 
															-    注意事项:
														
 
															-        1. pypdfium2 在生产环境中更推荐，因为它是 MinerU 的标准实现
														
 
															-        2. 两种渲染引擎可能产生略有不同的图像（SSIM ≈ 0.945）
														
 
															-        3. 建议在同一项目中保持使用同一渲染引擎，避免不一致
														
 
															-        4. 如果需要与现有测试图像对比，使用相同的渲染引擎
														
 
															-    """
														
 
															-    renderer = renderer.lower()
														
 
															-    
														
 
															-    if renderer in ["pypdfium2", "pdfium"]:
														
 
															-        return _load_images_pypdfium2(
														
 
															-            pdf_bytes, dpi, start_page_id, end_page_id, 
														
 
															-            image_type, timeout, threads
														
 
															-        )
														
 
															-    elif renderer in ["fitz", "pymupdf", "mupdf"]:
														
 
															-        return _load_images_fitz(
														
 
															-            pdf_bytes, dpi, start_page_id, end_page_id, image_type
														
 
															-        )
														
 
															-    else:
														
 
															-        raise ValueError(
														
 
															-            f"不支持的渲染引擎: {renderer}. "
														
 
															-            f"请使用 'pypdfium2' 或 'fitz'"
														
 
															-        )
														
 
															-
														
 
															-
														
 
															-def _load_images_pypdfium2(
														
 
															-    pdf_bytes: bytes,
														
 
															-    dpi: int,
														
 
															-    start_page_id: int,
														
 
															-    end_page_id: Optional[int],
														
 
															-    image_type: str,
														
 
															-    timeout: Optional[int],
														
 
															-    threads: int
														
 
															-) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															-    """使用 pypdfium2 渲染引擎（MinerU 标准）"""
														
 
															-    try:
														
 
															-        import pypdfium2 as pdfium
														
 
															-        from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
														
 
															-        from mineru.utils.enum_class import ImageType
														
 
															-    except ImportError as e:
														
 
															-        raise ImportError(
														
 
															-            f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
														
 
															-            f"原始错误: {e}"
														
 
															-        )
														
 
															-    
														
 
															-    # 转换 image_type
														
 
															-    img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
														
 
															-    
														
 
															-    # 使用 MinerU 的实现
														
 
															-    images_list, pdf_doc = mineru_load_images(
														
 
															-        pdf_bytes=pdf_bytes,
														
 
															-        dpi=dpi,
														
 
															-        start_page_id=start_page_id,
														
 
															-        end_page_id=end_page_id,
														
 
															-        image_type=img_type,
														
 
															-        timeout=timeout,
														
 
															-        threads=threads
														
 
															-    )
														
 
															-    
														
 
															-    logger.info(
														
 
															-        f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
														
 
															-        f"(DPI={dpi}, 多进程={threads})"
														
 
															-    )
														
 
															-    
														
 
															-    return images_list, pdf_doc
														
 
															-
														
 
															-
														
 
															-def _load_images_fitz(
														
 
															-    pdf_bytes: bytes,
														
 
															-    dpi: int,
														
 
															-    start_page_id: int,
														
 
															-    end_page_id: Optional[int],
														
 
															-    image_type: str
														
 
															-) -> Tuple[List[Dict[str, Any]], Any]:
														
 
															-    """使用 PyMuPDF (fitz) 渲染引擎"""
														
 
															-    try:
														
 
															-        import fitz
														
 
															-    except ImportError as e:
														
 
															-        raise ImportError(
														
 
															-            f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
														
 
															-            f"原始错误: {e}"
														
 
															-        )
														
 
															-    
														
 
															-    from io import BytesIO
														
 
															-    import base64
														
 
															-    
														
 
															-    # 打开 PDF
														
 
															-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
														
 
															-    pdf_page_num = doc.page_count
														
 
															-    
														
 
															-    # 处理 end_page_id
														
 
															-    if end_page_id is None or end_page_id < 0:
														
 
															-        end_page_id = pdf_page_num - 1
														
 
															-    end_page_id = min(end_page_id, pdf_page_num - 1)
														
 
															-    
														
 
															-    # 渲染图像
														
 
															-    images_list = []
														
 
															-    mat = fitz.Matrix(dpi / 72, dpi / 72)
														
 
															-    
														
 
															-    for index in range(start_page_id, end_page_id + 1):
														
 
															-        page = doc[index]
														
 
															-        
														
 
															-        # 渲染为 pixmap
														
 
															-        pm = page.get_pixmap(matrix=mat, alpha=False)
														
 
															-        
														
 
															-        # 如果超过尺寸限制，降低到 72 DPI
														
 
															-        if pm.width > 4500 or pm.height > 4500:
														
 
															-            logger.warning(
														
 
															-                f"⚠️  页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
														
 
															-                f"降低到 72 DPI"
														
 
															-            )
														
 
															-            mat_fallback = fitz.Matrix(1, 1)  # 72 DPI
														
 
															-            pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
														
 
															-        
														
 
															-        # 转换为 PIL Image
														
 
															-        pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
														
 
															-        
														
 
															-        # 计算实际 scale
														
 
															-        page_rect = page.rect
														
 
															-        actual_scale = pm.width / page_rect.width
														
 
															-        
														
 
															-        # 构建返回字典
														
 
															-        image_dict = {
														
 
															-            'img_pil': pil_img,
														
 
															-            'scale': actual_scale
														
 
															-        }
														
 
															-        
														
 
															-        # 如果需要 BASE64
														
 
															-        if image_type.upper() == "BASE64":
														
 
															-            buffer = BytesIO()
														
 
															-            pil_img.save(buffer, format="JPEG")
														
 
															-            img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
														
 
															-            image_dict['img_base64'] = img_base64
														
 
															-            # 移除 img_pil 以节省内存
														
 
															-            del image_dict['img_pil']
														
 
															-        
														
 
															-        images_list.append(image_dict)
														
 
															-    
														
 
															-    logger.info(
														
 
															-        f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
														
 
															-        f"(DPI={dpi}, 单进程)"
														
 
															-    )
														
 
															-    
														
 
															-    return images_list, doc