|
@@ -7,8 +7,9 @@
|
|
|
- 文件列表处理
|
|
- 文件列表处理
|
|
|
"""
|
|
"""
|
|
|
import tempfile
|
|
import tempfile
|
|
|
|
|
+import re
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
-from typing import List, Tuple
|
|
|
|
|
|
|
+from typing import List, Tuple, Optional, Set
|
|
|
import json
|
|
import json
|
|
|
import traceback
|
|
import traceback
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
@@ -23,6 +24,60 @@ except ImportError:
|
|
|
ImageType = None
|
|
ImageType = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
    """
    Parse a page-range string into a set of 0-based page indices.

    Page numbers in the input are 1-based; the result is 0-based.

    Supported formats:
        - "1-5"        -> {0, 1, 2, 3, 4}
        - "3"          -> {2}
        - "1-5,7,9-12" -> {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
        - "1-"         -> from page 1 to the last page
        - "-5"         -> from page 1 to page 5

    Whitespace anywhere in the string (spaces, tabs, newlines) is ignored.
    Ranges are clamped to ``[1, total_pages]``; a single page outside that
    interval is skipped. A bare "-" has both bounds open and therefore
    selects every page. Malformed parts are skipped with a warning.

    Args:
        page_range: Page-range string (1-based). ``None``, an empty string or
            a whitespace-only string selects every page.
        total_pages: Total number of pages available.

    Returns:
        Set of selected 0-based page indices (possibly empty).
    """
    if not page_range or not page_range.strip():
        return set(range(total_pages))

    pages: Set[int] = set()

    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces removes tabs/newlines as well as plain spaces.
    compact = "".join(page_range.split())

    for part in compact.split(","):
        if not part:
            continue

        if "-" in part:
            # Range form; either bound may be omitted ("3-", "-7").
            match = re.fullmatch(r"(\d*)-(\d*)", part)
            if match is None:
                # Report malformed ranges the same way bad single pages are
                # reported, instead of dropping them silently.
                logger.warning(f"Invalid page range: {part}")
                continue

            start_str, end_str = match.groups()
            first_page = int(start_str) if start_str else 1
            last_page = int(end_str) if end_str else total_pages

            # Convert to a half-open 0-based interval, clamped to the document.
            start = max(0, first_page - 1)
            end = min(total_pages, last_page)
            pages.update(range(start, end))
        else:
            # Single page number.
            try:
                page_num = int(part)
            except ValueError:
                logger.warning(f"Invalid page number: {part}")
                continue
            if 1 <= page_num <= total_pages:
                pages.add(page_num - 1)  # convert to 0-based index

    return pages
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
|
|
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
|
|
|
"""
|
|
"""
|
|
|
将文件列表分割成指定数量的子列表
|
|
将文件列表分割成指定数量的子列表
|
|
@@ -235,62 +290,107 @@ def convert_pdf_to_images(
|
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 使用MinerU的函数加载PDF图像
|
|
|
|
|
- if not MINERU_AVAILABLE or load_images_from_pdf is None or ImageType is None:
|
|
|
|
|
- logger.error("❌ MinerU components not available for PDF to image conversion")
|
|
|
|
|
- return []
|
|
|
|
|
-
|
|
|
|
|
- images, _ = load_images_from_pdf(
|
|
|
|
|
- pdf_path.read_bytes(),
|
|
|
|
|
- dpi=dpi,
|
|
|
|
|
- image_type=ImageType.PIL # 返回包含 img_pil 的字典列表
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- # 应用页面范围过滤
|
|
|
|
|
- selected_pages = None
|
|
|
|
|
- if page_range:
|
|
|
|
|
- from .pdf_utils import PDFUtils
|
|
|
|
|
- total_pages = len(images)
|
|
|
|
|
- selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
|
|
|
|
|
- if selected_pages:
|
|
|
|
|
- images = [images[i] for i in sorted(selected_pages)]
|
|
|
|
|
- logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
|
|
|
|
|
|
|
+ # 优先使用 MinerU 的函数(如果可用)
|
|
|
|
|
+ if MINERU_AVAILABLE and load_images_from_pdf is not None and ImageType is not None:
|
|
|
|
|
+ images, _ = load_images_from_pdf(
|
|
|
|
|
+ pdf_path.read_bytes(),
|
|
|
|
|
+ dpi=dpi,
|
|
|
|
|
+ image_type=ImageType.PIL # 返回包含 img_pil 的字典列表
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 应用页面范围过滤
|
|
|
|
|
+ selected_pages = None
|
|
|
|
|
+ if page_range:
|
|
|
|
|
+ total_pages = len(images)
|
|
|
|
|
+ selected_pages = parse_page_range(page_range, total_pages)
|
|
|
|
|
+ if selected_pages:
|
|
|
|
|
+ images = [images[i] for i in sorted(selected_pages)]
|
|
|
|
|
+ logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
|
|
|
|
|
+ return []
|
|
|
else:
|
|
else:
|
|
|
- logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
|
|
|
|
|
- return []
|
|
|
|
|
-
|
|
|
|
|
- image_paths = []
|
|
|
|
|
- # 需要跟踪原始页码索引,以便正确命名文件
|
|
|
|
|
- original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))
|
|
|
|
|
-
|
|
|
|
|
- for idx, image in enumerate(images):
|
|
|
|
|
- # 获取原始页码索引(用于文件命名)
|
|
|
|
|
- original_idx = original_indices[idx] if selected_pages else idx
|
|
|
|
|
- # 生成图像文件名(使用原始页码,从1开始)
|
|
|
|
|
- image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
|
|
|
|
|
- image_path = output_path / image_filename
|
|
|
|
|
|
|
+ selected_pages = None
|
|
|
|
|
+
|
|
|
|
|
+ image_paths = []
|
|
|
|
|
+ # 需要跟踪原始页码索引,以便正确命名文件
|
|
|
|
|
+ original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))
|
|
|
|
|
+
|
|
|
|
|
+ for idx, image in enumerate(images):
|
|
|
|
|
+ # 获取原始页码索引(用于文件命名)
|
|
|
|
|
+ original_idx = original_indices[idx] if selected_pages else idx
|
|
|
|
|
+ # 生成图像文件名(使用原始页码,从1开始)
|
|
|
|
|
+ image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
|
|
|
|
|
+ image_path = output_path / image_filename
|
|
|
|
|
|
|
|
- # 保存图像 - 从字典中提取 img_pil
|
|
|
|
|
- if isinstance(image, dict):
|
|
|
|
|
- pil_image = image.get('img_pil')
|
|
|
|
|
- if pil_image is None:
|
|
|
|
|
- logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
|
|
|
|
|
- continue
|
|
|
|
|
- pil_image.save(str(image_path))
|
|
|
|
|
- else:
|
|
|
|
|
- # 如果不是字典,假设是直接的 PIL Image
|
|
|
|
|
- image.save(str(image_path))
|
|
|
|
|
- image_paths.append(str(image_path))
|
|
|
|
|
|
|
+ # 保存图像 - 从字典中提取 img_pil
|
|
|
|
|
+ if isinstance(image, dict):
|
|
|
|
|
+ pil_image = image.get('img_pil')
|
|
|
|
|
+ if pil_image is None:
|
|
|
|
|
+ logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
|
|
|
|
|
+ continue
|
|
|
|
|
+ pil_image.save(str(image_path))
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 如果不是字典,假设是直接的 PIL Image
|
|
|
|
|
+ image.save(str(image_path))
|
|
|
|
|
+ image_paths.append(str(image_path))
|
|
|
|
|
|
|
|
- logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
|
|
|
|
|
- return image_paths
|
|
|
|
|
|
|
+ logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images (using MinerU)")
|
|
|
|
|
+ return image_paths
|
|
|
|
|
+
|
|
|
|
|
+ else:
|
|
|
|
|
+ # Fallback: 使用 pypdfium2(PaddleX 环境中可用)
|
|
|
|
|
+ logger.info("ℹ️ MinerU 不可用,使用 pypdfium2 进行 PDF 转图像")
|
|
|
|
|
+ try:
|
|
|
|
|
+ import pypdfium2 as pdfium
|
|
|
|
|
+ except ImportError:
|
|
|
|
|
+ logger.error("❌ pypdfium2 未安装,无法转换 PDF。请安装: pip install pypdfium2")
|
|
|
|
|
+ return []
|
|
|
|
|
+
|
|
|
|
|
+ pdf_doc = pdfium.PdfDocument(pdf_path)
|
|
|
|
|
+ try:
|
|
|
|
|
+ total_pages = len(pdf_doc)
|
|
|
|
|
+
|
|
|
|
|
+ # 解析页面范围(使用本地函数,不依赖 PDFUtils)
|
|
|
|
|
+ selected_pages = parse_page_range(page_range, total_pages)
|
|
|
|
|
+ if not selected_pages:
|
|
|
|
|
+ logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
|
|
|
|
|
+ return []
|
|
|
|
|
+
|
|
|
|
|
+ if page_range:
|
|
|
|
|
+ logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
|
|
|
|
|
+
|
|
|
|
|
+ # 计算缩放比例(DPI 转换)
|
|
|
|
|
+ # pypdfium2 的 scale 参数:1.0 = 72 DPI,所以 dpi/72 = scale
|
|
|
|
|
+ scale = dpi / 72.0
|
|
|
|
|
+
|
|
|
|
|
+ image_paths = []
|
|
|
|
|
+ for page_idx in sorted(selected_pages):
|
|
|
|
|
+ page = pdf_doc[page_idx]
|
|
|
|
|
+
|
|
|
|
|
+ # 渲染页面为图像
|
|
|
|
|
+ bitmap = page.render(scale=scale)
|
|
|
|
|
+ pil_image = bitmap.to_pil()
|
|
|
|
|
+
|
|
|
|
|
+ # 生成图像文件名(页码从1开始)
|
|
|
|
|
+ image_filename = f"{pdf_path.stem}_page_{page_idx + 1:03d}.png"
|
|
|
|
|
+ image_path = output_path / image_filename
|
|
|
|
|
+
|
|
|
|
|
+ # 保存图像
|
|
|
|
|
+ pil_image.save(str(image_path))
|
|
|
|
|
+ image_paths.append(str(image_path))
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"✅ Converted {len(image_paths)} pages from {pdf_path.name} to images (using pypdfium2)")
|
|
|
|
|
+ return image_paths
|
|
|
|
|
+
|
|
|
|
|
+ finally:
|
|
|
|
|
+ pdf_doc.close()
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
|
|
logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
|
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
-
|
|
|
|
|
def get_input_files(args, page_range: str | None = None) -> List[str]:
|
|
def get_input_files(args, page_range: str | None = None) -> List[str]:
|
|
|
"""
|
|
"""
|
|
|
获取输入文件列表,统一处理PDF和图像文件,支持页面范围过滤
|
|
获取输入文件列表,统一处理PDF和图像文件,支持页面范围过滤
|
|
@@ -356,9 +456,8 @@ def get_input_files(args, page_range: str | None = None) -> List[str]:
|
|
|
|
|
|
|
|
# 对于图片目录,应用页面范围过滤
|
|
# 对于图片目录,应用页面范围过滤
|
|
|
if page_range and image_files:
|
|
if page_range and image_files:
|
|
|
- from .pdf_utils import PDFUtils
|
|
|
|
|
total_pages = len(image_files)
|
|
total_pages = len(image_files)
|
|
|
- selected_pages = PDFUtils.parse_page_range(page_range, total_pages)
|
|
|
|
|
|
|
+ selected_pages = parse_page_range(page_range, total_pages)
|
|
|
if selected_pages:
|
|
if selected_pages:
|
|
|
image_files = [image_files[i] for i in sorted(selected_pages)]
|
|
image_files = [image_files[i] for i in sorted(selected_pages)]
|
|
|
logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")
|
|
logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")
|