|
|
@@ -2,6 +2,8 @@ import tempfile
|
|
|
from pathlib import Path
|
|
|
from typing import List, Tuple
|
|
|
import json
|
|
|
+from .doc_utils import load_images_from_pdf
|
|
|
+import traceback
|
|
|
|
|
|
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
|
|
|
"""
|
|
|
@@ -47,7 +49,7 @@ def create_temp_file_list(file_chunk: List[str]) -> str:
|
|
|
f.write(f"{file_path}\n")
|
|
|
return f.name
|
|
|
|
|
|
-def get_image_files_from_dir(input_dir: Path, max_files: int = None) -> List[str]:
|
|
|
+def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int = None) -> List[str]:
|
|
|
"""
|
|
|
从目录获取图像文件列表
|
|
|
|
|
|
@@ -62,9 +64,9 @@ def get_image_files_from_dir(input_dir: Path, max_files: int = None) -> List[str
|
|
|
image_files = []
|
|
|
|
|
|
for ext in image_extensions:
|
|
|
- image_files.extend(list(input_dir.glob(f"*{ext}")))
|
|
|
- image_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
|
|
|
-
|
|
|
+ image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
|
|
|
+ image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
|
|
|
+
|
|
|
# 去重并排序
|
|
|
image_files = sorted(list(set(str(f) for f in image_files)))
|
|
|
|
|
|
@@ -174,4 +176,117 @@ def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
|
|
|
image_path = file_result.get("image_path", "")
|
|
|
status = "success" if file_result.get("success", False) else "fail"
|
|
|
file_list.append((image_path, status))
|
|
|
- return file_list
|
|
|
+ return file_list
|
|
|
+
|
|
|
+def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
|
|
|
+ """
|
|
|
+ 将PDF转换为图像文件
|
|
|
+
|
|
|
+ Args:
|
|
|
+ pdf_file: PDF文件路径
|
|
|
+ output_dir: 输出目录
|
|
|
+ dpi: 图像分辨率
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 生成的图像文件路径列表
|
|
|
+ """
|
|
|
+ pdf_path = Path(pdf_file)
|
|
|
+ if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
|
|
|
+ print(f"❌ Invalid PDF file: {pdf_path}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ # 如果没有指定输出目录,使用PDF同名目录
|
|
|
+ if output_dir is None:
|
|
|
+ output_path = pdf_path.parent / f"{pdf_path.stem}"
|
|
|
+ else:
|
|
|
+ output_path = Path(output_dir) / f"{pdf_path.stem}"
|
|
|
+ output_path = output_path.resolve()
|
|
|
+ output_path.mkdir(parents=True, exist_ok=True)
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 使用doc_utils中的函数加载PDF图像
|
|
|
+ images = load_images_from_pdf(str(pdf_path), dpi=dpi)
|
|
|
+
|
|
|
+ image_paths = []
|
|
|
+ for i, image in enumerate(images):
|
|
|
+ # 生成图像文件名
|
|
|
+ image_filename = f"{pdf_path.stem}_page_{i+1:03d}.png"
|
|
|
+ image_path = output_path / image_filename
|
|
|
+
|
|
|
+ # 保存图像
|
|
|
+ image.save(str(image_path))
|
|
|
+ image_paths.append(str(image_path))
|
|
|
+
|
|
|
+ print(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
|
|
|
+ return image_paths
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ print(f"❌ Error converting PDF {pdf_path}: {e}")
|
|
|
+ traceback.print_exc()
|
|
|
+ return []
|
|
|
+
|
|
|
def get_input_files(args) -> List[str]:
    """
    Build the list of image files to process, handling PDFs and images alike.

    The input source is taken from the first of these attributes that is
    truthy: ``args.input_csv``, ``args.input_file_list``, ``args.input_file``;
    otherwise ``args.input_dir`` is scanned (non-recursively) for supported
    image and PDF extensions. PDFs are converted to images through
    ``convert_pdf_to_images`` using ``args.output_dir`` and ``args.pdf_dpi``;
    other files are kept as they are when they exist on disk.

    Args:
        args: Parsed command-line arguments carrying the attributes above.

    Returns:
        Sorted, de-duplicated list of image file paths. Empty when the input
        directory does not exist or nothing usable was found.
    """
    # Collect the raw inputs from whichever source was supplied.
    if args.input_csv:
        raw_files = get_image_files_from_csv(args.input_csv, "fail")
    elif args.input_file_list:
        raw_files = get_image_files_from_list(args.input_file_list)
    elif args.input_file:
        raw_files = [Path(args.input_file).resolve()]
    else:
        input_dir = Path(args.input_dir).resolve()
        if not input_dir.exists():
            print(f"❌ Input directory does not exist: {input_dir}")
            return []

        # All supported files: images plus PDFs.
        image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
        pdf_extensions = ['.pdf']

        raw_files = []
        for ext in image_extensions + pdf_extensions:
            raw_files.extend(input_dir.glob(f"*{ext}"))
            raw_files.extend(input_dir.glob(f"*{ext.upper()}"))

    # On case-insensitive filesystems the lower- and upper-case globs return
    # the same files. De-duplicate (keeping order) before processing so a PDF
    # is converted only once and the counters below are accurate.
    unique_files = list(dict.fromkeys(str(f) for f in raw_files))

    input_files = []
    pdf_count = 0
    image_count = 0

    for raw_path in unique_files:
        file_path = Path(raw_path)

        if file_path.suffix.lower() == '.pdf':
            # Expand the PDF into one image per page.
            print(f"📄 Processing PDF: {file_path.name}")
            pdf_images = convert_pdf_to_images(
                str(file_path),
                args.output_dir,
                dpi=args.pdf_dpi
            )
            input_files.extend(pdf_images)
            pdf_count += 1
        elif file_path.exists():
            # Image files are passed through unchanged.
            input_files.append(str(file_path))
            image_count += 1
        else:
            # Make dropped entries visible instead of skipping them silently.
            print(f"⚠️ Input file not found, skipping: {file_path}")

    # De-duplicate before reporting so the printed total matches the result.
    result = sorted(set(input_files))

    print("📊 Input summary:")
    print(f" PDF files processed: {pdf_count}")
    print(f" Image files found: {image_count}")
    print(f" Total image files to process: {len(result)}")

    return result
|