"""Helpers for collecting, splitting and converting input files (images and PDFs)."""

import csv
import json
import tempfile
import traceback
from pathlib import Path
from typing import List, Tuple

from .doc_utils import load_images_from_pdf

# Lower-case suffixes recognised by the directory scanners in this module.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')
_PDF_EXTENSIONS = ('.pdf',)


def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
    """
    Split a list of files into at most ``num_splits`` contiguous chunks.

    The first ``len(file_list) % num_splits`` chunks receive one extra file,
    so chunk sizes differ by at most one. Empty chunks are never returned.

    Args:
        file_list: File paths to distribute.
        num_splits: Requested number of chunks.

    Returns:
        The list of non-empty chunks. When ``num_splits <= 0`` the input is
        returned as a single chunk (``[file_list]``).
    """
    if num_splits <= 0:
        return [file_list]

    base_size, remainder = divmod(len(file_list), num_splits)

    chunks = []
    start = 0
    for index in range(num_splits):
        size = base_size + (1 if index < remainder else 0)
        if size == 0:
            # Sizes are non-increasing, so every later chunk is empty too.
            break
        chunks.append(file_list[start:start + size])
        start += size
    return chunks


def create_temp_file_list(file_chunk: List[str]) -> str:
    """
    Write file paths, one per line, to a new temporary ``.txt`` file.

    The file is created with ``delete=False``; removing it is the caller's
    responsibility.

    Args:
        file_chunk: File paths to write.

    Returns:
        Path of the temporary file.
    """
    # Written as UTF-8 because get_image_files_from_list() reads list files
    # as UTF-8; the locale default could corrupt non-ASCII paths.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    ) as f:
        f.writelines(f"{file_path}\n" for file_path in file_chunk)
        return f.name


def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int | None = None) -> List[str]:
    """
    List image files in a directory.

    For every supported image extension, both the lower-case and the
    upper-case spelling are globbed as ``<pattern><ext>``.

    Args:
        input_dir: Directory to search.
        pattern: Glob pattern for the file name without its extension.
        max_files: Maximum number of files to return; ``None`` or ``0``
            means no limit.

    Returns:
        Sorted, de-duplicated list of image file paths.
    """
    matches = set()
    for ext in _IMAGE_EXTENSIONS:
        for suffix in (ext, ext.upper()):
            matches.update(str(p) for p in input_dir.glob(f"{pattern}{suffix}"))

    image_files = sorted(matches)

    # Falsy max_files (None or 0) deliberately means "no limit".
    if max_files:
        image_files = image_files[:max_files]

    return image_files


def get_image_files_from_list(file_list_path: str) -> List[str]:
    """
    Read image paths from a text file (one path per line) and keep those that exist.

    Blank lines are ignored. Missing files are reported on stdout (at most
    the first five are listed) and dropped from the result.

    Args:
        file_list_path: Path of the UTF-8 encoded list file.

    Returns:
        Paths from the list that exist on disk, in file order.
    """
    print(f"📄 Reading file list from: {file_list_path}")

    with open(file_list_path, 'r', encoding='utf-8') as f:
        image_files = [line.strip() for line in f if line.strip()]

    # Partition the entries by whether the path exists.
    valid_files = []
    missing_files = []
    for file_path in image_files:
        target = valid_files if Path(file_path).exists() else missing_files
        target.append(file_path)

    if missing_files:
        print(f"⚠️ Warning: {len(missing_files)} files not found:")
        for missing_file in missing_files[:5]:  # show only the first five
            print(f" - {missing_file}")
        if len(missing_files) > 5:
            print(f" ... and {len(missing_files) - 5} more")

    print(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
    return valid_files


def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
    """
    Read image paths from a CSV file, keeping rows whose status matches.

    The expected columns are ``image_path,status``. The last column is taken
    as the status and everything before it as the path, so paths containing
    commas work whether or not the writer quoted them. Blank or
    single-column rows and the ``image_path,status`` header row are skipped.

    Args:
        csv_file: Path of the CSV file.
        status_filter: Status to keep (compared case-insensitively).

    Returns:
        Image paths whose status equals ``status_filter``, in file order.
    """
    print(f"📄 Reading image files from CSV: {csv_file}")

    wanted = status_filter.lower()
    image_files = []
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM if present;
    # newline='' is what the csv module requires for correct quoting.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue  # blank or malformed line
            status = row[-1].strip()
            image_file = ",".join(row[:-1]).strip()
            if image_file.lower() == "image_path" and status.lower() == "status":
                continue  # header row
            if status.lower() == wanted:
                image_files.append(image_file)

    return image_files


def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
    """
    Collect per-image results from a single process's statistics file.

    The file is JSON shaped like::

        {
          "results": [
            {
              "image_path": "some_document.pdf_140.jpg",
              "processing_time": 2.02e-06,
              "success": true,
              "device": "gpu:3",
              "output_json": "/path/to/process_3/some_document.pdf_140.json",
              "output_md": "/path/to/process_3/some_document.pdf_140.md"
            },
            ...
          ]
        }

    Only ``image_path`` and ``success`` are read here.

    Args:
        pid_output_file: Path of the process output file.

    Returns:
        List of ``(image_path, status)`` tuples, where status is
        ``"success"`` if the entry's ``success`` is truthy and ``"fail"``
        otherwise. An empty list is returned (with a warning printed) when
        the file is missing, is not valid JSON, or lacks a ``results`` list.
    """
    if not Path(pid_output_file).exists():
        print(f"⚠️ Warning: PID output file not found: {pid_output_file}")
        return []

    try:
        with open(pid_output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # Handled below together with the other "invalid format" cases.
        data = None

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list):
        print(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    file_list = []
    for file_result in results:
        if not isinstance(file_result, dict):
            continue  # ignore malformed entries rather than crash
        image_path = file_result.get("image_path", "")
        status = "success" if file_result.get("success", False) else "fail"
        file_list.append((image_path, status))

    return file_list


def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
    """
    Render every page of a PDF to a PNG file.

    Images are written to ``<output_dir>/<pdf stem>/`` (or next to the PDF,
    in a directory named after it, when ``output_dir`` is ``None``) as
    ``<pdf stem>_page_NNN.png`` with 1-based, zero-padded page numbers.

    Args:
        pdf_file: Path of the PDF file.
        output_dir: Base output directory, or ``None`` to use the PDF's
            parent directory.
        dpi: Rendering resolution passed to ``load_images_from_pdf``.

    Returns:
        Paths of the generated images, in page order. An empty list is
        returned if the path is not an existing ``.pdf`` file or if the
        conversion fails (the error and traceback are printed).
    """
    pdf_path = Path(pdf_file)
    if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
        print(f"❌ Invalid PDF file: {pdf_path}")
        return []

    base_dir = pdf_path.parent if output_dir is None else Path(output_dir)
    output_path = (base_dir / f"{pdf_path.stem}").resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        images = load_images_from_pdf(str(pdf_path), dpi=dpi)

        image_paths = []
        for page_number, image in enumerate(images, start=1):
            image_path = output_path / f"{pdf_path.stem}_page_{page_number:03d}.png"
            image.save(str(image_path))
            image_paths.append(str(image_path))

        print(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
        return image_paths

    except Exception as e:
        # Best-effort boundary: one bad PDF must not abort the whole batch.
        print(f"❌ Error converting PDF {pdf_path}: {e}")
        traceback.print_exc()
        return []


def _list_supported_files(input_dir: Path) -> List[str]:
    """Return the sorted image and PDF files directly inside ``input_dir``."""
    if not input_dir.is_dir():
        return []
    supported = set(_IMAGE_EXTENSIONS + _PDF_EXTENSIONS)
    # One pass with a case-insensitive suffix test: each file is seen exactly
    # once on any filesystem, and mixed-case suffixes such as ".Jpg" match.
    return sorted(
        str(p) for p in input_dir.iterdir()
        if p.is_file() and p.suffix.lower() in supported
    )


def get_input_files(args) -> List[str]:
    """
    Build the list of image files to process, converting PDFs on the way.

    The input source is chosen in this order: ``args.input_csv`` (rows with
    status ``"fail"``), ``args.input_file_list``, ``args.input_file``, and
    finally every supported file in ``args.input_dir``. PDFs are rendered to
    images via ``convert_pdf_to_images`` using ``args.output_dir`` and
    ``args.pdf_dpi``; image paths that do not exist are dropped.

    Args:
        args: Parsed command-line arguments providing the attributes above.

    Returns:
        Sorted, de-duplicated list of image file paths.
    """
    if args.input_csv:
        raw_files = get_image_files_from_csv(args.input_csv, "fail")
    elif args.input_file_list:
        raw_files = get_image_files_from_list(args.input_file_list)
    elif args.input_file:
        raw_files = [str(Path(args.input_file).resolve())]
    else:
        input_dir = Path(args.input_dir).resolve()
        if not input_dir.exists():
            print(f"❌ Input directory does not exist: {input_dir}")
            return []
        raw_files = _list_supported_files(input_dir)

    input_files = []
    pdf_count = 0
    image_count = 0

    # De-duplicate (keeping order) so a PDF is never converted twice and the
    # counters reflect distinct inputs.
    for raw_file in dict.fromkeys(str(f) for f in raw_files):
        file_path = Path(raw_file)

        if file_path.suffix.lower() in _PDF_EXTENSIONS:
            print(f"📄 Processing PDF: {file_path.name}")
            pdf_images = convert_pdf_to_images(
                str(file_path),
                args.output_dir,
                dpi=args.pdf_dpi
            )
            input_files.extend(pdf_images)
            pdf_count += 1
        elif file_path.exists():
            input_files.append(str(file_path))
            image_count += 1

    # De-duplicate before reporting so the printed total matches the result.
    unique_files = sorted(set(input_files))

    print("📊 Input summary:")
    print(f" PDF files processed: {pdf_count}")
    print(f" Image files found: {image_count}")
    print(f" Total image files to process: {len(unique_files)}")

    return unique_files