import csv
import json
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple


def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
    """
    Split a list of files into at most ``num_splits`` contiguous sub-lists.

    The sizes differ by at most one: the first ``len(file_list) % num_splits``
    chunks receive one extra file. Empty chunks are never returned, so fewer
    than ``num_splits`` chunks come back when there are fewer files than splits.

    Args:
        file_list: File paths to distribute.
        num_splits: Requested number of chunks. If it is <= 0 the input list
            is returned unsplit as the only chunk.

    Returns:
        The non-empty chunks, in the original file order.
    """
    if num_splits <= 0:
        return [file_list]

    base_size, remainder = divmod(len(file_list), num_splits)

    chunks = []
    start = 0
    for index in range(num_splits):
        # The first `remainder` chunks take one extra file.
        size = base_size + (1 if index < remainder else 0)
        if size == 0:
            # Sizes never increase, so every remaining chunk would be empty.
            break
        chunks.append(file_list[start:start + size])
        start += size
    return chunks


def create_temp_file_list(file_chunk: List[str]) -> str:
    """
    Write the given paths to a temporary ``.txt`` file, one path per line.

    The file is created with ``delete=False``, so it is not removed when it is
    closed; removing it is left to the caller.

    Args:
        file_chunk: File paths to write.

    Returns:
        Path of the temporary file that was created.
    """
    # UTF-8 is used explicitly so the file can be read back by
    # get_image_files_from_list (which reads UTF-8) regardless of the locale.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    ) as f:
        f.writelines(f"{file_path}\n" for file_path in file_chunk)
    return f.name


def get_image_files_from_dir(
    input_dir: Path, pattern: str = "*", max_files: Optional[int] = None
) -> List[str]:
    """
    Collect image files that are direct children of a directory.

    Args:
        input_dir: Directory to search (not searched recursively unless the
            pattern itself contains ``**``).
        pattern: Glob pattern for the file name without its extension; each
            known image extension is appended to it.
        max_files: Maximum number of files to return. ``None`` or ``0`` means
            no limit.

    Returns:
        Sorted, de-duplicated list of image file paths as strings.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')

    # Both the lower-case and upper-case spelling of each extension are
    # globbed; a set removes the duplicates this yields on case-insensitive
    # file systems.
    found = set()
    for ext in image_extensions:
        for suffix in (ext, ext.upper()):
            found.update(str(path) for path in input_dir.glob(f"{pattern}{suffix}"))

    image_files = sorted(found)

    # A falsy max_files (None or 0) disables the limit.
    if max_files:
        image_files = image_files[:max_files]

    return image_files


def get_image_files_from_list(file_list_path: str) -> List[str]:
    """
    Read image paths from a text file (one per line) and keep those that exist.

    Blank lines are ignored. A summary of missing files is printed, listing at
    most the first five of them.

    Args:
        file_list_path: Path of the UTF-8 text file containing the file list.

    Returns:
        The listed paths that exist on disk, in file order.
    """
    print(f"📄 Reading file list from: {file_list_path}")

    with open(file_list_path, 'r', encoding='utf-8') as f:
        image_files = [line.strip() for line in f if line.strip()]

    # Partition the listed paths by whether they exist on disk.
    valid_files = []
    missing_files = []
    for file_path in image_files:
        if Path(file_path).exists():
            valid_files.append(file_path)
        else:
            missing_files.append(file_path)

    if missing_files:
        print(f"⚠️ Warning: {len(missing_files)} files not found:")
        for missing_file in missing_files[:5]:  # only show the first five
            print(f" - {missing_file}")
        if len(missing_files) > 5:
            print(f" ... and {len(missing_files) - 5} more")

    print(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
    return valid_files


def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
    """
    Read image paths from a CSV file, keeping rows with the requested status.

    The expected columns are ``image_path,status``. An ``image_path,status``
    header row is skipped if it is the first non-blank row. Blank lines, rows
    with fewer than two columns and rows with an empty path are ignored;
    columns after the second are ignored. Paths containing commas must be
    quoted as usual for CSV.

    Args:
        csv_file: Path of the CSV file.
        status_filter: Status value to keep (compared case-insensitively).

    Returns:
        Image paths whose status matches ``status_filter``, in file order.
    """
    print(f"📄 Reading image files from CSV: {csv_file}")

    wanted_status = status_filter.lower()
    image_files = []
    # newline='' is what the csv module requires for correct quoted-newline
    # handling; utf-8-sig transparently drops a BOM if one is present.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        first_row = True
        for row in csv.reader(f):
            if len(row) < 2:
                # Blank or malformed line: nothing usable on it.
                continue
            image_file = row[0].strip()
            status = row[1].strip().lower()

            if first_row:
                first_row = False
                if (image_file.lower(), status) == ("image_path", "status"):
                    # Header row, not data.
                    continue

            if image_file and status == wanted_status:
                image_files.append(image_file)

    return image_files


def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
    """
    Collect per-image results from a single process's JSON output file.

    Expected file format::

        {
          "results": [
            {
              "image_path": "docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.jpg",
              "processing_time": 2.0265579223632812e-06,
              "success": true,
              "device": "gpu:3",
              "output_json": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.json",
              "output_md": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.md"
            },
            ...
          ]
        }

    A missing file, a file that is not valid JSON, or JSON without a top-level
    ``"results"`` key produces a printed warning and an empty list. Entries in
    ``"results"`` that are not objects are skipped.

    Args:
        pid_output_file: Path of the process output JSON file.

    Returns:
        List of ``(image_path, status)`` tuples, where status is ``"success"``
        when the entry's ``"success"`` value is truthy and ``"fail"`` otherwise.
    """
    try:
        with open(pid_output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"⚠️ Warning: PID output file not found: {pid_output_file}")
        return []
    except json.JSONDecodeError:
        # e.g. a file that is truncated or still being written by its process.
        print(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    if not isinstance(data, dict) or "results" not in data:
        print(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    file_list = []
    for file_result in data.get("results", []):
        if not isinstance(file_result, dict):
            continue
        image_path = file_result.get("image_path", "")
        status = "success" if file_result.get("success", False) else "fail"
        file_list.append((image_path, status))
    return file_list