file_utils.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
import csv
import tempfile
from pathlib import Path
from typing import List, Optional
  4. def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
  5. """
  6. 将文件列表分割成指定数量的子列表
  7. Args:
  8. file_list: 文件路径列表
  9. num_splits: 分割数量
  10. Returns:
  11. 分割后的文件列表
  12. """
  13. if num_splits <= 0:
  14. return [file_list]
  15. chunk_size = len(file_list) // num_splits
  16. remainder = len(file_list) % num_splits
  17. chunks = []
  18. start = 0
  19. for i in range(num_splits):
  20. # 前remainder个chunk多分配一个文件
  21. current_chunk_size = chunk_size + (1 if i < remainder else 0)
  22. if current_chunk_size > 0:
  23. chunks.append(file_list[start:start + current_chunk_size])
  24. start += current_chunk_size
  25. return [chunk for chunk in chunks if chunk] # 过滤空列表
  26. def create_temp_file_list(file_chunk: List[str]) -> str:
  27. """
  28. 创建临时文件列表文件
  29. Args:
  30. file_chunk: 文件路径列表
  31. Returns:
  32. 临时文件路径
  33. """
  34. with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
  35. for file_path in file_chunk:
  36. f.write(f"{file_path}\n")
  37. return f.name
  38. def get_image_files_from_dir(input_dir: Path, max_files: int = None) -> List[str]:
  39. """
  40. 从目录获取图像文件列表
  41. Args:
  42. input_dir: 输入目录
  43. max_files: 最大文件数量限制
  44. Returns:
  45. 图像文件路径列表
  46. """
  47. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  48. image_files = []
  49. for ext in image_extensions:
  50. image_files.extend(list(input_dir.glob(f"*{ext}")))
  51. image_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
  52. # 去重并排序
  53. image_files = sorted(list(set(str(f) for f in image_files)))
  54. # 限制文件数量
  55. if max_files:
  56. image_files = image_files[:max_files]
  57. return image_files
  58. def get_image_files_from_list(file_list_path: str) -> List[str]:
  59. """
  60. 从文件列表获取图像文件列表
  61. Args:
  62. file_list_path: 文件列表路径
  63. Returns:
  64. 图像文件路径列表
  65. """
  66. print(f"📄 Reading file list from: {file_list_path}")
  67. with open(file_list_path, 'r', encoding='utf-8') as f:
  68. image_files = [line.strip() for line in f if line.strip()]
  69. # 验证文件存在性
  70. valid_files = []
  71. missing_files = []
  72. for file_path in image_files:
  73. if Path(file_path).exists():
  74. valid_files.append(file_path)
  75. else:
  76. missing_files.append(file_path)
  77. if missing_files:
  78. print(f"⚠️ Warning: {len(missing_files)} files not found:")
  79. for missing_file in missing_files[:5]: # 只显示前5个
  80. print(f" - {missing_file}")
  81. if len(missing_files) > 5:
  82. print(f" ... and {len(missing_files) - 5} more")
  83. print(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
  84. return valid_files
  85. def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
  86. """
  87. 从CSV文件获取图像文件列表
  88. Args:
  89. csv_file: CSV文件路径
  90. status_filter: 状态过滤器
  91. Returns:
  92. 图像文件路径列表
  93. """
  94. print(f"📄 Reading image files from CSV: {csv_file}")
  95. # 读取CSV文件, 表头:image_path,status
  96. image_files = []
  97. with open(csv_file, 'r', encoding='utf-8') as f:
  98. for line in f:
  99. # 需要去掉表头, 按“,”分割,读取文件名,状态
  100. image_file, status = line.strip().split(",")
  101. if status.lower() == status_filter.lower():
  102. image_files.append(image_file)
  103. return image_files