# file_utils.py
import csv
import json
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple
  5. def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
  6. """
  7. 将文件列表分割成指定数量的子列表
  8. Args:
  9. file_list: 文件路径列表
  10. num_splits: 分割数量
  11. Returns:
  12. 分割后的文件列表
  13. """
  14. if num_splits <= 0:
  15. return [file_list]
  16. chunk_size = len(file_list) // num_splits
  17. remainder = len(file_list) % num_splits
  18. chunks = []
  19. start = 0
  20. for i in range(num_splits):
  21. # 前remainder个chunk多分配一个文件
  22. current_chunk_size = chunk_size + (1 if i < remainder else 0)
  23. if current_chunk_size > 0:
  24. chunks.append(file_list[start:start + current_chunk_size])
  25. start += current_chunk_size
  26. return [chunk for chunk in chunks if chunk] # 过滤空列表
  27. def create_temp_file_list(file_chunk: List[str]) -> str:
  28. """
  29. 创建临时文件列表文件
  30. Args:
  31. file_chunk: 文件路径列表
  32. Returns:
  33. 临时文件路径
  34. """
  35. with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
  36. for file_path in file_chunk:
  37. f.write(f"{file_path}\n")
  38. return f.name
  39. def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int = None) -> List[str]:
  40. """
  41. 从目录获取图像文件列表
  42. Args:
  43. input_dir: 输入目录
  44. max_files: 最大文件数量限制
  45. Returns:
  46. 图像文件路径列表
  47. """
  48. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  49. image_files = []
  50. for ext in image_extensions:
  51. image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
  52. image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
  53. # 去重并排序
  54. image_files = sorted(list(set(str(f) for f in image_files)))
  55. # 限制文件数量
  56. if max_files:
  57. image_files = image_files[:max_files]
  58. return image_files
  59. def get_image_files_from_list(file_list_path: str) -> List[str]:
  60. """
  61. 从文件列表获取图像文件列表
  62. Args:
  63. file_list_path: 文件列表路径
  64. Returns:
  65. 图像文件路径列表
  66. """
  67. print(f"📄 Reading file list from: {file_list_path}")
  68. with open(file_list_path, 'r', encoding='utf-8') as f:
  69. image_files = [line.strip() for line in f if line.strip()]
  70. # 验证文件存在性
  71. valid_files = []
  72. missing_files = []
  73. for file_path in image_files:
  74. if Path(file_path).exists():
  75. valid_files.append(file_path)
  76. else:
  77. missing_files.append(file_path)
  78. if missing_files:
  79. print(f"⚠️ Warning: {len(missing_files)} files not found:")
  80. for missing_file in missing_files[:5]: # 只显示前5个
  81. print(f" - {missing_file}")
  82. if len(missing_files) > 5:
  83. print(f" ... and {len(missing_files) - 5} more")
  84. print(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
  85. return valid_files
  86. def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
  87. """
  88. 从CSV文件获取图像文件列表
  89. Args:
  90. csv_file: CSV文件路径
  91. status_filter: 状态过滤器
  92. Returns:
  93. 图像文件路径列表
  94. """
  95. print(f"📄 Reading image files from CSV: {csv_file}")
  96. # 读取CSV文件, 表头:image_path,status
  97. image_files = []
  98. with open(csv_file, 'r', encoding='utf-8') as f:
  99. for line in f:
  100. # 需要去掉表头, 按“,”分割,读取文件名,状态
  101. image_file, status = line.strip().split(",")
  102. if status.lower() == status_filter.lower():
  103. image_files.append(image_file)
  104. return image_files
  105. def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
  106. """
  107. 从进程输出文件中收集文件
  108. Args:
  109. pid_output_file: 进程输出文件路径
  110. Returns:
  111. 文件列表(文件路径,处理结果)
  112. """
  113. """
  114. 单进程结果统计文件格式
  115. "results": [
  116. {
  117. "image_path": "docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.jpg",
  118. "processing_time": 2.0265579223632812e-06,
  119. "success": true,
  120. "device": "gpu:3",
  121. "output_json": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.json",
  122. "output_md": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.md"
  123. },
  124. ...
  125. """
  126. if not Path(pid_output_file).exists():
  127. print(f"⚠️ Warning: PID output file not found: {pid_output_file}")
  128. return []
  129. with open(pid_output_file, 'r', encoding='utf-8') as f:
  130. data = json.load(f)
  131. if not isinstance(data, dict) or "results" not in data:
  132. print(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
  133. return []
  134. # 返回文件路径和处理状态, 如果“success”: True, 则状态为“success”, 否则为“fail”
  135. file_list = []
  136. for file_result in data.get("results", []):
  137. image_path = file_result.get("image_path", "")
  138. status = "success" if file_result.get("success", False) else "fail"
  139. file_list.append((image_path, status))
  140. return file_list