| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292 |
- import tempfile
- from pathlib import Path
- from typing import List, Tuple
- import json
- from .doc_utils import load_images_from_pdf
- import traceback
def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
    """
    Partition a list of file paths into at most ``num_splits`` contiguous chunks.

    Chunk sizes differ by at most one; the leading chunks absorb the
    remainder. Empty chunks are never returned, so fewer than ``num_splits``
    chunks come back when there are fewer files than splits.

    Args:
        file_list: File paths to distribute.
        num_splits: Desired number of chunks. A non-positive value returns
            the whole list wrapped as a single chunk.

    Returns:
        The non-empty chunks, in input order.
    """
    if num_splits <= 0:
        return [file_list]

    base_size, extra = divmod(len(file_list), num_splits)

    parts = []
    offset = 0
    for index in range(num_splits):
        # The first `extra` chunks each take one additional file.
        size = base_size + 1 if index < extra else base_size
        if size <= 0:
            continue
        parts.append(file_list[offset:offset + size])
        offset += size

    # Drop any empty chunk before handing the result back.
    return [part for part in parts if part]
def create_temp_file_list(file_chunk: List[str]) -> str:
    """
    Write a chunk of file paths to a temporary list file, one path per line.

    The file is created with ``delete=False``, so it outlives this call and
    the caller is responsible for removing it.

    Args:
        file_chunk: File paths to write.

    Returns:
        Path of the temporary ``.txt`` file that was created.
    """
    # Write explicitly as UTF-8: list files in this module are read back as
    # UTF-8, and the platform default encoding may be unable to represent
    # non-ASCII paths.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    ) as f:
        f.writelines(f"{file_path}\n" for file_path in file_chunk)
    return f.name
- def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int = None) -> List[str]:
- """
- 从目录获取图像文件列表
-
- Args:
- input_dir: 输入目录
- max_files: 最大文件数量限制
-
- Returns:
- 图像文件路径列表
- """
- image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
- image_files = []
-
- for ext in image_extensions:
- image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
- image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
- # 去重并排序
- image_files = sorted(list(set(str(f) for f in image_files)))
-
- # 限制文件数量
- if max_files:
- image_files = image_files[:max_files]
-
- return image_files
def get_image_files_from_list(file_list_path: str) -> List[str]:
    """
    Read image paths from a list file and keep only those that exist.

    The list file is read as UTF-8 with one path per line; blank lines are
    ignored. Paths that do not exist are reported on stdout (at most five
    are printed individually) and left out of the result.

    Args:
        file_list_path: Path of the list file.

    Returns:
        The listed paths that exist, in file order.
    """
    print(f"📄 Reading file list from: {file_list_path}")

    with open(file_list_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        image_files = [entry for entry in stripped_lines if entry]

    # Check each path once, then split into present / absent.
    checked = [(entry, Path(entry).exists()) for entry in image_files]
    valid_files = [entry for entry, present in checked if present]
    missing_files = [entry for entry, present in checked if not present]

    if missing_files:
        print(f"⚠️ Warning: {len(missing_files)} files not found:")
        # Only the first five missing paths are printed individually.
        for missing_file in missing_files[:5]:
            print(f" - {missing_file}")
        not_shown = len(missing_files) - 5
        if not_shown > 0:
            print(f" ... and {not_shown} more")

    print(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
    return valid_files
def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
    """
    Read image paths from a CSV file, keeping rows whose status matches.

    The expected layout is a header row ``image_path,status`` followed by
    one ``<path>,<status>`` row per image. The last column is the status and
    everything before it is the path, so paths containing commas are
    accepted whether or not they are quoted. The header row, blank lines and
    rows without both a path and a status are skipped.

    Args:
        csv_file: Path of the CSV file.
        status_filter: Status to keep; compared case-insensitively.

    Returns:
        Image paths whose status equals ``status_filter``, in file order.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    print(f"📄 Reading image files from CSV: {csv_file}")
    wanted_status = status_filter.lower()
    image_files = []

    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM if present;
    # newline='' is what the csv module requires for correct parsing.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row_index, row in enumerate(csv.reader(f)):
            if len(row) < 2:
                # Blank line, or a row with no status column.
                continue

            image_file = ",".join(row[:-1]).strip()
            status = row[-1].strip()
            if not image_file:
                continue

            # Skip the header explicitly rather than relying on its
            # "status" cell not matching the filter.
            if (
                row_index == 0
                and image_file.lower() == "image_path"
                and status.lower() == "status"
            ):
                continue

            if status.lower() == wanted_status:
                image_files.append(image_file)

    return image_files
def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
    """
    Collect per-image processing outcomes from one process's result file.

    The result file is JSON: an object with a ``"results"`` list whose
    entries look like::

        {
            "image_path": "some_page.jpg",
            "processing_time": 2.02e-06,
            "success": true,
            "device": "gpu:3",
            "output_json": ".../some_page.json",
            "output_md": ".../some_page.md"
        }

    Only ``image_path`` and ``success`` are read here.

    Args:
        pid_output_file: Path of the process result file.

    Returns:
        A list of ``(image_path, status)`` tuples where status is
        ``"success"`` when the entry's ``success`` value is truthy and
        ``"fail"`` otherwise. An empty list is returned, after printing a
        warning, when the file is missing or is not an object containing
        ``"results"``.
    """
    if not Path(pid_output_file).exists():
        print(f"⚠️ Warning: PID output file not found: {pid_output_file}")
        return []

    with open(pid_output_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    has_results = isinstance(payload, dict) and "results" in payload
    if not has_results:
        print(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
        return []

    # Map every entry to (path, "success" | "fail").
    return [
        (
            entry.get("image_path", ""),
            "success" if entry.get("success", False) else "fail",
        )
        for entry in payload["results"]
    ]
- def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
- """
- 将PDF转换为图像文件
-
- Args:
- pdf_file: PDF文件路径
- output_dir: 输出目录
- dpi: 图像分辨率
-
- Returns:
- 生成的图像文件路径列表
- """
- pdf_path = Path(pdf_file)
- if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
- print(f"❌ Invalid PDF file: {pdf_path}")
- return []
- # 如果没有指定输出目录,使用PDF同名目录
- if output_dir is None:
- output_path = pdf_path.parent / f"{pdf_path.stem}"
- else:
- output_path = Path(output_dir) / f"{pdf_path.stem}"
- output_path = output_path.resolve()
- output_path.mkdir(parents=True, exist_ok=True)
- try:
- # 使用doc_utils中的函数加载PDF图像
- images = load_images_from_pdf(str(pdf_path), dpi=dpi)
-
- image_paths = []
- for i, image in enumerate(images):
- # 生成图像文件名
- image_filename = f"{pdf_path.stem}_page_{i+1:03d}.png"
- image_path = output_path / image_filename
- # 保存图像
- image.save(str(image_path))
- image_paths.append(str(image_path))
-
- print(f"✅ Converted {len(images)} pages from {pdf_path.name} to images")
- return image_paths
-
- except Exception as e:
- print(f"❌ Error converting PDF {pdf_path}: {e}")
- traceback.print_exc()
- return []
def get_input_files(args) -> List[str]:
    """
    Build the list of image files to process, converting PDFs to images.

    The input source is chosen in this order: ``args.input_csv`` (rows with
    status "fail"), ``args.input_file_list``, ``args.input_file``, and
    finally every supported file directly inside ``args.input_dir``.
    PDF inputs are rendered to images via ``convert_pdf_to_images`` using
    ``args.output_dir`` and ``args.pdf_dpi``; any other input is kept as-is
    if it exists on disk.

    Args:
        args: Parsed command-line arguments providing ``input_csv``,
            ``input_file_list``, ``input_file``, ``input_dir``,
            ``output_dir`` and ``pdf_dpi``.

    Returns:
        Sorted, de-duplicated list of image file paths. Empty when the input
        directory does not exist.
    """
    # Gather the raw inputs from whichever source was supplied.
    if args.input_csv:
        raw_files = get_image_files_from_csv(args.input_csv, "fail")
    elif args.input_file_list:
        raw_files = get_image_files_from_list(args.input_file_list)
    elif args.input_file:
        raw_files = [Path(args.input_file).resolve()]
    else:
        input_dir = Path(args.input_dir).resolve()
        if not input_dir.exists():
            print(f"❌ Input directory does not exist: {input_dir}")
            return []

        # Supported inputs: images and PDFs.
        supported_extensions = {
            '.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.pdf',
        }
        # One scan with a case-insensitive suffix test: no duplicates on
        # case-insensitive filesystems, and mixed-case extensions match.
        raw_files = sorted(
            entry for entry in input_dir.glob("*")
            if entry.suffix.lower() in supported_extensions
        )

    # Drop repeated entries (keeping first-seen order) so that a PDF listed
    # twice is converted only once and the counts below are accurate.
    unique_files = list(dict.fromkeys(str(f) for f in raw_files))

    input_files = []
    pdf_count = 0
    image_count = 0
    missing_count = 0

    for raw_path in unique_files:
        file_path = Path(raw_path)

        if file_path.suffix.lower() == '.pdf':
            # Render the PDF pages to image files.
            print(f"📄 Processing PDF: {file_path.name}")
            pdf_images = convert_pdf_to_images(
                str(file_path),
                args.output_dir,
                dpi=args.pdf_dpi,
            )
            input_files.extend(pdf_images)
            pdf_count += 1
        elif file_path.exists():
            # Image files are passed through unchanged.
            input_files.append(str(file_path))
            image_count += 1
        else:
            missing_count += 1

    print("📊 Input summary:")
    print(f" PDF files processed: {pdf_count}")
    print(f" Image files found: {image_count}")
    if missing_count:
        # Report dropped inputs instead of discarding them silently.
        print(f" Image files missing (skipped): {missing_count}")
    print(f" Total image files to process: {len(input_files)}")

    return sorted({str(f) for f in input_files})
|