file_utils.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. """
  2. 文件处理工具模块
  3. 提供文件处理相关功能:
  4. - 输入文件获取(支持文件/目录/列表/CSV)
  5. - PDF转图片
  6. - 文件列表处理
  7. """
  8. import tempfile
  9. import re
  10. from pathlib import Path
  11. from typing import List, Tuple, Optional, Set
  12. import json
  13. import traceback
  14. from loguru import logger
  15. try:
  16. from mineru.utils.pdf_image_tools import load_images_from_pdf
  17. from mineru.utils.enum_class import ImageType
  18. MINERU_AVAILABLE = True
  19. except ImportError:
  20. MINERU_AVAILABLE = False
  21. load_images_from_pdf = None
  22. ImageType = None
  23. def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
  24. """
  25. 解析页面范围字符串
  26. 支持格式:
  27. - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
  28. - "3" → {2}
  29. - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
  30. - "1-" → 从第1页到最后
  31. - "-5" → 从第1页到第5页
  32. Args:
  33. page_range: 页面范围字符串(页码从1开始)
  34. total_pages: 总页数
  35. Returns:
  36. 页面索引集合(0-based)
  37. """
  38. if not page_range or not page_range.strip():
  39. return set(range(total_pages))
  40. pages = set()
  41. parts = page_range.replace(' ', '').split(',')
  42. for part in parts:
  43. part = part.strip()
  44. if not part:
  45. continue
  46. if '-' in part:
  47. # 范围格式
  48. match = re.match(r'^(\d*)-(\d*)$', part)
  49. if match:
  50. start_str, end_str = match.groups()
  51. start = int(start_str) if start_str else 1
  52. end = int(end_str) if end_str else total_pages
  53. # 转换为 0-based 索引
  54. start = max(0, start - 1)
  55. end = min(total_pages, end)
  56. pages.update(range(start, end))
  57. else:
  58. # 单页
  59. try:
  60. page_num = int(part)
  61. if 1 <= page_num <= total_pages:
  62. pages.add(page_num - 1) # 转换为 0-based 索引
  63. except ValueError:
  64. logger.warning(f"Invalid page number: {part}")
  65. return pages
  66. def split_files(file_list: List[str], num_splits: int) -> List[List[str]]:
  67. """
  68. 将文件列表分割成指定数量的子列表
  69. Args:
  70. file_list: 文件路径列表
  71. num_splits: 分割数量
  72. Returns:
  73. 分割后的文件列表
  74. """
  75. if num_splits <= 0:
  76. return [file_list]
  77. chunk_size = len(file_list) // num_splits
  78. remainder = len(file_list) % num_splits
  79. chunks = []
  80. start = 0
  81. for i in range(num_splits):
  82. # 前remainder个chunk多分配一个文件
  83. current_chunk_size = chunk_size + (1 if i < remainder else 0)
  84. if current_chunk_size > 0:
  85. chunks.append(file_list[start:start + current_chunk_size])
  86. start += current_chunk_size
  87. return [chunk for chunk in chunks if chunk] # 过滤空列表
  88. def create_temp_file_list(file_chunk: List[str]) -> str:
  89. """
  90. 创建临时文件列表文件
  91. Args:
  92. file_chunk: 文件路径列表
  93. Returns:
  94. 临时文件路径
  95. """
  96. with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
  97. for file_path in file_chunk:
  98. f.write(f"{file_path}\n")
  99. return f.name
  100. def get_image_files_from_dir(input_dir: Path, pattern: str = "*", max_files: int | None = None) -> List[str]:
  101. """
  102. 从目录获取图像文件列表
  103. Args:
  104. input_dir: 输入目录
  105. pattern: 文件名模式
  106. max_files: 最大文件数量限制
  107. Returns:
  108. 图像文件路径列表
  109. """
  110. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  111. image_files = []
  112. for ext in image_extensions:
  113. image_files.extend(list(input_dir.glob(f"{pattern}{ext}")))
  114. image_files.extend(list(input_dir.glob(f"{pattern}{ext.upper()}")))
  115. # 去重并排序
  116. image_files = sorted(list(set(str(f) for f in image_files)))
  117. # 限制文件数量
  118. if max_files:
  119. image_files = image_files[:max_files]
  120. return image_files
  121. def get_image_files_from_list(file_list_path: str) -> List[str]:
  122. """
  123. 从文件列表获取图像文件列表
  124. Args:
  125. file_list_path: 文件列表路径
  126. Returns:
  127. 图像文件路径列表
  128. """
  129. logger.info(f"📄 Reading file list from: {file_list_path}")
  130. with open(file_list_path, 'r', encoding='utf-8') as f:
  131. image_files = [line.strip() for line in f if line.strip()]
  132. # 验证文件存在性
  133. valid_files = []
  134. missing_files = []
  135. for file_path in image_files:
  136. if Path(file_path).exists():
  137. valid_files.append(file_path)
  138. else:
  139. missing_files.append(file_path)
  140. if missing_files:
  141. logger.warning(f"⚠️ Warning: {len(missing_files)} files not found:")
  142. for missing_file in missing_files[:5]: # 只显示前5个
  143. logger.warning(f" - {missing_file}")
  144. if len(missing_files) > 5:
  145. logger.warning(f" ... and {len(missing_files) - 5} more")
  146. logger.info(f"✅ Found {len(valid_files)} valid files out of {len(image_files)} in list")
  147. return valid_files
  148. def get_image_files_from_csv(csv_file: str, status_filter: str = "fail") -> List[str]:
  149. """
  150. 从CSV文件获取图像文件列表
  151. Args:
  152. csv_file: CSV文件路径
  153. status_filter: 状态过滤器
  154. Returns:
  155. 图像文件路径列表
  156. """
  157. logger.info(f"📄 Reading image files from CSV: {csv_file}")
  158. # 读取CSV文件, 表头:image_path,status
  159. image_files = []
  160. with open(csv_file, 'r', encoding='utf-8') as f:
  161. for line in f:
  162. # 需要去掉表头, 按","分割,读取文件名,状态
  163. parts = line.strip().split(",")
  164. if len(parts) >= 2:
  165. image_file, status = parts[0], parts[1]
  166. if status.lower() == status_filter.lower():
  167. image_files.append(image_file)
  168. return image_files
  169. def collect_pid_files(pid_output_file: str) -> List[Tuple[str, str]]:
  170. """
  171. 从进程输出文件中收集文件
  172. Args:
  173. pid_output_file: 进程输出文件路径
  174. Returns:
  175. 文件列表(文件路径,处理结果)
  176. """
  177. """
  178. 单进程结果统计文件格式
  179. "results": [
  180. {
  181. "image_path": "docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.jpg",
  182. "processing_time": 2.0265579223632812e-06,
  183. "success": true,
  184. "device": "gpu:3",
  185. "output_json": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.json",
  186. "output_md": "/home/ubuntu/zhch/PaddleX/zhch/OmniDocBench_Results_Scheduler/process_3/docstructbench_dianzishu_zhongwenzaixian-o.O-61520612.pdf_140.md"
  187. },
  188. ...
  189. """
  190. if not Path(pid_output_file).exists():
  191. logger.warning(f"⚠️ Warning: PID output file not found: {pid_output_file}")
  192. return []
  193. with open(pid_output_file, 'r', encoding='utf-8') as f:
  194. data = json.load(f)
  195. if not isinstance(data, dict) or "results" not in data:
  196. logger.warning(f"⚠️ Warning: Invalid PID output file format: {pid_output_file}")
  197. return []
  198. # 返回文件路径和处理状态, 如果"success": True, 则状态为"success", 否则为"fail"
  199. file_list = []
  200. for file_result in data.get("results", []):
  201. image_path = file_result.get("image_path", "")
  202. status = "success" if file_result.get("success", False) else "fail"
  203. file_list.append((image_path, status))
  204. return file_list
  205. def convert_pdf_to_images(
  206. pdf_file: str,
  207. output_dir: str | None = None,
  208. dpi: int = 200,
  209. page_range: str | None = None
  210. ) -> List[str]:
  211. """
  212. 将PDF转换为图像文件,支持页面范围过滤
  213. Args:
  214. pdf_file: PDF文件路径
  215. output_dir: 输出目录
  216. dpi: 图像分辨率
  217. page_range: 页面范围字符串,如 "1-5,7,9-12"
  218. Returns:
  219. 生成的图像文件路径列表
  220. """
  221. pdf_path = Path(pdf_file)
  222. if not pdf_path.exists() or pdf_path.suffix.lower() != '.pdf':
  223. logger.error(f"❌ Invalid PDF file: {pdf_path}")
  224. return []
  225. # 如果没有指定输出目录,使用PDF同名目录
  226. if output_dir is None:
  227. output_path = pdf_path.parent / f"{pdf_path.stem}"
  228. else:
  229. output_path = Path(output_dir) / f"{pdf_path.stem}"
  230. output_path = output_path.resolve()
  231. output_path.mkdir(parents=True, exist_ok=True)
  232. try:
  233. # 优先使用 MinerU 的函数(如果可用)
  234. if MINERU_AVAILABLE and load_images_from_pdf is not None and ImageType is not None:
  235. images, _ = load_images_from_pdf(
  236. pdf_path.read_bytes(),
  237. dpi=dpi,
  238. image_type=ImageType.PIL # 返回包含 img_pil 的字典列表
  239. )
  240. # 应用页面范围过滤
  241. selected_pages = None
  242. if page_range:
  243. total_pages = len(images)
  244. selected_pages = parse_page_range(page_range, total_pages)
  245. if selected_pages:
  246. images = [images[i] for i in sorted(selected_pages)]
  247. logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(images)} 页")
  248. else:
  249. logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
  250. return []
  251. else:
  252. selected_pages = None
  253. image_paths = []
  254. # 需要跟踪原始页码索引,以便正确命名文件
  255. original_indices = sorted(selected_pages) if selected_pages else list(range(len(images)))
  256. for idx, image in enumerate(images):
  257. # 获取原始页码索引(用于文件命名)
  258. original_idx = original_indices[idx] if selected_pages else idx
  259. # 生成图像文件名(使用原始页码,从1开始)
  260. image_filename = f"{pdf_path.stem}_page_{original_idx + 1:03d}.png"
  261. image_path = output_path / image_filename
  262. # 保存图像 - 从字典中提取 img_pil
  263. if isinstance(image, dict):
  264. pil_image = image.get('img_pil')
  265. if pil_image is None:
  266. logger.error(f"❌ Image dict at index {idx} does not contain 'img_pil' key")
  267. continue
  268. pil_image.save(str(image_path))
  269. else:
  270. # 如果不是字典,假设是直接的 PIL Image
  271. image.save(str(image_path))
  272. image_paths.append(str(image_path))
  273. logger.info(f"✅ Converted {len(images)} pages from {pdf_path.name} to images (using MinerU)")
  274. return image_paths
  275. else:
  276. # Fallback: 使用 pypdfium2(PaddleX 环境中可用)
  277. logger.info("ℹ️ MinerU 不可用,使用 pypdfium2 进行 PDF 转图像")
  278. try:
  279. import pypdfium2 as pdfium
  280. except ImportError:
  281. logger.error("❌ pypdfium2 未安装,无法转换 PDF。请安装: pip install pypdfium2")
  282. return []
  283. pdf_doc = pdfium.PdfDocument(pdf_path)
  284. try:
  285. total_pages = len(pdf_doc)
  286. # 解析页面范围(使用本地函数,不依赖 PDFUtils)
  287. selected_pages = parse_page_range(page_range, total_pages)
  288. if not selected_pages:
  289. logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效页面")
  290. return []
  291. if page_range:
  292. logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
  293. # 计算缩放比例(DPI 转换)
  294. # pypdfium2 的 scale 参数:1.0 = 72 DPI,所以 dpi/72 = scale
  295. scale = dpi / 72.0
  296. image_paths = []
  297. for page_idx in sorted(selected_pages):
  298. page = pdf_doc[page_idx]
  299. # 渲染页面为图像
  300. bitmap = page.render(scale=scale)
  301. pil_image = bitmap.to_pil()
  302. # 生成图像文件名(页码从1开始)
  303. image_filename = f"{pdf_path.stem}_page_{page_idx + 1:03d}.png"
  304. image_path = output_path / image_filename
  305. # 保存图像
  306. pil_image.save(str(image_path))
  307. image_paths.append(str(image_path))
  308. logger.info(f"✅ Converted {len(image_paths)} pages from {pdf_path.name} to images (using pypdfium2)")
  309. return image_paths
  310. finally:
  311. pdf_doc.close()
  312. except Exception as e:
  313. logger.error(f"❌ Error converting PDF {pdf_path}: {e}")
  314. traceback.print_exc()
  315. return []
  316. def get_input_files(args, page_range: str | None = None) -> List[str]:
  317. """
  318. 获取输入文件列表,统一处理PDF和图像文件,支持页面范围过滤
  319. 支持自动判断输入类型:
  320. - 如果是文件路径,判断是PDF还是图片
  321. - 如果是目录,扫描所有PDF和图片文件
  322. - 如果是CSV文件,读取文件列表
  323. - 如果是文本文件,读取文件列表
  324. Args:
  325. args: 命令行参数对象,需要包含 input, output_dir, pdf_dpi 属性
  326. page_range: 页面范围字符串(可选),如 "1-5,7,9-12"
  327. Returns:
  328. 处理后的图像文件路径列表
  329. """
  330. input_files = []
  331. input_path = Path(args.input)
  332. if not input_path.exists():
  333. logger.error(f"❌ Input path does not exist: {input_path}")
  334. return []
  335. # 判断输入类型
  336. if input_path.is_file():
  337. # 单个文件
  338. if input_path.suffix.lower() == '.pdf':
  339. # PDF文件:转换为图片
  340. logger.info(f"📄 Processing PDF: {input_path.name}")
  341. pdf_images = convert_pdf_to_images(
  342. str(input_path),
  343. getattr(args, 'output_dir', None),
  344. dpi=getattr(args, 'pdf_dpi', 200),
  345. page_range=page_range # 传递页面范围参数
  346. )
  347. input_files.extend(pdf_images)
  348. elif input_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']:
  349. # 图片文件:直接添加
  350. input_files.append(str(input_path))
  351. elif input_path.suffix.lower() == '.csv':
  352. # CSV文件:读取文件列表
  353. input_files = get_image_files_from_csv(str(input_path), "fail")
  354. elif input_path.suffix.lower() in ['.txt', '.list']:
  355. # 文本文件:读取文件列表
  356. input_files = get_image_files_from_list(str(input_path))
  357. else:
  358. logger.warning(f"⚠️ Unsupported file type: {input_path.suffix}")
  359. elif input_path.is_dir():
  360. # 目录:扫描所有PDF和图片文件
  361. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  362. pdf_extensions = ['.pdf']
  363. raw_files = []
  364. for ext in image_extensions + pdf_extensions:
  365. raw_files.extend(list(input_path.glob(f"*{ext}")))
  366. raw_files.extend(list(input_path.glob(f"*{ext.upper()}")))
  367. # 分离PDF和图像文件
  368. pdf_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() == '.pdf']
  369. image_files = [f for f in sorted(set(raw_files)) if f.suffix.lower() in image_extensions]
  370. # 对于图片目录,应用页面范围过滤
  371. if page_range and image_files:
  372. total_pages = len(image_files)
  373. selected_pages = parse_page_range(page_range, total_pages)
  374. if selected_pages:
  375. image_files = [image_files[i] for i in sorted(selected_pages)]
  376. logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(image_files)} 张")
  377. else:
  378. logger.warning(f"⚠️ 页面范围 '{page_range}' 没有匹配到任何有效图片")
  379. image_files = []
  380. # 分别处理PDF和图像文件
  381. pdf_count = 0
  382. image_count = 0
  383. for file_path in pdf_files:
  384. # 转换PDF为图像
  385. logger.info(f"📄 Processing PDF: {file_path.name}")
  386. pdf_images = convert_pdf_to_images(
  387. str(file_path),
  388. getattr(args, 'output_dir', None),
  389. dpi=getattr(args, 'pdf_dpi', 200),
  390. page_range=page_range # 传递页面范围参数
  391. )
  392. input_files.extend(pdf_images)
  393. pdf_count += 1
  394. for file_path in image_files:
  395. # 直接添加图像文件
  396. input_files.append(str(file_path))
  397. image_count += 1
  398. logger.info(f"📊 Input summary:")
  399. logger.info(f" PDF files processed: {pdf_count}")
  400. logger.info(f" Image files found: {image_count}")
  401. logger.info(f"📊 Total image files to process: {len(input_files)}")
  402. return sorted(list(set(str(f) for f in input_files)))