  1. """
  2. 批量处理图片/PDF文件并生成符合评测要求的预测结果(MinerU版本)
  3. 根据 MinerU demo.py 框架调用方式:
  4. - 输入:支持 PDF 和各种图片格式
  5. - 输出:每个文件对应的 .md、.json 文件,所有图片保存为单独的图片文件
  6. - 调用方式:通过 vlm-http-client 连接到 MinerU vLLM 服务器
  7. """
import os
import sys
import json
import copy
import shutil
import time
import traceback
from pathlib import Path
from typing import List, Dict, Any, Optional

from PIL import Image
from tqdm import tqdm
import argparse
from loguru import logger

# MinerU core components (cf. demo.py)
from mineru.cli.common import read_fn, convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

# Utility helpers
from utils import (
    get_input_files,
    collect_pid_files,
    normalize_markdown_table,
    normalize_json_table,
)
class MinerUVLLMProcessor:
    """MinerU vLLM processor (built on the demo.py framework)."""

    def __init__(self,
                 server_url: str = "http://127.0.0.1:8121",
                 timeout: int = 300,
                 normalize_numbers: bool = False,
                 debug: bool = False):
        """
        Initialize the processor.

        Args:
            server_url: vLLM server address
            timeout: request timeout in seconds
            normalize_numbers: whether to normalize full-width digits
            debug: whether to enable debug mode
        """
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout
        self.normalize_numbers = normalize_numbers
        self.debug = debug
        self.backend = "http-client"  # fixed to the http-client backend

        print("MinerU vLLM Processor initialized:")
        print(f"  - Server: {server_url}")
        print(f"  - Backend: vlm-{self.backend}")
        print(f"  - Timeout: {timeout}s")
        print(f"  - Number normalization: {normalize_numbers}")
        print(f"  - Debug mode: {debug}")
    def do_parse_single_file(self,
                             input_file: str,
                             output_dir: str,
                             start_page_id: int = 0,
                             end_page_id: Optional[int] = None) -> Dict[str, Any]:
        """
        Parse a single file (cf. the do_parse function in demo.py).

        Args:
            input_file: path to the input file
            output_dir: output directory
            start_page_id: first page to parse
            end_page_id: last page to parse (None = through the end)
        Returns:
            dict: processing result
        """
        try:
            # Prepare the file name and raw bytes
            file_path = Path(input_file)
            pdf_file_name = file_path.stem
            pdf_bytes = read_fn(str(file_path))

            # Re-encode the PDF byte stream (if needed)
            if file_path.suffix.lower() == '.pdf':
                pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
                    pdf_bytes, start_page_id, end_page_id
                )

            # Prepare the environment (create output directories)
            # parse_method = "vlm"
            # local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            local_md_dir = Path(output_dir).resolve()
            local_image_dir = local_md_dir / "images"
            image_writer = FileBasedDataWriter(local_image_dir.as_posix())
            md_writer = FileBasedDataWriter(local_md_dir.as_posix())

            # Analyze the document with the VLM (the core call)
            middle_json, model_output = vlm_doc_analyze(
                pdf_bytes,
                image_writer=image_writer,
                backend=self.backend,
                server_url=self.server_url
            )
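            # As used below, middle_json carries the per-page layout data under
            # the "pdf_info" key, and model_output is the raw VLM response
            # (saved verbatim in debug mode).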
            pdf_info = middle_json["pdf_info"]

            # Assemble the outputs (cf. demo.py's _process_output)
            output_files = self._process_output(
                pdf_info=pdf_info,
                pdf_bytes=pdf_bytes,
                pdf_file_name=pdf_file_name,
                local_md_dir=local_md_dir,
                local_image_dir=local_image_dir,
                md_writer=md_writer,
                middle_json=middle_json,
                model_output=model_output,
                original_file_path=str(file_path)
            )

            # Gather extraction statistics
            extraction_stats = self._get_extraction_stats(middle_json)

            return {
                "success": True,
                "pdf_info": pdf_info,
                "middle_json": middle_json,
                "model_output": model_output,
                "output_files": output_files,
                "extraction_stats": extraction_stats
            }
        except Exception as e:
            logger.error(f"Failed to process {input_file}: {e}")
            return {
                "success": False,
                "error": str(e)
            }
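    # Minimal usage sketch (hypothetical paths; assumes a MinerU vLLM server is
    # reachable at the given URL):
    #   proc = MinerUVLLMProcessor(server_url="http://127.0.0.1:8121")
    #   result = proc.do_parse_single_file("sample.pdf", "./output")
    #   if result["success"]:
    #       print(result["output_files"], result["extraction_stats"])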
    def _process_output(self,
                        pdf_info,
                        pdf_bytes,
                        pdf_file_name,
                        local_md_dir,
                        local_image_dir,
                        md_writer,
                        middle_json,
                        model_output,
                        original_file_path: str) -> Dict[str, str]:
        """
        Write the output files (an adapted version of demo.py's _process_output).

        Args:
            pdf_info: per-page PDF layout info
            pdf_bytes: raw PDF bytes
            pdf_file_name: PDF file stem
            local_md_dir: Markdown output directory
            local_image_dir: image output directory
            md_writer: Markdown writer
            middle_json: intermediate JSON data
            model_output: raw model output
            original_file_path: path of the original input file
        Returns:
            dict: paths of the saved files
        """
        saved_files = {}
        try:
            # Relative image directory name referenced from the Markdown
            image_dir = str(os.path.basename(local_image_dir))

            # 1. Generate and save the Markdown file
            md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)

            # Number normalization (full-width -> half-width)
            if self.normalize_numbers:
                original_md = md_content_str
                md_content_str = normalize_markdown_table(md_content_str)
                # Position-wise character diff; a rough count that works because
                # the full-width -> half-width mapping preserves string length.
                changes_count = sum(1 for o, n in zip(original_md, md_content_str) if o != n)
                if changes_count > 0:
                    saved_files['md_normalized'] = f"✅ Normalized {changes_count} characters (full-width -> half-width)"
                else:
                    saved_files['md_normalized'] = "ℹ️ Nothing to normalize (already in standard form)"
            md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
            saved_files['md'] = os.path.join(local_md_dir, f"{pdf_file_name}.md")
            # 2. Generate and save the content_list JSON file
            content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
            content_list_str = json.dumps(content_list, ensure_ascii=False, indent=2)
            md_writer.write_string(f"{pdf_file_name}_original.json", content_list_str)

            # Rescale the content_list bboxes from the model's normalized
            # 0-1000 space back to image pixel coordinates (rather than PDF
            # coordinates). Each input is a single page, so page 0's
            # width/height are used directly.
            page_width, page_height = pdf_info[0].get('page_size')
            for element in content_list:
                if "bbox" in element:
                    x0, y0, x1, y1 = element["bbox"]
                    element["bbox"] = [
                        int(x0 / 1000 * page_width),
                        int(y0 / 1000 * page_height),
                        int(x1 / 1000 * page_width),
                        int(y1 / 1000 * page_height),
                    ]
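            # Worked example (illustrative values): a normalized bbox
            # [100, 50, 900, 500] on a 1240x1754 px page maps to
            # [124, 87, 1116, 877] in image pixel coordinates.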
            content_list_str = json.dumps(content_list, ensure_ascii=False, indent=2)

            # Number normalization (full-width -> half-width)
            if self.normalize_numbers:
                original_json = content_list_str
                content_list_str = normalize_json_table(content_list_str)
                changes_count = sum(1 for o, n in zip(original_json, content_list_str) if o != n)
                if changes_count > 0:
                    saved_files['json_normalized'] = f"✅ Normalized {changes_count} characters (full-width -> half-width)"
                else:
                    saved_files['json_normalized'] = "ℹ️ Nothing to normalize (already in standard form)"
            md_writer.write_string(f"{pdf_file_name}.json", content_list_str)
            saved_files['json'] = os.path.join(local_md_dir, f"{pdf_file_name}.json")

            # Draw layout bounding boxes
            try:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
                saved_files['layout_pdf'] = os.path.join(local_md_dir, f"{pdf_file_name}_layout.pdf")
            except Exception as e:
                logger.warning(f"Failed to draw layout bbox: {e}")

            # 3. Copy the original file into the images directory
            # original_file_path = Path(original_file_path)
            # output_image_path = os.path.join(local_image_dir, f"{pdf_file_name}{original_file_path.suffix}")
            # if original_file_path.exists():
            #     shutil.copy2(str(original_file_path), output_image_path)
            #     saved_files['image'] = output_image_path

            # 4. Save extra artifacts in debug mode
            if self.debug:
                # Save middle.json
                middle_json_str = json.dumps(middle_json, ensure_ascii=False, indent=2)
                if self.normalize_numbers:
                    middle_json_str = normalize_json_table(middle_json_str)
                md_writer.write_string(f"{pdf_file_name}_middle.json", middle_json_str)
                saved_files['middle_json'] = os.path.join(local_md_dir, f"{pdf_file_name}_middle.json")

                # Save the raw model output
                if model_output:
                    model_output_str = json.dumps(model_output, ensure_ascii=False, indent=2)
                    md_writer.write_string(f"{pdf_file_name}_model.json", model_output_str)
                    saved_files['model_output'] = os.path.join(local_md_dir, f"{pdf_file_name}_model.json")

                # # Save the original PDF
                # md_writer.write(f"{pdf_file_name}_origin.pdf", pdf_bytes)
                # saved_files['origin_pdf'] = os.path.join(local_md_dir, f"{pdf_file_name}_origin.pdf")

            logger.info(f"Output saved to: {local_md_dir}")
        except Exception as e:
            logger.error(f"Error in _process_output: {e}")
            if self.debug:
                traceback.print_exc()
        return saved_files
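    # Shape of the returned mapping (illustrative paths; extra keys appear when
    # normalization or debug mode is enabled):
    #   {"md": ".../sample.md", "json": ".../sample.json",
    #    "layout_pdf": ".../sample_layout.pdf"}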
    def _get_extraction_stats(self, middle_json: Dict) -> Dict[str, Any]:
        """
        Collect extraction statistics.

        Args:
            middle_json: intermediate JSON data
        Returns:
            dict: statistics
        """
        stats = {
            "total_blocks": 0,
            "block_types": {},
            "total_pages": 0
        }
        try:
            pdf_info = middle_json.get("pdf_info", [])
            if isinstance(pdf_info, list):
                stats["total_pages"] = len(pdf_info)
                for page_info in pdf_info:
                    para_blocks = page_info.get("para_blocks", [])
                    stats["total_blocks"] += len(para_blocks)
                    for block in para_blocks:
                        block_type = block.get("type", "unknown")
                        stats["block_types"][block_type] = stats["block_types"].get(block_type, 0) + 1
        except Exception as e:
            logger.warning(f"Failed to get extraction stats: {e}")
        return stats
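    # Example of the returned stats (illustrative values):
    #   {"total_blocks": 12, "block_types": {"text": 9, "table": 2, "image": 1},
    #    "total_pages": 1}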
    def process_single_image(self, image_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Process a single image (kept compatible with the original interface).

        Args:
            image_path: image path
            output_dir: output directory
        Returns:
            dict: processing result
        """
        start_time = time.time()
        image_name = Path(image_path).stem
        result_info = {
            "image_path": image_path,
            "processing_time": 0,
            "success": False,
            "server": self.server_url,
            "error": None,
            "output_files": {},
            "is_pdf_page": "_page_" in Path(image_path).name,
            "extraction_stats": {}
        }
        try:
            # Skip files whose outputs already exist
            expected_md_path = Path(output_dir) / f"{image_name}.md"
            expected_json_path = Path(output_dir) / f"{image_name}.json"
            if expected_md_path.exists() and expected_json_path.exists():
                result_info.update({
                    "success": True,
                    "processing_time": 0,
                    "output_files": {
                        "md": str(expected_md_path),
                        "json": str(expected_json_path)
                    },
                    "skipped": True
                })
                return result_info

            # Delegate to do_parse_single_file
            parse_result = self.do_parse_single_file(image_path, output_dir)
            if parse_result["success"]:
                result_info.update({
                    "success": True,
                    "output_files": parse_result["output_files"],
                    "extraction_stats": parse_result["extraction_stats"]
                })
            else:
                result_info["error"] = parse_result.get("error", "Unknown error")
        except Exception as e:
            result_info["error"] = str(e)
            logger.error(f"Error processing {image_name}: {e}")
            if self.debug:
                traceback.print_exc()
        finally:
            result_info["processing_time"] = time.time() - start_time
        return result_info
def process_images_single_process(image_paths: List[str],
                                  processor: MinerUVLLMProcessor,
                                  batch_size: int = 1,
                                  output_dir: str = "./output") -> List[Dict[str, Any]]:
    """
    Single-process image-processing loop.
    """
    # Create the output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    all_results = []
    total_images = len(image_paths)
    print(f"Processing {total_images} images with batch size {batch_size}")

    with tqdm(total=total_images, desc="Processing images", unit="img",
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
        for i in range(0, total_images, batch_size):
            batch = image_paths[i:i + batch_size]
            batch_start_time = time.time()
            batch_results = []
            try:
                for image_path in batch:
                    try:
                        result = processor.process_single_image(image_path, output_dir)
                        batch_results.append(result)
                    except Exception as e:
                        logger.error(f"Error processing {image_path}: {e}")
                        batch_results.append({
                            "image_path": image_path,
                            "processing_time": 0,
                            "success": False,
                            "server": processor.server_url,
                            "error": str(e)
                        })
                batch_processing_time = time.time() - batch_start_time
                all_results.extend(batch_results)

                # Update the progress bar
                success_count = sum(1 for r in batch_results if r.get('success', False))
                skipped_count = sum(1 for r in batch_results if r.get('skipped', False))
                total_success = sum(1 for r in all_results if r.get('success', False))
                total_skipped = sum(1 for r in all_results if r.get('skipped', False))
                avg_time = batch_processing_time / len(batch) if len(batch) > 0 else 0
                total_blocks = sum(r.get('extraction_stats', {}).get('total_blocks', 0) for r in batch_results)

                pbar.update(len(batch))
                pbar.set_postfix({
                    'batch_time': f"{batch_processing_time:.2f}s",
                    'avg_time': f"{avg_time:.2f}s/img",
                    'success': f"{total_success}/{len(all_results)}",
                    'skipped': f"{total_skipped}",
                    'blocks': f"{total_blocks}",
                    'rate': f"{total_success/len(all_results)*100:.1f}%" if len(all_results) > 0 else "0%"
                })
            except Exception as e:
                logger.error(f"Error processing batch {[Path(p).name for p in batch]}: {e}")
                error_results = []
                for img_path in batch:
                    error_results.append({
                        "image_path": str(img_path),
                        "processing_time": 0,
                        "success": False,
                        "server": processor.server_url,
                        "error": str(e)
                    })
                all_results.extend(error_results)
                pbar.update(len(batch))
    return all_results
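# NOTE: despite the "multthreads" filename, only the single-process loop above
# is implemented. The function below is a minimal multithreaded sketch (an
# addition, not part of the original flow). It assumes that
# processor.process_single_image is safe to call concurrently (each call writes
# distinct per-file outputs) and that the vLLM server accepts parallel requests.
def process_images_multi_threaded(image_paths: List[str],
                                  processor: MinerUVLLMProcessor,
                                  max_workers: int = 4,
                                  output_dir: str = "./output") -> List[Dict[str, Any]]:
    """Minimal ThreadPoolExecutor sketch; not wired into main()."""
    from concurrent.futures import ThreadPoolExecutor, as_completed

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    results: List[Dict[str, Any]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit one task per file and collect results as they finish
        futures = {
            pool.submit(processor.process_single_image, p, output_dir): p
            for p in image_paths
        }
        with tqdm(total=len(image_paths), desc="Processing images", unit="img") as pbar:
            for future in as_completed(futures):
                try:
                    results.append(future.result())
                except Exception as e:
                    # Surface per-file failures without aborting the whole run
                    results.append({
                        "image_path": futures[future],
                        "processing_time": 0,
                        "success": False,
                        "server": processor.server_url,
                        "error": str(e),
                    })
                pbar.update(1)
    return results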
def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description="MinerU vLLM Batch Processing (demo.py framework)")

    # Input options (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--input_file", type=str, help="Input file (PDF or image)")
    input_group.add_argument("--input_dir", type=str, help="Input directory (PDF and image files)")
    input_group.add_argument("--input_file_list", type=str, help="Input file list (one file per line)")
    input_group.add_argument("--input_csv", type=str, help="Input CSV file with image_path and status columns")

    # Output options
    parser.add_argument("--output_dir", type=str, required=True, help="Output directory")

    # MinerU vLLM options
    parser.add_argument("--server_url", type=str, default="http://127.0.0.1:8121",
                        help="MinerU vLLM server URL")
    parser.add_argument("--timeout", type=int, default=300, help="Request timeout in seconds")
    parser.add_argument("--pdf_dpi", type=int, default=200, help="DPI for PDF to image conversion")
    parser.add_argument('--no-normalize', action='store_true', help='Disable number normalization')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode')

    # Processing options
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size")
    parser.add_argument("--test_mode", action="store_true", help="Test mode (process only 10 images)")
    parser.add_argument("--collect_results", type=str, help="Write the processing results to the given CSV file")

    args = parser.parse_args()
    try:
        # Gather and preprocess input files
        print("🔄 Preprocessing input files...")
        image_files = get_input_files(args)
        if not image_files:
            print("❌ No input files found or processed")
            return 1

        output_dir = Path(args.output_dir).resolve()
        print(f"📁 Output dir: {output_dir}")
        print(f"📊 Found {len(image_files)} image files to process")

        if args.test_mode:
            image_files = image_files[:10]
            print(f"🧪 Test mode: processing only {len(image_files)} images")

        print(f"🌐 Using server: {args.server_url}")
        print(f"📦 Batch size: {args.batch_size}")
        print(f"⏱️ Timeout: {args.timeout}s")

        # Create the processor
        processor = MinerUVLLMProcessor(
            server_url=args.server_url,
            timeout=args.timeout,
            normalize_numbers=not args.no_normalize,
            debug=args.debug
        )

        # Run the processing loop
        start_time = time.time()
        results = process_images_single_process(
            image_files,
            processor,
            args.batch_size,
            str(output_dir)
        )
        total_time = time.time() - start_time

        # Summarize the results
        success_count = sum(1 for r in results if r.get('success', False))
        skipped_count = sum(1 for r in results if r.get('skipped', False))
        error_count = len(results) - success_count
        pdf_page_count = sum(1 for r in results if r.get('is_pdf_page', False))

        # Aggregate extracted-block statistics
        total_blocks = sum(r.get('extraction_stats', {}).get('total_blocks', 0) for r in results)
        block_type_stats = {}
        for result in results:
            if 'extraction_stats' in result and 'block_types' in result['extraction_stats']:
                for block_type, count in result['extraction_stats']['block_types'].items():
                    block_type_stats[block_type] = block_type_stats.get(block_type, 0) + count

        print("\n" + "=" * 60)
        print("✅ Processing completed!")
        print("📊 Statistics:")
        print(f"   Total files processed: {len(image_files)}")
        print(f"   PDF pages processed: {pdf_page_count}")
        print(f"   Regular images processed: {len(image_files) - pdf_page_count}")
        print(f"   Successful: {success_count}")
        print(f"   Skipped: {skipped_count}")
        print(f"   Failed: {error_count}")
        if len(image_files) > 0:
            print(f"   Success rate: {success_count / len(image_files) * 100:.2f}%")
        print("📋 Content Extraction:")
        print(f"   Total blocks extracted: {total_blocks}")
        if block_type_stats:
            print("   Block types:")
            for block_type, count in sorted(block_type_stats.items()):
                print(f"     {block_type}: {count}")
        print("⏱️ Performance:")
        print(f"   Total time: {total_time:.2f} seconds")
        if total_time > 0:
            print(f"   Throughput: {len(image_files) / total_time:.2f} images/second")
            print(f"   Avg time per image: {total_time / len(image_files):.2f} seconds")
        print("\n📁 Output Structure (demo.py compatible):")
        print("   output_dir/")
        print("   ├── filename.md           # Markdown content")
        print("   ├── filename.json         # Content list")
        print("   ├── filename_layout.pdf   # Layout bbox overlay")
        if args.debug:
            print("   ├── filename_middle.json  # Debug: middle JSON")
            print("   ├── filename_model.json   # Debug: model output")
        print("   └── images/               # Extracted images")
        print("       └── filename.png")
        # Assemble the run statistics
        stats = {
            "total_files": len(image_files),
            "pdf_pages": pdf_page_count,
            "regular_images": len(image_files) - pdf_page_count,
            "success_count": success_count,
            "skipped_count": skipped_count,
            "error_count": error_count,
            "success_rate": success_count / len(image_files) if len(image_files) > 0 else 0,
            "total_time": total_time,
            "throughput": len(image_files) / total_time if total_time > 0 else 0,
            "avg_time_per_image": total_time / len(image_files) if len(image_files) > 0 else 0,
            "batch_size": args.batch_size,
            "server": args.server_url,
            "backend": "vlm-http-client",
            "timeout": args.timeout,
            "pdf_dpi": args.pdf_dpi,
            "total_blocks": total_blocks,
            "block_type_stats": block_type_stats,
            "normalization_enabled": not args.no_normalize,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        # Save the final results
        output_file_name = Path(output_dir).name
        output_file = os.path.join(output_dir, f"{output_file_name}_results.json")
        final_results = {
            "stats": stats,
            "results": results
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(final_results, f, ensure_ascii=False, indent=2)
        print(f"💾 Results saved to: {output_file}")

        # Collect the per-file processing status
        if not args.collect_results:
            output_file_processed = Path(args.output_dir) / f"processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv"
        else:
            output_file_processed = Path(args.collect_results).resolve()
        processed_files = collect_pid_files(output_file)
        with open(output_file_processed, 'w', encoding='utf-8') as f:
            f.write("image_path,status\n")
            for file_path, status in processed_files:
                f.write(f"{file_path},{status}\n")
        print(f"💾 Processed files saved to: {output_file_processed}")
        return 0
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        traceback.print_exc()
        return 1
if __name__ == "__main__":
    print("🚀 Starting the MinerU vLLM unified PDF/image processor...")
    print(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
    if len(sys.argv) == 1:
        # No command-line arguments: run with the default configuration
        print("ℹ️ No command line arguments provided. Running with default configuration...")
        # Default configuration
        default_config = {
            # "input_file": "/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.img/2023年度报告母公司_page_003.png",
            "input_dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水.img",
            "output_dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results",
            # "collect_results": f"./output/processed_files_{time.strftime('%Y%m%d_%H%M%S')}.csv",
            "server_url": "http://10.192.72.11:8121",
            "timeout": "300",
            "batch_size": "1",
            "pdf_dpi": "200",
        }
        # Build sys.argv from the defaults
        sys.argv = [sys.argv[0]]
        for key, value in default_config.items():
            sys.argv.extend([f"--{key}", str(value)])
        # Test / debug switches
        # sys.argv.append("--test_mode")
        sys.argv.append("--debug")
    sys.exit(main())