OmniDocBench_DotsOCR.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. """
  2. 批量处理 OmniDocBench 图片并生成符合评测要求的预测结果
  3. 根据 OmniDocBench 评测要求:
  4. - 输入:OpenDataLab___OmniDocBench/images 下的所有 .jpg 图片
  5. - 输出:每个图片对应的 .md、.json 和带标注的 layout 图片文件
  6. - 输出目录:用于后续的 end2end 评测
  7. """
  8. import os
  9. import sys
  10. import json
  11. import tempfile
  12. import uuid
  13. import shutil
  14. from pathlib import Path
  15. from PIL import Image
  16. from tqdm import tqdm
  17. import argparse
  18. # 导入 dots.ocr 相关模块
  19. from dots_ocr.parser import DotsOCRParser
  20. from dots_ocr.utils import dict_promptmode_to_prompt
  21. from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
  22. class OmniDocBenchProcessor:
  23. """OmniDocBench 批量处理器"""
  24. def __init__(self,
  25. ip="127.0.0.1",
  26. port=8101,
  27. model_name="DotsOCR",
  28. prompt_mode="prompt_layout_all_en",
  29. dpi=200,
  30. min_pixels=MIN_PIXELS,
  31. max_pixels=MAX_PIXELS):
  32. """
  33. 初始化处理器
  34. Args:
  35. ip: vLLM 服务器 IP
  36. port: vLLM 服务器端口
  37. model_name: 模型名称
  38. prompt_mode: 提示模式
  39. dpi: PDF 处理 DPI
  40. min_pixels: 最小像素数
  41. max_pixels: 最大像素数
  42. """
  43. self.parser = DotsOCRParser(
  44. ip=ip,
  45. port=port,
  46. dpi=dpi,
  47. min_pixels=min_pixels,
  48. max_pixels=max_pixels,
  49. model_name=model_name
  50. )
  51. self.prompt_mode = prompt_mode
  52. print(f"DotsOCR Parser 初始化完成:")
  53. print(f" - 服务器: {ip}:{port}")
  54. print(f" - 模型: {model_name}")
  55. print(f" - 提示模式: {prompt_mode}")
  56. print(f" - 像素范围: {min_pixels} - {max_pixels}")
  57. def create_temp_session_dir(self):
  58. """创建临时会话目录"""
  59. session_id = uuid.uuid4().hex[:8]
  60. temp_dir = os.path.join(tempfile.gettempdir(), f"omnidocbench_batch_{session_id}")
  61. os.makedirs(temp_dir, exist_ok=True)
  62. return temp_dir, session_id
  63. def save_results_to_output_dir(self, result, image_name, output_dir):
  64. """
  65. 将处理结果保存到输出目录
  66. Args:
  67. result: 解析结果
  68. image_name: 图片文件名(不含扩展名)
  69. output_dir: 输出目录
  70. Returns:
  71. dict: 保存的文件路径
  72. """
  73. saved_files = {}
  74. # 1. 保存 Markdown 文件(OmniDocBench 评测必需)
  75. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  76. md_content = ""
  77. # 优先使用无页眉页脚的版本(符合 OmniDocBench 评测要求)
  78. if 'md_content_nohf_path' in result and os.path.exists(result['md_content_nohf_path']):
  79. with open(result['md_content_nohf_path'], 'r', encoding='utf-8') as f:
  80. md_content = f.read()
  81. elif 'md_content_path' in result and os.path.exists(result['md_content_path']):
  82. with open(result['md_content_path'], 'r', encoding='utf-8') as f:
  83. md_content = f.read()
  84. else:
  85. md_content = "# 解析失败\n\n未能提取到有效的文档内容。"
  86. with open(output_md_path, 'w', encoding='utf-8') as f:
  87. f.write(md_content)
  88. saved_files['md'] = output_md_path
  89. # 2. 保存 JSON 文件
  90. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  91. json_data = {}
  92. if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
  93. with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
  94. json_data = json.load(f)
  95. else:
  96. json_data = {
  97. "error": "未能提取到有效的布局信息",
  98. "cells": []
  99. }
  100. with open(output_json_path, 'w', encoding='utf-8') as f:
  101. json.dump(json_data, f, ensure_ascii=False, indent=2)
  102. saved_files['json'] = output_json_path
  103. # 3. 保存带标注的布局图片
  104. output_layout_image_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  105. if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
  106. # 直接复制布局图片
  107. shutil.copy2(result['layout_image_path'], output_layout_image_path)
  108. saved_files['layout_image'] = output_layout_image_path
  109. else:
  110. # 如果没有布局图片,使用原始图片作为占位符
  111. try:
  112. original_image = Image.open(result.get('original_image_path', ''))
  113. original_image.save(output_layout_image_path, 'JPEG', quality=95)
  114. saved_files['layout_image'] = output_layout_image_path
  115. print(f"⚠️ 使用原始图片作为布局图片: {image_name}")
  116. except Exception as e:
  117. print(f"⚠️ 无法保存布局图片: {image_name}, 错误: {e}")
  118. saved_files['layout_image'] = None
  119. # 4. 可选:保存原始图片副本
  120. output_original_image_path = os.path.join(output_dir, f"{image_name}_original.jpg")
  121. if 'original_image_path' in result and os.path.exists(result['original_image_path']):
  122. shutil.copy2(result['original_image_path'], output_original_image_path)
  123. saved_files['original_image'] = output_original_image_path
  124. return saved_files
  125. def process_single_image(self, image_path, output_dir):
  126. """
  127. 处理单张图片
  128. Args:
  129. image_path: 图片路径
  130. output_dir: 输出目录
  131. Returns:
  132. bool: 处理是否成功
  133. """
  134. try:
  135. # 获取图片文件名(不含扩展名)
  136. image_name = Path(image_path).stem
  137. # 检查输出文件是否已存在
  138. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  139. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  140. output_layout_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  141. if all(os.path.exists(p) for p in [output_md_path, output_json_path, output_layout_path]):
  142. print(f"跳过已存在的文件: {image_name}")
  143. return True
  144. # 创建临时会话目录
  145. temp_dir, session_id = self.create_temp_session_dir()
  146. try:
  147. # 读取图片
  148. image = Image.open(image_path)
  149. # 使用 DotsOCRParser 处理图片
  150. filename = f"omnidocbench_{session_id}"
  151. results = self.parser.parse_image(
  152. input_path=image,
  153. filename=filename,
  154. prompt_mode=self.prompt_mode,
  155. save_dir=temp_dir,
  156. fitz_preprocess=True # 对图片使用 fitz 预处理
  157. )
  158. # 解析结果
  159. if not results:
  160. print(f"警告: {image_name} 未返回解析结果")
  161. return False
  162. result = results[0] # parse_image 返回单个结果的列表
  163. # 添加原始图片路径到结果中
  164. result['original_image_path'] = image_path
  165. # 保存所有结果文件到输出目录
  166. saved_files = self.save_results_to_output_dir(result, image_name, output_dir)
  167. # 验证保存结果
  168. success_count = sum(1 for path in saved_files.values() if path and os.path.exists(path))
  169. total_expected = 3 # md, json, layout_image
  170. if success_count >= 2: # 至少保存了 md 和 json
  171. print(f"✅ 成功处理: {image_name} (保存了 {success_count}/{total_expected} 个文件)")
  172. return True
  173. else:
  174. print(f"⚠️ 部分成功: {image_name} (保存了 {success_count}/{total_expected} 个文件)")
  175. return False
  176. except Exception as e:
  177. print(f"❌ 处理 {image_name} 时出错: {str(e)}")
  178. return False
  179. finally:
  180. # 清理临时目录
  181. if os.path.exists(temp_dir):
  182. shutil.rmtree(temp_dir, ignore_errors=True)
  183. except Exception as e:
  184. print(f"❌ 处理 {image_path} 时出现致命错误: {str(e)}")
  185. return False
  186. def process_batch(self, images_dir, output_dir):
  187. """
  188. 批量处理图片
  189. Args:
  190. images_dir: 输入图片目录
  191. output_dir: 输出目录
  192. """
  193. # 创建输出目录
  194. os.makedirs(output_dir, exist_ok=True)
  195. # 获取所有图片文件
  196. image_extensions = ['.jpg', '.jpeg', '.png']
  197. image_files = []
  198. for ext in image_extensions:
  199. image_files.extend(Path(images_dir).glob(f"*{ext}"))
  200. image_files.extend(Path(images_dir).glob(f"*{ext.upper()}"))
  201. image_files = sorted(image_files)
  202. if not image_files:
  203. print(f"在 {images_dir} 中未找到图片文件")
  204. return
  205. print(f"找到 {len(image_files)} 个图片文件")
  206. print(f"输出目录结构: {output_dir}")
  207. # 统计变量
  208. success_count = 0
  209. failed_count = 0
  210. skipped_count = 0
  211. # 使用进度条处理
  212. with tqdm(image_files, desc="处理图片", unit="张") as pbar:
  213. for image_path in pbar:
  214. # 更新进度条描述
  215. pbar.set_description(f"处理: {image_path.name}")
  216. # 检查输出文件是否已存在(在主输出目录中)
  217. image_name = image_path.stem
  218. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  219. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  220. output_layout_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  221. if all(os.path.exists(p) for p in [output_md_path, output_json_path, output_layout_path]):
  222. skipped_count += 1
  223. continue
  224. # 处理图片
  225. if self.process_single_image(str(image_path), output_dir):
  226. success_count += 1
  227. else:
  228. failed_count += 1
  229. # 更新进度条后缀
  230. pbar.set_postfix({
  231. 'success': success_count,
  232. 'failed': failed_count,
  233. 'skipped': skipped_count
  234. })
  235. # 输出最终统计
  236. print(f"\n🎉 批量处理完成!")
  237. print(f" ✅ 成功: {success_count}")
  238. print(f" ❌ 失败: {failed_count}")
  239. print(f" ⏭️ 跳过: {skipped_count}")
  240. print(f" 📁 输出目录: {output_dir}")
  241. # 生成处理报告
  242. self.generate_processing_report(output_dir, success_count, failed_count, skipped_count)
  243. def generate_processing_report(self, output_dir, success_count, failed_count, skipped_count):
  244. """生成处理报告"""
  245. report_path = os.path.join(output_dir, "processing_report.json")
  246. report = {
  247. "processing_summary": {
  248. "success_count": success_count,
  249. "failed_count": failed_count,
  250. "skipped_count": skipped_count,
  251. "total_processed": success_count + failed_count + skipped_count
  252. },
  253. "output_structure": {
  254. "markdown_files": f"{output_dir}/*.md",
  255. "json_files": f"{output_dir}/*.json",
  256. "layout_images": f"{output_dir}/*_layout.jpg",
  257. "original_images": f"{output_dir}/*_original.jpg"
  258. },
  259. "configuration": {
  260. "prompt_mode": self.prompt_mode,
  261. "server": f"{self.parser.ip}:{self.parser.port}",
  262. "pixel_range": f"{self.parser.min_pixels} - {self.parser.max_pixels}"
  263. }
  264. }
  265. with open(report_path, 'w', encoding='utf-8') as f:
  266. json.dump(report, f, ensure_ascii=False, indent=2)
  267. print(f"📊 处理报告已保存: {report_path}")
  268. def main():
  269. parser = argparse.ArgumentParser(description="批量处理 OmniDocBench 图片")
  270. parser.add_argument(
  271. "--images_dir",
  272. type=str,
  273. default="../OmniDocBench/OpenDataLab___OmniDocBench/images",
  274. help="输入图片目录路径"
  275. )
  276. parser.add_argument(
  277. "--output_dir",
  278. type=str,
  279. default="./omnidocbench_predictions",
  280. help="输出目录路径"
  281. )
  282. parser.add_argument(
  283. "--ip",
  284. type=str,
  285. default="127.0.0.1",
  286. help="vLLM 服务器 IP"
  287. )
  288. parser.add_argument(
  289. "--port",
  290. type=int,
  291. default=8101,
  292. help="vLLM 服务器端口"
  293. )
  294. parser.add_argument(
  295. "--model_name",
  296. type=str,
  297. default="DotsOCR",
  298. help="模型名称"
  299. )
  300. parser.add_argument(
  301. "--prompt_mode",
  302. type=str,
  303. default="prompt_layout_all_en",
  304. choices=list(dict_promptmode_to_prompt.keys()),
  305. help="提示模式"
  306. )
  307. parser.add_argument(
  308. "--min_pixels",
  309. type=int,
  310. default=MIN_PIXELS,
  311. help="最小像素数"
  312. )
  313. parser.add_argument(
  314. "--max_pixels",
  315. type=int,
  316. default=MAX_PIXELS,
  317. help="最大像素数"
  318. )
  319. parser.add_argument(
  320. "--dpi",
  321. type=int,
  322. default=200,
  323. help="PDF 处理 DPI"
  324. )
  325. args = parser.parse_args()
  326. # 检查输入目录
  327. if not os.path.exists(args.images_dir):
  328. print(f"❌ 输入目录不存在: {args.images_dir}")
  329. return
  330. print(f"🚀 开始批量处理 OmniDocBench 图片")
  331. print(f"📁 输入目录: {args.images_dir}")
  332. print(f"📁 输出目录: {args.output_dir}")
  333. print("="*60)
  334. # 创建处理器
  335. processor = OmniDocBenchProcessor(
  336. ip=args.ip,
  337. port=args.port,
  338. model_name=args.model_name,
  339. prompt_mode=args.prompt_mode,
  340. dpi=args.dpi,
  341. min_pixels=args.min_pixels,
  342. max_pixels=args.max_pixels
  343. )
  344. # 开始批量处理
  345. processor.process_batch(args.images_dir, args.output_dir)
  346. if __name__ == "__main__":
  347. print(f"🚀 启动单进程DotsOCR程序...")
  348. print(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
  349. if len(sys.argv) == 1:
  350. # 如果没有命令行参数,使用默认配置运行
  351. print("ℹ️ No command line arguments provided. Running with default configuration...")
  352. # 默认配置
  353. default_config = {
  354. "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
  355. "output_dir": "./OmniDocBench_Results_Single",
  356. }
  357. # 构造参数
  358. sys.argv = [sys.argv[0]]
  359. for key, value in default_config.items():
  360. sys.argv.extend([f"--{key}", str(value)])
  361. # 测试模式
  362. # sys.argv.append("--test_mode")
  363. sys.exit(main())