OmniDocBench_DotsOCR.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. """
  2. 批量处理 OmniDocBench 图片并生成符合评测要求的预测结果
  3. 根据 OmniDocBench 评测要求:
  4. - 输入:OpenDataLab___OmniDocBench/images 下的所有 .jpg 图片
  5. - 输出:每个图片对应的 .md、.json 和带标注的 layout 图片文件
  6. - 输出目录:用于后续的 end2end 评测
  7. """
  8. import os
  9. import json
  10. import tempfile
  11. import uuid
  12. import shutil
  13. from pathlib import Path
  14. from PIL import Image
  15. from tqdm import tqdm
  16. import argparse
  17. # 导入 dots.ocr 相关模块
  18. from dots_ocr.parser import DotsOCRParser
  19. from dots_ocr.utils import dict_promptmode_to_prompt
  20. from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
  21. class OmniDocBenchProcessor:
  22. """OmniDocBench 批量处理器"""
  23. def __init__(self,
  24. ip="127.0.0.1",
  25. port=8101,
  26. model_name="DotsOCR",
  27. prompt_mode="prompt_layout_all_en",
  28. dpi=200,
  29. min_pixels=MIN_PIXELS,
  30. max_pixels=MAX_PIXELS):
  31. """
  32. 初始化处理器
  33. Args:
  34. ip: vLLM 服务器 IP
  35. port: vLLM 服务器端口
  36. model_name: 模型名称
  37. prompt_mode: 提示模式
  38. dpi: PDF 处理 DPI
  39. min_pixels: 最小像素数
  40. max_pixels: 最大像素数
  41. """
  42. self.parser = DotsOCRParser(
  43. ip=ip,
  44. port=port,
  45. dpi=dpi,
  46. min_pixels=min_pixels,
  47. max_pixels=max_pixels,
  48. model_name=model_name
  49. )
  50. self.prompt_mode = prompt_mode
  51. print(f"DotsOCR Parser 初始化完成:")
  52. print(f" - 服务器: {ip}:{port}")
  53. print(f" - 模型: {model_name}")
  54. print(f" - 提示模式: {prompt_mode}")
  55. print(f" - 像素范围: {min_pixels} - {max_pixels}")
  56. def create_temp_session_dir(self):
  57. """创建临时会话目录"""
  58. session_id = uuid.uuid4().hex[:8]
  59. temp_dir = os.path.join(tempfile.gettempdir(), f"omnidocbench_batch_{session_id}")
  60. os.makedirs(temp_dir, exist_ok=True)
  61. return temp_dir, session_id
  62. def save_results_to_output_dir(self, result, image_name, output_dir):
  63. """
  64. 将处理结果保存到输出目录
  65. Args:
  66. result: 解析结果
  67. image_name: 图片文件名(不含扩展名)
  68. output_dir: 输出目录
  69. Returns:
  70. dict: 保存的文件路径
  71. """
  72. saved_files = {}
  73. # 1. 保存 Markdown 文件(OmniDocBench 评测必需)
  74. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  75. md_content = ""
  76. # 优先使用无页眉页脚的版本(符合 OmniDocBench 评测要求)
  77. if 'md_content_nohf_path' in result and os.path.exists(result['md_content_nohf_path']):
  78. with open(result['md_content_nohf_path'], 'r', encoding='utf-8') as f:
  79. md_content = f.read()
  80. elif 'md_content_path' in result and os.path.exists(result['md_content_path']):
  81. with open(result['md_content_path'], 'r', encoding='utf-8') as f:
  82. md_content = f.read()
  83. else:
  84. md_content = "# 解析失败\n\n未能提取到有效的文档内容。"
  85. with open(output_md_path, 'w', encoding='utf-8') as f:
  86. f.write(md_content)
  87. saved_files['md'] = output_md_path
  88. # 2. 保存 JSON 文件
  89. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  90. json_data = {}
  91. if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
  92. with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
  93. json_data = json.load(f)
  94. else:
  95. json_data = {
  96. "error": "未能提取到有效的布局信息",
  97. "cells": []
  98. }
  99. with open(output_json_path, 'w', encoding='utf-8') as f:
  100. json.dump(json_data, f, ensure_ascii=False, indent=2)
  101. saved_files['json'] = output_json_path
  102. # 3. 保存带标注的布局图片
  103. output_layout_image_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  104. if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
  105. # 直接复制布局图片
  106. shutil.copy2(result['layout_image_path'], output_layout_image_path)
  107. saved_files['layout_image'] = output_layout_image_path
  108. else:
  109. # 如果没有布局图片,使用原始图片作为占位符
  110. try:
  111. original_image = Image.open(result.get('original_image_path', ''))
  112. original_image.save(output_layout_image_path, 'JPEG', quality=95)
  113. saved_files['layout_image'] = output_layout_image_path
  114. print(f"⚠️ 使用原始图片作为布局图片: {image_name}")
  115. except Exception as e:
  116. print(f"⚠️ 无法保存布局图片: {image_name}, 错误: {e}")
  117. saved_files['layout_image'] = None
  118. # 4. 可选:保存原始图片副本
  119. output_original_image_path = os.path.join(output_dir, f"{image_name}_original.jpg")
  120. if 'original_image_path' in result and os.path.exists(result['original_image_path']):
  121. shutil.copy2(result['original_image_path'], output_original_image_path)
  122. saved_files['original_image'] = output_original_image_path
  123. return saved_files
  124. def process_single_image(self, image_path, output_dir):
  125. """
  126. 处理单张图片
  127. Args:
  128. image_path: 图片路径
  129. output_dir: 输出目录
  130. Returns:
  131. bool: 处理是否成功
  132. """
  133. try:
  134. # 获取图片文件名(不含扩展名)
  135. image_name = Path(image_path).stem
  136. # 检查输出文件是否已存在
  137. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  138. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  139. output_layout_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  140. if all(os.path.exists(p) for p in [output_md_path, output_json_path, output_layout_path]):
  141. print(f"跳过已存在的文件: {image_name}")
  142. return True
  143. # 创建临时会话目录
  144. temp_dir, session_id = self.create_temp_session_dir()
  145. try:
  146. # 读取图片
  147. image = Image.open(image_path)
  148. # 使用 DotsOCRParser 处理图片
  149. filename = f"omnidocbench_{session_id}"
  150. results = self.parser.parse_image(
  151. input_path=image,
  152. filename=filename,
  153. prompt_mode=self.prompt_mode,
  154. save_dir=temp_dir,
  155. fitz_preprocess=True # 对图片使用 fitz 预处理
  156. )
  157. # 解析结果
  158. if not results:
  159. print(f"警告: {image_name} 未返回解析结果")
  160. return False
  161. result = results[0] # parse_image 返回单个结果的列表
  162. # 添加原始图片路径到结果中
  163. result['original_image_path'] = image_path
  164. # 保存所有结果文件到输出目录
  165. saved_files = self.save_results_to_output_dir(result, image_name, output_dir)
  166. # 验证保存结果
  167. success_count = sum(1 for path in saved_files.values() if path and os.path.exists(path))
  168. total_expected = 3 # md, json, layout_image
  169. if success_count >= 2: # 至少保存了 md 和 json
  170. print(f"✅ 成功处理: {image_name} (保存了 {success_count}/{total_expected} 个文件)")
  171. return True
  172. else:
  173. print(f"⚠️ 部分成功: {image_name} (保存了 {success_count}/{total_expected} 个文件)")
  174. return False
  175. except Exception as e:
  176. print(f"❌ 处理 {image_name} 时出错: {str(e)}")
  177. return False
  178. finally:
  179. # 清理临时目录
  180. if os.path.exists(temp_dir):
  181. shutil.rmtree(temp_dir, ignore_errors=True)
  182. except Exception as e:
  183. print(f"❌ 处理 {image_path} 时出现致命错误: {str(e)}")
  184. return False
  185. def process_batch(self, images_dir, output_dir):
  186. """
  187. 批量处理图片
  188. Args:
  189. images_dir: 输入图片目录
  190. output_dir: 输出目录
  191. """
  192. # 创建输出目录
  193. os.makedirs(output_dir, exist_ok=True)
  194. # 获取所有图片文件
  195. image_extensions = ['.jpg', '.jpeg', '.png']
  196. image_files = []
  197. for ext in image_extensions:
  198. image_files.extend(Path(images_dir).glob(f"*{ext}"))
  199. image_files.extend(Path(images_dir).glob(f"*{ext.upper()}"))
  200. image_files = sorted(image_files)
  201. if not image_files:
  202. print(f"在 {images_dir} 中未找到图片文件")
  203. return
  204. print(f"找到 {len(image_files)} 个图片文件")
  205. print(f"输出目录结构: {output_dir}")
  206. # 统计变量
  207. success_count = 0
  208. failed_count = 0
  209. skipped_count = 0
  210. # 使用进度条处理
  211. with tqdm(image_files, desc="处理图片", unit="张") as pbar:
  212. for image_path in pbar:
  213. # 更新进度条描述
  214. pbar.set_description(f"处理: {image_path.name}")
  215. # 检查输出文件是否已存在(在主输出目录中)
  216. image_name = image_path.stem
  217. output_md_path = os.path.join(output_dir, f"{image_name}.md")
  218. output_json_path = os.path.join(output_dir, f"{image_name}.json")
  219. output_layout_path = os.path.join(output_dir, f"{image_name}_layout.jpg")
  220. if all(os.path.exists(p) for p in [output_md_path, output_json_path, output_layout_path]):
  221. skipped_count += 1
  222. continue
  223. # 处理图片
  224. if self.process_single_image(str(image_path), output_dir):
  225. success_count += 1
  226. else:
  227. failed_count += 1
  228. # 更新进度条后缀
  229. pbar.set_postfix({
  230. 'success': success_count,
  231. 'failed': failed_count,
  232. 'skipped': skipped_count
  233. })
  234. # 输出最终统计
  235. print(f"\n🎉 批量处理完成!")
  236. print(f" ✅ 成功: {success_count}")
  237. print(f" ❌ 失败: {failed_count}")
  238. print(f" ⏭️ 跳过: {skipped_count}")
  239. print(f" 📁 输出目录: {output_dir}")
  240. # 生成处理报告
  241. self.generate_processing_report(output_dir, success_count, failed_count, skipped_count)
  242. def generate_processing_report(self, output_dir, success_count, failed_count, skipped_count):
  243. """生成处理报告"""
  244. report_path = os.path.join(output_dir, "processing_report.json")
  245. report = {
  246. "processing_summary": {
  247. "success_count": success_count,
  248. "failed_count": failed_count,
  249. "skipped_count": skipped_count,
  250. "total_processed": success_count + failed_count + skipped_count
  251. },
  252. "output_structure": {
  253. "markdown_files": f"{output_dir}/*.md",
  254. "json_files": f"{output_dir}/*.json",
  255. "layout_images": f"{output_dir}/*_layout.jpg",
  256. "original_images": f"{output_dir}/*_original.jpg"
  257. },
  258. "configuration": {
  259. "prompt_mode": self.prompt_mode,
  260. "server": f"{self.parser.ip}:{self.parser.port}",
  261. "pixel_range": f"{self.parser.min_pixels} - {self.parser.max_pixels}"
  262. }
  263. }
  264. with open(report_path, 'w', encoding='utf-8') as f:
  265. json.dump(report, f, ensure_ascii=False, indent=2)
  266. print(f"📊 处理报告已保存: {report_path}")
  267. def main():
  268. parser = argparse.ArgumentParser(description="批量处理 OmniDocBench 图片")
  269. parser.add_argument(
  270. "--images_dir",
  271. type=str,
  272. default="../OmniDocBench/OpenDataLab___OmniDocBench/images",
  273. help="输入图片目录路径"
  274. )
  275. parser.add_argument(
  276. "--output_dir",
  277. type=str,
  278. default="./omnidocbench_predictions",
  279. help="输出目录路径"
  280. )
  281. parser.add_argument(
  282. "--ip",
  283. type=str,
  284. default="127.0.0.1",
  285. help="vLLM 服务器 IP"
  286. )
  287. parser.add_argument(
  288. "--port",
  289. type=int,
  290. default=8101,
  291. help="vLLM 服务器端口"
  292. )
  293. parser.add_argument(
  294. "--model_name",
  295. type=str,
  296. default="DotsOCR",
  297. help="模型名称"
  298. )
  299. parser.add_argument(
  300. "--prompt_mode",
  301. type=str,
  302. default="prompt_layout_all_en",
  303. choices=list(dict_promptmode_to_prompt.keys()),
  304. help="提示模式"
  305. )
  306. parser.add_argument(
  307. "--min_pixels",
  308. type=int,
  309. default=MIN_PIXELS,
  310. help="最小像素数"
  311. )
  312. parser.add_argument(
  313. "--max_pixels",
  314. type=int,
  315. default=MAX_PIXELS,
  316. help="最大像素数"
  317. )
  318. parser.add_argument(
  319. "--dpi",
  320. type=int,
  321. default=200,
  322. help="PDF 处理 DPI"
  323. )
  324. args = parser.parse_args()
  325. # 检查输入目录
  326. if not os.path.exists(args.images_dir):
  327. print(f"❌ 输入目录不存在: {args.images_dir}")
  328. return
  329. print(f"🚀 开始批量处理 OmniDocBench 图片")
  330. print(f"📁 输入目录: {args.images_dir}")
  331. print(f"📁 输出目录: {args.output_dir}")
  332. print("="*60)
  333. # 创建处理器
  334. processor = OmniDocBenchProcessor(
  335. ip=args.ip,
  336. port=args.port,
  337. model_name=args.model_name,
  338. prompt_mode=args.prompt_mode,
  339. dpi=args.dpi,
  340. min_pixels=args.min_pixels,
  341. max_pixels=args.max_pixels
  342. )
  343. # 开始批量处理
  344. processor.process_batch(args.images_dir, args.output_dir)
  345. if __name__ == "__main__":
  346. main()