# processor.py
  1. """
  2. MinerU vLLM 处理器
  3. 基于 MinerU demo.py 框架的文档处理类
  4. """
  5. import os
  6. import json
  7. import time
  8. import traceback
  9. from pathlib import Path
  10. from typing import List, Dict, Any
  11. from loguru import logger
  12. # 导入 MinerU 核心组件
  13. from mineru.cli.common import read_fn, convert_pdf_bytes_to_bytes_by_pypdfium2
  14. from mineru.data.data_reader_writer import FileBasedDataWriter
  15. from mineru.utils.draw_bbox import draw_layout_bbox
  16. from mineru.utils.enum_class import MakeMode
  17. from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
  18. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
  19. # 导入 ocr_utils
  20. import sys
  21. ocr_platform_root = Path(__file__).parents[2]
  22. if str(ocr_platform_root) not in sys.path:
  23. sys.path.insert(0, str(ocr_platform_root))
  24. from ocr_utils import normalize_markdown_table, normalize_json_table
  25. class MinerUVLLMProcessor:
  26. """MinerU vLLM 处理器 (基于 demo.py 框架)"""
  27. def __init__(self,
  28. server_url: str = "http://127.0.0.1:8121",
  29. timeout: int = 300,
  30. normalize_numbers: bool = False,
  31. debug: bool = False):
  32. """
  33. 初始化处理器
  34. Args:
  35. server_url: vLLM 服务器地址
  36. timeout: 请求超时时间
  37. normalize_numbers: 是否标准化数字
  38. debug: 是否启用调试模式
  39. """
  40. self.server_url = server_url.rstrip('/')
  41. self.timeout = timeout
  42. self.normalize_numbers = normalize_numbers
  43. self.debug = debug
  44. self.backend = "http-client" # 固定使用 http-client 后端
  45. logger.info(f"MinerU vLLM Processor 初始化完成:")
  46. logger.info(f" - 服务器: {server_url}")
  47. logger.info(f" - 后端: vlm-{self.backend}")
  48. logger.info(f" - 超时: {timeout}s")
  49. logger.info(f" - 数字标准化: {normalize_numbers}")
  50. logger.info(f" - 调试模式: {debug}")
  51. def do_parse_single_file(self,
  52. input_file: str,
  53. output_dir: str,
  54. start_page_id: int = 0,
  55. end_page_id: int | None = None) -> Dict[str, Any]:
  56. """
  57. 解析单个文件 (参考 demo.py 的 do_parse 函数)
  58. Args:
  59. input_file: 文件路径
  60. output_dir: 输出目录
  61. start_page_id: 起始页ID
  62. end_page_id: 结束页ID
  63. Returns:
  64. dict: 处理结果
  65. """
  66. try:
  67. # 准备文件名和字节数据
  68. file_path = Path(input_file)
  69. pdf_file_name = file_path.stem
  70. pdf_bytes = read_fn(str(file_path))
  71. # 转换PDF字节流 (如果需要)
  72. if file_path.suffix.lower() == '.pdf':
  73. pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
  74. pdf_bytes, start_page_id, end_page_id
  75. )
  76. # 准备环境 (创建输出目录)
  77. local_md_dir = Path(output_dir).resolve()
  78. local_image_dir = local_md_dir / "images"
  79. image_writer = FileBasedDataWriter(local_image_dir.as_posix())
  80. md_writer = FileBasedDataWriter(local_md_dir.as_posix())
  81. # 使用 VLM 分析文档 (核心调用)
  82. middle_json, model_output = vlm_doc_analyze(
  83. pdf_bytes,
  84. image_writer=image_writer,
  85. backend=self.backend,
  86. server_url=self.server_url
  87. )
  88. pdf_info = middle_json["pdf_info"]
  89. # 处理输出
  90. output_files = self._process_output(
  91. pdf_info=pdf_info,
  92. pdf_bytes=pdf_bytes,
  93. pdf_file_name=pdf_file_name,
  94. local_md_dir=local_md_dir,
  95. local_image_dir=local_image_dir,
  96. md_writer=md_writer,
  97. middle_json=middle_json,
  98. model_output=model_output,
  99. original_file_path=str(file_path)
  100. )
  101. # 统计提取信息
  102. extraction_stats = self._get_extraction_stats(middle_json)
  103. return {
  104. "success": True,
  105. "pdf_info": pdf_info,
  106. "middle_json": middle_json,
  107. "model_output": model_output,
  108. "output_files": output_files,
  109. "extraction_stats": extraction_stats
  110. }
  111. except Exception as e:
  112. logger.error(f"Failed to process {file_path}: {e}")
  113. if self.debug:
  114. traceback.print_exc()
  115. return {
  116. "success": False,
  117. "error": str(e)
  118. }
  119. def _process_output(self,
  120. pdf_info,
  121. pdf_bytes,
  122. pdf_file_name,
  123. local_md_dir,
  124. local_image_dir,
  125. md_writer,
  126. middle_json,
  127. model_output,
  128. original_file_path: str) -> Dict[str, str]:
  129. """
  130. 处理输出文件
  131. Args:
  132. pdf_info: PDF信息
  133. pdf_bytes: PDF字节数据
  134. pdf_file_name: PDF文件名
  135. local_md_dir: Markdown目录
  136. local_image_dir: 图片目录
  137. md_writer: Markdown写入器
  138. middle_json: 中间JSON数据
  139. model_output: 模型输出
  140. original_file_path: 原始文件路径
  141. Returns:
  142. dict: 保存的文件路径信息
  143. """
  144. saved_files = {}
  145. try:
  146. # 设置相对图片目录名
  147. image_dir = str(os.path.basename(local_image_dir))
  148. # 1. 生成并保存 Markdown 文件
  149. md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
  150. # 数字标准化处理
  151. if self.normalize_numbers:
  152. original_md = md_content_str
  153. md_content_str = normalize_markdown_table(md_content_str)
  154. changes_count = len([1 for o, n in zip(original_md, md_content_str) if o != n])
  155. if changes_count > 0:
  156. saved_files['md_normalized'] = f"✅ 已标准化 {changes_count} 个字符(全角→半角)"
  157. else:
  158. saved_files['md_normalized'] = "ℹ️ 无需标准化(已是标准格式)"
  159. md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
  160. saved_files['md'] = os.path.join(local_md_dir, f"{pdf_file_name}.md")
  161. # 2. 生成并保存 content_list JSON 文件
  162. content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
  163. content_list_str = json.dumps(content_list, ensure_ascii=False, indent=2)
  164. md_writer.write_string(f"{pdf_file_name}_original.json", content_list_str)
  165. # 转换bbox坐标(从1000-based到像素坐标)
  166. if pdf_info and len(pdf_info) > 0:
  167. page_width, page_height = pdf_info[0].get('page_size', [1000, 1000])
  168. for element in content_list:
  169. if "bbox" in element:
  170. x0, y0, x1, y1 = element["bbox"]
  171. element["bbox"] = [
  172. int(x0 / 1000 * page_width),
  173. int(y0 / 1000 * page_height),
  174. int(x1 / 1000 * page_width),
  175. int(y1 / 1000 * page_height),
  176. ]
  177. content_list_str = json.dumps(content_list, ensure_ascii=False, indent=2)
  178. # 数字标准化处理
  179. if self.normalize_numbers:
  180. original_json = content_list_str
  181. content_list_str = normalize_json_table(content_list_str)
  182. changes_count = len([1 for o, n in zip(original_json, content_list_str) if o != n])
  183. if changes_count > 0:
  184. saved_files['json_normalized'] = f"✅ 已标准化 {changes_count} 个字符(全角→半角)"
  185. else:
  186. saved_files['json_normalized'] = "ℹ️ 无需标准化(已是标准格式)"
  187. md_writer.write_string(f"{pdf_file_name}.json", content_list_str)
  188. saved_files['json'] = os.path.join(local_md_dir, f"{pdf_file_name}.json")
  189. # 绘制布局边界框
  190. try:
  191. draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
  192. saved_files['layout_pdf'] = os.path.join(local_md_dir, f"{pdf_file_name}_layout.pdf")
  193. except Exception as e:
  194. logger.warning(f"Failed to draw layout bbox: {e}")
  195. # 调试模式下保存额外信息
  196. if self.debug:
  197. # 保存 middle.json
  198. middle_json_str = json.dumps(middle_json, ensure_ascii=False, indent=2)
  199. if self.normalize_numbers:
  200. middle_json_str = normalize_json_table(middle_json_str)
  201. md_writer.write_string(f"{pdf_file_name}_middle.json", middle_json_str)
  202. saved_files['middle_json'] = os.path.join(local_md_dir, f"{pdf_file_name}_middle.json")
  203. # 保存 model output
  204. if model_output:
  205. model_output_str = json.dumps(model_output, ensure_ascii=False, indent=2)
  206. md_writer.write_string(f"{pdf_file_name}_model.json", model_output_str)
  207. saved_files['model_output'] = os.path.join(local_md_dir, f"{pdf_file_name}_model.json")
  208. logger.info(f"Output saved to: {local_md_dir}")
  209. except Exception as e:
  210. logger.error(f"Error in _process_output: {e}")
  211. if self.debug:
  212. traceback.print_exc()
  213. return saved_files
  214. def _get_extraction_stats(self, middle_json: Dict) -> Dict[str, Any]:
  215. """
  216. 获取提取统计信息
  217. Args:
  218. middle_json: 中间JSON数据
  219. Returns:
  220. dict: 统计信息
  221. """
  222. stats = {
  223. "total_blocks": 0,
  224. "block_types": {},
  225. "total_pages": 0
  226. }
  227. try:
  228. pdf_info = middle_json.get("pdf_info", [])
  229. if isinstance(pdf_info, list):
  230. stats["total_pages"] = len(pdf_info)
  231. for page_info in pdf_info:
  232. para_blocks = page_info.get("para_blocks", [])
  233. stats["total_blocks"] += len(para_blocks)
  234. for block in para_blocks:
  235. block_type = block.get("type", "unknown")
  236. stats["block_types"][block_type] = stats["block_types"].get(block_type, 0) + 1
  237. except Exception as e:
  238. logger.warning(f"Failed to get extraction stats: {e}")
  239. return stats
  240. def process_single_image(self, image_path: str, output_dir: str) -> Dict[str, Any]:
  241. """
  242. 处理单张图片
  243. Args:
  244. image_path: 图片路径
  245. output_dir: 输出目录
  246. Returns:
  247. dict: 处理结果,包含 success 字段(基于输出文件存在性判断)
  248. """
  249. start_time = time.time()
  250. image_path_obj = Path(image_path)
  251. image_name = image_path_obj.stem
  252. # 判断是否为PDF页面(根据文件名模式)
  253. is_pdf_page = "_page_" in image_path_obj.name
  254. # 根据输入类型生成预期的输出文件名
  255. if is_pdf_page:
  256. # PDF页面:文件名格式为 filename_page_001.png
  257. # 输出文件名:filename_page_001.md 和 filename_page_001.json
  258. expected_md_path = Path(output_dir) / f"{image_name}.md"
  259. expected_json_path = Path(output_dir) / f"{image_name}.json"
  260. else:
  261. # 普通图片:输出文件名:filename.md 和 filename.json
  262. expected_md_path = Path(output_dir) / f"{image_name}.md"
  263. expected_json_path = Path(output_dir) / f"{image_name}.json"
  264. result_info = {
  265. "image_path": image_path,
  266. "processing_time": 0,
  267. "success": False,
  268. "server": self.server_url,
  269. "error": None,
  270. "output_files": {},
  271. "is_pdf_page": is_pdf_page,
  272. "extraction_stats": {}
  273. }
  274. try:
  275. # 检查输出文件是否已存在(成功判断标准)
  276. if expected_md_path.exists() and expected_json_path.exists():
  277. result_info.update({
  278. "success": True,
  279. "processing_time": 0,
  280. "output_files": {
  281. "md": str(expected_md_path),
  282. "json": str(expected_json_path)
  283. },
  284. "skipped": True
  285. })
  286. logger.info(f"✅ 文件已存在,跳过处理: {image_name}")
  287. return result_info
  288. # 使用 do_parse_single_file 处理
  289. parse_result = self.do_parse_single_file(image_path, output_dir)
  290. # 处理完成后,再次检查输出文件是否存在(成功判断标准)
  291. if expected_md_path.exists() and expected_json_path.exists():
  292. result_info.update({
  293. "success": True,
  294. "output_files": parse_result.get("output_files", {}),
  295. "extraction_stats": parse_result.get("extraction_stats", {})
  296. })
  297. logger.info(f"✅ 处理成功: {image_name}")
  298. else:
  299. # 文件不存在,标记为失败
  300. missing_files = []
  301. if not expected_md_path.exists():
  302. missing_files.append("md")
  303. if not expected_json_path.exists():
  304. missing_files.append("json")
  305. result_info["error"] = f"输出文件不存在: {', '.join(missing_files)}"
  306. result_info["success"] = False
  307. logger.error(f"❌ 处理失败: {image_name} - {result_info['error']}")
  308. except Exception as e:
  309. result_info["error"] = str(e)
  310. result_info["success"] = False
  311. logger.error(f"Error processing {image_name}: {e}")
  312. if self.debug:
  313. traceback.print_exc()
  314. finally:
  315. result_info["processing_time"] = time.time() - start_time
  316. return result_info